/*
** x86/x64 IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h
*/

/* -- Guard handling ------------------------------------------------------ */

/* Generate an exit stub group at the bottom of the reserved MCode memory. */
static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
{
  ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
  MCode *mxp = as->mcbot;
  MCode *mxpstart = mxp;
  if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
    asm_mclimit(as);
  /* Push low byte of exitno for each exit stub. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
  for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
    *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
    *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
  }
  /* Push the high byte of the exitno for each exit stub group. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
  /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
  *mxp++ = XI_MOVmi;
  *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
  *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
  *mxp++ = 2*sizeof(void *);
  *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
  /* Jump to exit handler which fills in the ExitState. */
  *mxp++ = XI_JMP; mxp += 4;
  *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler);
  /* Commit the code for this group (even if assembly fails later on). */
  lj_mcode_commitbot(as->J, mxp);
  as->mcbot = mxp;
  as->mclim = as->mcbot + MCLIM_REDZONE;
  return mxpstart;
}
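
/* Layout note: each stub in a group is 4 bytes (push imm8; short jmp), and
** every stub funnels into a shared tail that pushes the group's high byte,
** stores DISPATCH into stack slot 0 and jumps to lj_vm_exit_handler, which
** reassembles the full exit number from the two pushed bytes.
*/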

/* Setup all needed exit stubs. */
static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
{
  ExitNo i;
  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
    if (as->J->exitstubgroup[i] == NULL)
      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
}

/* Emit conditional branch to exit for guard.
** It's important to emit this *after* all registers have been allocated,
** because rematerializations may invalidate the flags.
*/
static void asm_guardcc(ASMState *as, int cc)
{
  MCode *target = exitstub_addr(as->J, as->snapno);
  MCode *p = as->mcp;
  if (LJ_UNLIKELY(p == as->invmcp)) {
    as->loopinv = 1;
    *(int32_t *)(p+1) = jmprel(p+5, target);
    target = p;
    cc ^= 1;
    if (as->realign) {
      emit_sjcc(as, cc, target);
      return;
    }
  }
  emit_jcc(as, cc, target);
}
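
/* Note on the invmcp case: if the guard coincides with the provisional
** backwards loop branch, the condition is inverted so that the taken
** branch becomes the exit and the fall-through stays on the loop path.
*/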

/* -- Memory operand fusion ----------------------------------------------- */

/* Limit linear search to this distance. Avoids O(n^2) behavior. */
#define CONFLICT_SEARCH_LIM	31

/* Check if a reference is a signed 32 bit constant. */
static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
{
  if (irref_isk(ref)) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_KINT64) {
      *k = ir->i;
      return 1;
    } else if (checki32((int64_t)ir_kint64(ir)->u64)) {
      *k = (int32_t)ir_kint64(ir)->u64;
      return 1;
    }
  }
  return 0;
}

/* Check if there's no conflicting instruction between curins and ref.
** Also avoid fusing loads if there are multiple references.
*/
static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
{
  IRIns *ir = as->ir;
  IRRef i = as->curins;
  if (i > ref + CONFLICT_SEARCH_LIM)
    return 0;  /* Give up, ref is too far away. */
  while (--i > ref) {
    if (ir[i].o == conflict)
      return 0;  /* Conflict found. */
    else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
      return 0;
  }
  return 1;  /* Ok, no conflict. */
}
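
/* Rationale: a fused load is re-executed at the point of use, so any
** conflicting store between ref and curins could make it read different
** memory, and extra references to the load mean it cannot be folded into
** a single memory operand. Both cases refuse fusion.
*/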

/* Fuse array base into memory operand. */
static IRRef asm_fuseabase(ASMState *as, IRRef ref)
{
  IRIns *irb = IR(ref);
  as->mrm.ofs = 0;
  if (irb->o == IR_FLOAD) {
    IRIns *ira = IR(irb->op1);
    lua_assert(irb->op2 == IRFL_TAB_ARRAY);
    /* We can avoid the FLOAD of t->array for colocated arrays. */
    if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
        !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
      as->mrm.ofs = (int32_t)sizeof(GCtab);  /* Ofs to colocated array. */
      return irb->op1;  /* Table obj. */
    }
  } else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
    /* Fuse base offset (vararg load). */
    as->mrm.ofs = IR(irb->op2)->i;
    return irb->op1;
  }
  return ref;  /* Otherwise use the given array base. */
}

/* Fuse array reference into memory operand. */
static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irx;
  lua_assert(ir->o == IR_AREF);
  as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow);
  irx = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += 8*irx->i;
    as->mrm.idx = RID_NONE;
  } else {
    rset_clear(allow, as->mrm.base);
    as->mrm.scale = XM_SCALE8;
    /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
    ** Doesn't help much without ABCelim, but reduces register pressure.
    */
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
        mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
        irx->o == IR_ADD && irref_isk(irx->op2)) {
      as->mrm.ofs += 8*IR(irx->op2)->i;
      as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
    } else {
      as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
    }
  }
}
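
/* An AREF thus collapses into a single [base + idx*8 + ofs] operand:
** array slots are 8-byte TValues, so variable indices use scale 8 and
** constant (parts of) indices are folded into the displacement.
*/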

/* Fuse array/hash/upvalue reference into memory operand.
** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
** pass the final allow mask, excluding any GPRs used for other inputs.
** In particular: 2-operand GPR instructions need to call ra_dest() first!
*/
static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  if (ra_noreg(ir->r)) {
    switch ((IROp)ir->o) {
    case IR_AREF:
      if (mayfuse(as, ref)) {
        asm_fusearef(as, ir, allow);
        return;
      }
      break;
    case IR_HREFK:
      if (mayfuse(as, ref)) {
        as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
        as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
        as->mrm.idx = RID_NONE;
        return;
      }
      break;
    case IR_UREFC:
      if (irref_isk(ir->op1)) {
        GCfunc *fn = ir_kfunc(IR(ir->op1));
        GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
        as->mrm.ofs = ptr2addr(&uv->tv);
        as->mrm.base = as->mrm.idx = RID_NONE;
        return;
      }
      break;
    default:
      lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO ||
                 ir->o == IR_KKPTR);
      break;
    }
  }
  as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
  as->mrm.ofs = 0;
  as->mrm.idx = RID_NONE;
}

/* Fuse FLOAD/FREF reference into memory operand. */
static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
{
  lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
  as->mrm.ofs = field_ofs[ir->op2];
  as->mrm.idx = RID_NONE;
  if (irref_isk(ir->op1)) {
    as->mrm.ofs += IR(ir->op1)->i;
    as->mrm.base = RID_NONE;
  } else {
    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
  }
}

/* Fuse string reference into memory operand. */
static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irr;
  lua_assert(ir->o == IR_STRREF);
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = sizeof(GCstr);
  if (irref_isk(ir->op1)) {
    as->mrm.ofs += IR(ir->op1)->i;
  } else {
    Reg r = ra_alloc1(as, ir->op1, allow);
    rset_clear(allow, r);
    as->mrm.base = (uint8_t)r;
  }
  irr = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += irr->i;
  } else {
    Reg r;
    /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
        mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
      as->mrm.ofs += IR(irr->op2)->i;
      r = ra_alloc1(as, irr->op1, allow);
    } else {
      r = ra_alloc1(as, ir->op2, allow);
    }
    if (as->mrm.base == RID_NONE)
      as->mrm.base = (uint8_t)r;
    else
      as->mrm.idx = (uint8_t)r;
  }
}

static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  as->mrm.idx = RID_NONE;
  if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
    as->mrm.ofs = ir->i;
    as->mrm.base = RID_NONE;
  } else if (ir->o == IR_STRREF) {
    asm_fusestrref(as, ir, allow);
  } else {
    as->mrm.ofs = 0;
    if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) {
      /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
      IRIns *irx;
      IRRef idx;
      Reg r;
      if (asm_isk32(as, ir->op2, &as->mrm.ofs)) {  /* Recognize x+ofs. */
        ref = ir->op1;
        ir = IR(ref);
        if (!(ir->o == IR_ADD && canfuse(as, ir) && ra_noreg(ir->r)))
          goto noadd;
      }
      as->mrm.scale = XM_SCALE1;
      idx = ir->op1;
      ref = ir->op2;
      irx = IR(idx);
      if (!(irx->o == IR_BSHL || irx->o == IR_ADD)) {  /* Try other operand. */
        idx = ir->op2;
        ref = ir->op1;
        irx = IR(idx);
      }
      if (canfuse(as, irx) && ra_noreg(irx->r)) {
        if (irx->o == IR_BSHL && irref_isk(irx->op2) && IR(irx->op2)->i <= 3) {
          /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
          idx = irx->op1;
          as->mrm.scale = (uint8_t)(IR(irx->op2)->i << 6);
        } else if (irx->o == IR_ADD && irx->op1 == irx->op2) {
          /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
          idx = irx->op1;
          as->mrm.scale = XM_SCALE2;
        }
      }
      r = ra_alloc1(as, idx, allow);
      rset_clear(allow, r);
      as->mrm.idx = (uint8_t)r;
    }
  noadd:
    as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
  }
}

/* Fuse load into memory operand. */
static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  if (ra_hasreg(ir->r)) {
    if (allow != RSET_EMPTY) {  /* Fast path. */
      ra_noweak(as, ir->r);
      return ir->r;
    }
  fusespill:
    /* Force a spill if only memory operands are allowed (asm_x87load). */
    as->mrm.base = RID_ESP;
    as->mrm.ofs = ra_spill(as, ir);
    as->mrm.idx = RID_NONE;
    return RID_MRM;
  }
  if (ir->o == IR_KNUM) {
    RegSet avail = as->freeset & ~as->modset & RSET_FPR;
    lua_assert(allow != RSET_EMPTY);
    if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
      as->mrm.ofs = ptr2addr(ir_knum(ir));
      as->mrm.base = as->mrm.idx = RID_NONE;
      return RID_MRM;
    }
  } else if (ref == REF_BASE || ir->o == IR_KINT64) {
    RegSet avail = as->freeset & ~as->modset & RSET_GPR;
    lua_assert(allow != RSET_EMPTY);
    if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
      as->mrm.ofs = ptr2addr(ref == REF_BASE ? (void *)&J2G(as->J)->jit_base :
                                               (void *)ir_kint64(ir));
      as->mrm.base = as->mrm.idx = RID_NONE;
      return RID_MRM;
    }
  } else if (mayfuse(as, ref)) {
    RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
    if (ir->o == IR_SLOAD) {
      if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
          noconflict(as, ref, IR_RETF, 0)) {
        as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
        as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0);
        as->mrm.idx = RID_NONE;
        return RID_MRM;
      }
    } else if (ir->o == IR_FLOAD) {
      /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
      if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
          noconflict(as, ref, IR_FSTORE, 0)) {
        asm_fusefref(as, ir, xallow);
        return RID_MRM;
      }
    } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) {
        asm_fuseahuref(as, ir->op1, xallow);
        return RID_MRM;
      }
    } else if (ir->o == IR_XLOAD) {
      /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp).
      ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
      */
      if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
          noconflict(as, ref, IR_XSTORE, 0)) {
        asm_fusexref(as, ir->op1, xallow);
        return RID_MRM;
      }
    } else if (ir->o == IR_VLOAD) {
      asm_fuseahuref(as, ir->op1, xallow);
      return RID_MRM;
    }
  }
  if (!(as->freeset & allow) && !emit_canremat(ref) &&
      (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref)))
    goto fusespill;
  return ra_allocref(as, ref, allow);
}
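
/* Callers treat the return value RID_MRM as "operand is in as->mrm" and
** must consume it with emit_mrm(). Constants and REF_BASE are only fused
** when register pressure is high, i.e. fewer than two suitable registers
** remain free.
*/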

#if LJ_64
/* Don't fuse a 32 bit load into a 64 bit operation. */
static Reg asm_fuseloadm(ASMState *as, IRRef ref, RegSet allow, int is64)
{
  if (is64 && !irt_is64(IR(ref)->t))
    return ra_alloc1(as, ref, allow);
  return asm_fuseload(as, ref, allow);
}
#else
#define asm_fuseloadm(as, ref, allow, is64)  asm_fuseload(as, (ref), (allow))
#endif

/* -- Calls --------------------------------------------------------------- */

/* Count the required number of stack slots for a call. */
static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
{
  uint32_t i, nargs = CCI_NARGS(ci);
  int nslots = 0;
#if LJ_64
  if (LJ_ABI_WIN) {
    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
  } else {
    int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
    for (i = 0; i < nargs; i++)
      if (args[i] && irt_isfp(IR(args[i])->t)) {
        if (nfpr > 0) nfpr--; else nslots += 2;
      } else {
        if (ngpr > 0) ngpr--; else nslots += 2;
      }
  }
#else
  int ngpr = 0;
  if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
    ngpr = 2;
  else if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
    ngpr = 1;
  for (i = 0; i < nargs; i++)
    if (args[i] && irt_isfp(IR(args[i])->t)) {
      nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
    } else {
      if (ngpr > 0) ngpr--; else nslots++;
    }
#endif
  return nslots;
}
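
/* Slot counting mirrors the ABIs handled below: on x64 every argument
** beyond the available register arguments costs two 4-byte slots (and
** Windows/x64 reserves slots for all arguments), while on x86 only
** fastcall/thiscall use one or two GPRs and everything else, including
** FP values, is passed on the stack.
*/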

/* Generate a call to a C function. */
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
{
  uint32_t n, nargs = CCI_NARGS(ci);
  int32_t ofs = STACKARG_OFS;
#if LJ_64
  uint32_t gprs = REGARG_GPRS;
  Reg fpr = REGARG_FIRSTFPR;
#if !LJ_ABI_WIN
  MCode *patchnfpr = NULL;
#endif
#else
  uint32_t gprs = 0;
  if ((ci->flags & CCI_CC_MASK) != CCI_CC_CDECL) {
    if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
      gprs = (REGARG_GPRS & 31);
    else if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
      gprs = REGARG_GPRS;
  }
#endif
  if ((void *)ci->func)
    emit_call(as, ci->func);
#if LJ_64
  if ((ci->flags & CCI_VARARG)) {  /* Special handling for vararg calls. */
#if LJ_ABI_WIN
    for (n = 0; n < 4 && n < nargs; n++) {
      IRIns *ir = IR(args[n]);
      if (irt_isfp(ir->t))  /* Duplicate FPRs in GPRs. */
        emit_rr(as, XO_MOVDto, (irt_isnum(ir->t) ? REX_64 : 0) | (fpr+n),
                ((gprs >> (n*5)) & 31));  /* Either MOVD or MOVQ. */
    }
#else
    patchnfpr = --as->mcp;  /* Indicate number of used FPRs in register al. */
    *--as->mcp = XI_MOVrib | RID_EAX;
#endif
  }
#endif
  for (n = 0; n < nargs; n++) {  /* Setup args. */
    IRRef ref = args[n];
    IRIns *ir = IR(ref);
    Reg r;
#if LJ_64 && LJ_ABI_WIN
    /* Windows/x64 argument registers are strictly positional. */
    r = irt_isfp(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
    fpr++; gprs >>= 5;
#elif LJ_64
    /* POSIX/x64 argument registers are used in order of appearance. */
    if (irt_isfp(ir->t)) {
      r = fpr <= REGARG_LASTFPR ? fpr++ : 0;
    } else {
      r = gprs & 31; gprs >>= 5;
    }
#else
    if (ref && irt_isfp(ir->t)) {
      r = 0;
    } else {
      r = gprs & 31; gprs >>= 5;
    }
#endif
    if (r) {  /* Argument is in a register. */
      if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
#if LJ_64
        if (ir->o == IR_KINT64)
          emit_loadu64(as, r, ir_kint64(ir)->u64);
        else
#endif
          emit_loadi(as, r, ir->i);
      } else {
        lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
        if (ra_hasreg(ir->r)) {
          ra_noweak(as, ir->r);
          emit_movrr(as, ir, r, ir->r);
        } else {
          ra_allocref(as, ref, RID2RSET(r));
        }
      }
    } else if (irt_isfp(ir->t)) {  /* FP argument is on stack. */
      lua_assert(!(irt_isfloat(ir->t) && irref_isk(ref)));  /* No float k. */
      if (LJ_32 && (ofs & 4) && irref_isk(ref)) {
        /* Split stores for unaligned FP consts. */
        emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
        emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
      } else {
        r = ra_alloc1(as, ref, RSET_FPR);
        emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSDto : XO_MOVSSto,
                  r, RID_ESP, ofs);
      }
      ofs += (LJ_32 && irt_isfloat(ir->t)) ? 4 : 8;
    } else {  /* Non-FP argument is on stack. */
      if (LJ_32 && ref < ASMREF_TMP1) {
        emit_movmroi(as, RID_ESP, ofs, ir->i);
      } else {
        r = ra_alloc1(as, ref, RSET_GPR);
        emit_movtomro(as, REX_64 + r, RID_ESP, ofs);
      }
      ofs += sizeof(intptr_t);
    }
    checkmclim(as);
  }
#if LJ_64 && !LJ_ABI_WIN
  if (patchnfpr) *patchnfpr = fpr - REGARG_FIRSTFPR;
#endif
}

/* Setup result reg/sp for call. Evict scratch regs. */
static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  RegSet drop = RSET_SCRATCH;
  int hiop = (LJ_32 && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
  if ((ci->flags & CCI_NOFPRCLOBBER))
    drop &= ~RSET_FPR;
  if (ra_hasreg(ir->r))
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
  if (hiop && ra_hasreg((ir+1)->r))
    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
  ra_evictset(as, drop);  /* Evictions must be performed first. */
  if (ra_used(ir)) {
    if (irt_isfp(ir->t)) {
      int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
#if LJ_64
      if ((ci->flags & CCI_CASTU64)) {
        Reg dest = ir->r;
        if (ra_hasreg(dest)) {
          ra_free(as, dest);
          ra_modified(as, dest);
          emit_rr(as, XO_MOVD, dest|REX_64, RID_RET);  /* Really MOVQ. */
        }
        if (ofs) emit_movtomro(as, RID_RET|REX_64, RID_ESP, ofs);
      } else {
        ra_destreg(as, ir, RID_FPRET);
      }
#else
      /* Number result is in x87 st0 for x86 calling convention. */
      Reg dest = ir->r;
      if (ra_hasreg(dest)) {
        ra_free(as, dest);
        ra_modified(as, dest);
        emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
                  dest, RID_ESP, ofs);
      }
      if ((ci->flags & CCI_CASTU64)) {
        emit_movtomro(as, RID_RETLO, RID_ESP, ofs);
        emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
      } else {
        emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
                  irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
      }
#endif
#if LJ_32
    } else if (hiop) {
      ra_destpair(as, ir);
#endif
    } else {
      lua_assert(!irt_ispri(ir->t));
      ra_destreg(as, ir, RID_RET);
    }
  } else if (LJ_32 && irt_isfp(ir->t) && !(ci->flags & CCI_CASTU64)) {
    emit_x87op(as, XI_FPOP);  /* Pop unused result from x87 st0. */
  }
}

static void asm_call(ASMState *as, IRIns *ir)
{
  IRRef args[CCI_NARGS_MAX];
  const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
  asm_collectargs(as, ir, ci, args);
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}

/* Return a constant function pointer or NULL for indirect calls. */
static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
{
#if LJ_32
  UNUSED(as);
  if (irref_isk(func))
    return (void *)irf->i;
#else
  if (irref_isk(func)) {
    MCode *p;
    if (irf->o == IR_KINT64)
      p = (MCode *)(void *)ir_k64(irf)->u64;
    else
      p = (MCode *)(void *)(uintptr_t)(uint32_t)irf->i;
    if (p - as->mcp == (int32_t)(p - as->mcp))
      return p;  /* Call target is still in +-2GB range. */
    /* Avoid the indirect case of emit_call(). Try to hoist func addr. */
  }
#endif
  return NULL;
}

static void asm_callx(ASMState *as, IRIns *ir)
{
  IRRef args[CCI_NARGS_MAX*2];
  CCallInfo ci;
  IRRef func;
  IRIns *irf;
  int32_t spadj = 0;
  ci.flags = asm_callx_flags(as, ir);
  asm_collectargs(as, ir, &ci, args);
  asm_setupresult(as, ir, &ci);
#if LJ_32
  /* Have to readjust stack after non-cdecl calls due to callee cleanup. */
  if ((ci.flags & CCI_CC_MASK) != CCI_CC_CDECL)
    spadj = 4 * asm_count_call_slots(as, &ci, args);
#endif
  func = ir->op2; irf = IR(func);
  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
  ci.func = (ASMFunction)asm_callx_func(as, irf, func);
  if (!(void *)ci.func) {
    /* Use a (hoistable) non-scratch register for indirect calls. */
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
    Reg r = ra_alloc1(as, func, allow);
    if (LJ_32) emit_spsub(as, spadj);  /* Above code may cause restores! */
    emit_rr(as, XO_GROUP5, XOg_CALL, r);
  } else if (LJ_32) {
    emit_spsub(as, spadj);
  }
  asm_gencall(as, &ci, args);
}

/* -- Returns ------------------------------------------------------------- */

/* Return to lower frame. Guard that it goes to the right spot. */
static void asm_retf(ASMState *as, IRIns *ir)
{
  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
  void *pc = ir_kptr(IR(ir->op2));
  int32_t delta = 1+bc_a(*((const BCIns *)pc - 1));
  as->topslot -= (BCReg)delta;
  if ((int32_t)as->topslot < 0) as->topslot = 0;
  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
  emit_setgl(as, base, jit_base);
  emit_addptr(as, base, -8*delta);
  asm_guardcc(as, CC_NE);
  emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
}
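
/* The generated code compares the frame link at [base-4] against the
** constant return PC and exits on mismatch, then lowers BASE by delta
** slots and publishes the new base in g->jit_base.
*/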

/* -- Type conversions ---------------------------------------------------- */

static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
{
  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_guardcc(as, CC_P);
  asm_guardcc(as, CC_NE);
  emit_rr(as, XO_UCOMISD, left, tmp);
  emit_rr(as, XO_CVTSI2SD, tmp, dest);
  if (!(as->flags & JIT_F_SPLIT_XMM))
    emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
  emit_rr(as, XO_CVTTSD2SI, dest, left);
  /* Can't fuse since left is needed twice. */
}

static void asm_tobit(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  Reg tmp = ra_noreg(IR(ir->op1)->r) ?
              ra_alloc1(as, ir->op1, RSET_FPR) :
              ra_scratch(as, RSET_FPR);
  Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
  emit_rr(as, XO_MOVDto, tmp, dest);
  emit_mrm(as, XO_ADDSD, tmp, right);
  ra_left(as, tmp, ir->op1);
}

static void asm_conv(ASMState *as, IRIns *ir)
{
  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
  int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64));
  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
  IRRef lref = ir->op1;
  lua_assert(irt_type(ir->t) != st);
  lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64)));  /* Handled by SPLIT. */
  if (irt_isfp(ir->t)) {
    Reg dest = ra_dest(as, ir, RSET_FPR);
    if (stfp) {  /* FP to FP conversion. */
      Reg left = asm_fuseload(as, lref, RSET_FPR);
      emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
      if (left == dest) return;  /* Avoid the XO_XORPS. */
    } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
      /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
      cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
      Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
      if (irt_isfloat(ir->t))
        emit_rr(as, XO_CVTSD2SS, dest, dest);
      emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
      emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
      emit_loadn(as, bias, k);
      emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
      return;
    } else {  /* Integer to FP conversion. */
      Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
                 ra_alloc1(as, lref, RSET_GPR) :
                 asm_fuseloadm(as, lref, RSET_GPR, st64);
      if (LJ_64 && st == IRT_U64) {
        MCLabel l_end = emit_label(as);
        const void *k = lj_ir_k64_find(as->J, U64x(43f00000,00000000));
        emit_rma(as, XO_ADDSD, dest, k);  /* Add 2^64 to compensate. */
        emit_sjcc(as, CC_NS, l_end);
        emit_rr(as, XO_TEST, left|REX_64, left);  /* Check if u64 >= 2^63. */
      }
      emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
               dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
    }
    if (!(as->flags & JIT_F_SPLIT_XMM))
      emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
  } else if (stfp) {  /* FP to integer conversion. */
    if (irt_isguard(ir->t)) {
      /* Checked conversions are only supported from number to int. */
      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
    } else {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      x86Op op = st == IRT_NUM ?
                 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
                 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
      if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) {
        /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
        /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
        Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
                                          ra_scratch(as, RSET_FPR);
        MCLabel l_end = emit_label(as);
        if (LJ_32)
          emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
        emit_rr(as, op, dest|REX_64, tmp);
        if (st == IRT_NUM)
          emit_rma(as, XO_ADDSD, tmp, lj_ir_k64_find(as->J,
                   LJ_64 ? U64x(c3f00000,00000000) : U64x(c1e00000,00000000)));
        else
          emit_rma(as, XO_ADDSS, tmp, lj_ir_k64_find(as->J,
                   LJ_64 ? U64x(00000000,df800000) : U64x(00000000,cf000000)));
        emit_sjcc(as, CC_NS, l_end);
        emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest negative. */
        emit_rr(as, op, dest|REX_64, tmp);
        ra_left(as, tmp, lref);
      } else {
        Reg left = asm_fuseload(as, lref, RSET_FPR);
        if (LJ_64 && irt_isu32(ir->t))
          emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
        emit_mrm(as, op,
                 dest|((LJ_64 &&
                        (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
                 left);
      }
    }
  } else if (st >= IRT_I8 && st <= IRT_U16) {  /* Extend to 32 bit integer. */
    Reg left, dest = ra_dest(as, ir, RSET_GPR);
    RegSet allow = RSET_GPR;
    x86Op op;
    lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
    if (st == IRT_I8) {
      op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX;
    } else if (st == IRT_U8) {
      op = XO_MOVZXb; allow = RSET_GPR8; dest |= FORCE_REX;
    } else if (st == IRT_I16) {
      op = XO_MOVSXw;
    } else {
      op = XO_MOVZXw;
    }
    left = asm_fuseload(as, lref, allow);
    /* Add extra MOV if source is already in wrong register. */
    if (!LJ_64 && left != RID_MRM && !rset_test(allow, left)) {
      Reg tmp = ra_scratch(as, allow);
      emit_rr(as, op, dest, tmp);
      emit_rr(as, XO_MOV, tmp, left);
    } else {
      emit_mrm(as, op, dest, left);
    }
  } else {  /* 32/64 bit integer conversions. */
    if (LJ_32) {  /* Only need to handle 32/32 bit no-op (cast) on x86. */
      Reg dest = ra_dest(as, ir, RSET_GPR);
      ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
    } else if (irt_is64(ir->t)) {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
        /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
      } else {  /* 32 to 64 bit sign extension. */
        Reg left = asm_fuseload(as, lref, RSET_GPR);
        emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
      }
    } else {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      if (st64) {
        Reg left = asm_fuseload(as, lref, RSET_GPR);
        /* This is either a 32 bit reg/reg mov which zeroes the hiword
        ** or a load of the loword from a 64 bit address.
        */
        emit_mrm(as, XO_MOV, dest, left);
      } else {  /* 32/32 bit no-op (cast). */
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
      }
    }
  }
}
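
/* Unsigned conversion tricks used above: for u32->FP on x86 the constant
** 0x4338000000000000 is the double 2^52+2^51; overwriting its low word
** with the u32 yields exactly 2^52+2^51+u32, so one SUBSD recovers the
** value. For u64->FP on x64 the value is converted as a signed 64 bit
** integer and 2^64 is added back if the sign bit was set.
*/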

#if LJ_32 && LJ_HASFFI
/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */

/* 64 bit integer to FP conversion in 32 bit mode. */
static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
{
  Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
  Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
  Reg dest = ir->r;
  if (ra_hasreg(dest)) {
    ra_free(as, dest);
    ra_modified(as, dest);
    emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
              dest, RID_ESP, ofs);
  }
  emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
            irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
  if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
    /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
    MCLabel l_end = emit_label(as);
    emit_rma(as, XO_FADDq, XOg_FADDq,
             lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
    emit_sjcc(as, CC_NS, l_end);
    emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
  } else {
    lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
  }
  emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
  /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
  emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
  emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
}

/* FP to 64 bit integer conversion in 32 bit mode. */
static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
{
  IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
  IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
  Reg lo, hi;
  lua_assert(st == IRT_NUM || st == IRT_FLOAT);
  lua_assert(dt == IRT_I64 || dt == IRT_U64);
  lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
  hi = ra_dest(as, ir, RSET_GPR);
  lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
  if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
  /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
  if (!(as->flags & JIT_F_SSE3)) {  /* Set FPU rounding mode to default. */
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
    emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
    emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
  }
  if (dt == IRT_U64) {
    /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
    MCLabel l_pop, l_end = emit_label(as);
    emit_x87op(as, XI_FPOP);
    l_pop = emit_label(as);
    emit_sjmp(as, l_end);
    emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
    if ((as->flags & JIT_F_SSE3))
      emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
    else
      emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
    emit_rma(as, XO_FADDq, XOg_FADDq,
             lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
    emit_sjcc(as, CC_NS, l_pop);
    emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
  }
  emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
  if ((as->flags & JIT_F_SSE3)) {  /* Truncation is easy with SSE3. */
    emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
  } else {  /* Otherwise set FPU rounding mode to truncate before the store. */
    emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
    emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
    emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
    emit_loadi(as, lo, 0xc00);
    emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
  }
  if (dt == IRT_U64)
    emit_x87op(as, XI_FDUP);
  emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
           st == IRT_NUM ? XOg_FLDq : XOg_FLDd,
           asm_fuseload(as, ir->op1, RSET_EMPTY));
}
#endif
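
/* The non-SSE3 path saves the x87 control word with FNSTCW, forces the
** rounding-control bits (mask 0xc00) to round-to-zero so FISTP truncates,
** and restores the default mode afterwards (AND 0xf3ff + FLDCW).
*/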

static void asm_strto(ASMState *as, IRIns *ir)
{
  /* Force a spill slot for the destination register (if any). */
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
  IRRef args[2];
  RegSet drop = RSET_SCRATCH;
  if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
    rset_set(drop, ir->r);  /* WIN64 doesn't spill all FPRs. */
  ra_evictset(as, drop);
  asm_guardcc(as, CC_E);
  emit_rr(as, XO_TEST, RID_RET, RID_RET);  /* Test return status. */
  args[0] = ir->op1;      /* GCstr *str */
  args[1] = ASMREF_TMP1;  /* TValue *n  */
  asm_gencall(as, ci, args);
  /* Store the result to the spill slot or temp slots. */
  emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
            RID_ESP, sps_scale(ir->s));
}

static void asm_tostr(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRRef args[2];
  args[0] = ASMREF_L;
  as->gcsteps++;
  if (irt_isnum(irl->t)) {
    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
    args[1] = ASMREF_TMP1;  /* const lua_Number * */
    asm_setupresult(as, ir, ci);  /* GCstr * */
    asm_gencall(as, ci, args);
    emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
              RID_ESP, ra_spill(as, irl));
  } else {
    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
    args[1] = ir->op1;  /* int32_t k */
    asm_setupresult(as, ir, ci);  /* GCstr * */
    asm_gencall(as, ci, args);
  }
}

/* -- Memory references --------------------------------------------------- */

static void asm_aref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusearef(as, ir, RSET_GPR);
  if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
    emit_mrm(as, XO_LEA, dest, RID_MRM);
  else if (as->mrm.base != dest)
    emit_rr(as, XO_MOV, dest, as->mrm.base);
}

/* Merge NE(HREF, niltv) check. */
static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
{
  /* Assumes nothing else generates NE of HREF. */
  if ((ir[1].o == IR_NE || ir[1].o == IR_EQ) && ir[1].op1 == as->curins &&
      ra_hasreg(ir->r)) {
    MCode *p = as->mcp;
    p += (LJ_64 && *p != XI_ARITHi) ? 7+6 : 6+6;
    /* Ensure no loop branch inversion happened. */
    if (p[-6] == 0x0f && p[-5] == XI_JCCn+(CC_NE^(ir[1].o & 1))) {
      as->mcp = p;  /* Kill cmp reg, imm32 + jz exit. */
      return p + *(int32_t *)(p-4);  /* Return exit address. */
    }
  }
  return NULL;
}

/* Inlined hash lookup. Specialized for key type and for const keys.
** The equivalent C code is:
**   Node *n = hashkey(t, key);
**   do {
**     if (lj_obj_equal(&n->key, key)) return &n->val;
**   } while ((n = nextnode(n)));
**   return niltv(L);
*/
static void asm_href(ASMState *as, IRIns *ir)
{
  MCode *nilexit = merge_href_niltv(as, ir);  /* Do this before any restores. */
  RegSet allow = RSET_GPR;
  Reg dest = ra_dest(as, ir, allow);
  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
  Reg key = RID_NONE, tmp = RID_NONE;
  IRIns *irkey = IR(ir->op2);
  int isk = irref_isk(ir->op2);
  IRType1 kt = irkey->t;
  uint32_t khash;
  MCLabel l_end, l_loop, l_next;

  if (!isk) {
    rset_clear(allow, tab);
    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
    if (!irt_isstr(kt))
      tmp = ra_scratch(as, rset_exclude(allow, key));
  }

  /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
  l_end = emit_label(as);
  if (nilexit && ir[1].o == IR_NE) {
    emit_jcc(as, CC_E, nilexit);  /* XI_JMP is not found by lj_asm_patchexit. */
    nilexit = NULL;
  } else {
    emit_loada(as, dest, niltvg(J2G(as->J)));
  }

  /* Follow hash chain until the end. */
  l_loop = emit_sjcc_label(as, CC_NZ);
  emit_rr(as, XO_TEST, dest, dest);
  emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
  l_next = emit_label(as);

  /* Type and value comparison. */
  if (nilexit)
    emit_jcc(as, CC_E, nilexit);
  else
    emit_sjcc(as, CC_E, l_end);
  if (irt_isnum(kt)) {
    if (isk) {
      /* Assumes -0.0 is already canonicalized to +0.0. */
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
                 (int32_t)ir_knum(irkey)->u32.lo);
      emit_sjcc(as, CC_NE, l_next);
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
                 (int32_t)ir_knum(irkey)->u32.hi);
    } else {
      emit_sjcc(as, CC_P, l_next);
      emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
      emit_sjcc(as, CC_AE, l_next);
      /* The type check avoids NaN penalties and complaints from Valgrind. */
#if LJ_64
      emit_u32(as, LJ_TISNUM);
      emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
#else
      emit_i8(as, LJ_TISNUM);
      emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
#endif
    }
#if LJ_64
  } else if (irt_islightud(kt)) {
    emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
#endif
  } else {
    if (!irt_ispri(kt)) {
      lua_assert(irt_isaddr(kt));
      if (isk)
        emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
                   ptr2addr(ir_kgc(irkey)));
      else
        emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
      emit_sjcc(as, CC_NE, l_next);
    }
    lua_assert(!irt_isnil(kt));
    emit_i8(as, irt_toitype(kt));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
  }
  emit_sfixup(as, l_loop);
  checkmclim(as);

  /* Load main position relative to tab->node into dest. */
  khash = isk ? ir_khash(irkey) : 1;
  if (khash == 0) {
    emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
  } else {
    emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
    if ((as->flags & JIT_F_PREFER_IMUL)) {
      emit_i8(as, sizeof(Node));
      emit_rr(as, XO_IMULi8, dest, dest);
    } else {
      emit_shifti(as, XOg_SHL, dest, 3);
      emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
    }
    if (isk) {
      emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else if (irt_isstr(kt)) {
      emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else {  /* Must match with hashrot() in lj_tab.c. */
      emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
      emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
      emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
      emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
      emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
      emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
      emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
      emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
      if (irt_isnum(kt)) {
        emit_rr(as, XO_ARITH(XOg_ADD), dest, dest);
#if LJ_64
        emit_shifti(as, XOg_SHR|REX_64, dest, 32);
        emit_rr(as, XO_MOV, tmp, dest);
        emit_rr(as, XO_MOVDto, key|REX_64, dest);
#else
        emit_rmro(as, XO_MOV, dest, RID_ESP, ra_spill(as, irkey)+4);
        emit_rr(as, XO_MOVDto, key, tmp);
#endif
      } else {
        emit_rr(as, XO_MOV, tmp, key);
        emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
      }
    }
  }
}
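
/* The register-based hash at the end must stay bit-identical to hashrot()
** in lj_tab.c: otherwise the inlined lookup would compute a different main
** position than the interpreter and the chain walk would miss live keys.
*/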

static void asm_hrefk(ASMState *as, IRIns *ir)
{
  IRIns *kslot = IR(ir->op2);
  IRIns *irkey = IR(kslot->op1);
  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
  Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
#if !LJ_64
  MCLabel l_exit;
#endif
  lua_assert(ofs % sizeof(Node) == 0);
  if (ra_hasreg(dest)) {
    if (ofs != 0) {
      if (dest == node && !(as->flags & JIT_F_LEA_AGU))
        emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
      else
        emit_rmro(as, XO_LEA, dest, node, ofs);
    } else if (dest != node) {
      emit_rr(as, XO_MOV, dest, node);
    }
  }
  asm_guardcc(as, CC_NE);
#if LJ_64
  if (!irt_ispri(irkey->t)) {
    Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node));
    emit_rmro(as, XO_CMP, key|REX_64, node,
              ofs + (int32_t)offsetof(Node, key.u64));
    lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t));
    /* Assumes -0.0 is already canonicalized to +0.0. */
    emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
                          ((uint64_t)irt_toitype(irkey->t) << 32) |
                          (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
  } else {
    lua_assert(!irt_isnil(irkey->t));
    emit_i8(as, irt_toitype(irkey->t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
              ofs + (int32_t)offsetof(Node, key.it));
  }
#else
  l_exit = emit_label(as);
  if (irt_isnum(irkey->t)) {
    /* Assumes -0.0 is already canonicalized to +0.0. */
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
               ofs + (int32_t)offsetof(Node, key.u32.lo),
               (int32_t)ir_knum(irkey)->u32.lo);
    emit_sjcc(as, CC_NE, l_exit);
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
               ofs + (int32_t)offsetof(Node, key.u32.hi),
               (int32_t)ir_knum(irkey)->u32.hi);
  } else {
    if (!irt_ispri(irkey->t)) {
      lua_assert(irt_isgcv(irkey->t));
      emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
                 ofs + (int32_t)offsetof(Node, key.gcr),
                 ptr2addr(ir_kgc(irkey)));
      emit_sjcc(as, CC_NE, l_exit);
    }
    lua_assert(!irt_isnil(irkey->t));
    emit_i8(as, irt_toitype(irkey->t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
              ofs + (int32_t)offsetof(Node, key.it));
  }
#endif
}

static void asm_newref(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
  IRRef args[3];
  IRIns *irkey;
  Reg tmp;
  if (ir->r == RID_SINK)
    return;
  args[0] = ASMREF_L;     /* lua_State *L */
  args[1] = ir->op1;      /* GCtab *t     */
  args[2] = ASMREF_TMP1;  /* cTValue *key */
  asm_setupresult(as, ir, ci);  /* TValue * */
  asm_gencall(as, ci, args);
  tmp = ra_releasetmp(as, ASMREF_TMP1);
  irkey = IR(ir->op2);
  if (irt_isnum(irkey->t)) {
    /* For numbers use the constant itself or a spill slot as a TValue. */
    if (irref_isk(ir->op2))
      emit_loada(as, tmp, ir_knum(irkey));
    else
      emit_rmro(as, XO_LEA, tmp|REX_64, RID_ESP, ra_spill(as, irkey));
  } else {
    /* Otherwise use g->tmptv to hold the TValue. */
    if (!irref_isk(ir->op2)) {
      Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
      emit_movtomro(as, REX_64IR(irkey, src), tmp, 0);
    } else if (!irt_ispri(irkey->t)) {
      emit_movmroi(as, tmp, 0, irkey->i);
    }
    if (!(LJ_64 && irt_islightud(irkey->t)))
      emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
    emit_loada(as, tmp, &J2G(as->J)->tmptv);
  }
}

static void asm_uref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  if (irref_isk(ir->op1)) {
    GCfunc *fn = ir_kfunc(IR(ir->op1));
    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
    emit_rma(as, XO_MOV, dest, v);
  } else {
    Reg uv = ra_scratch(as, RSET_GPR);
    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
    if (ir->o == IR_UREFC) {
      emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
      asm_guardcc(as, CC_NE);
      emit_i8(as, 1);
      emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
    } else {
      emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
    }
    emit_rmro(as, XO_MOV, uv, func,
              (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
  }
}

static void asm_fref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusefref(as, ir, RSET_GPR);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
}

static void asm_strref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusestrref(as, ir, RSET_GPR);
  if (as->mrm.base == RID_NONE)
    emit_loadi(as, dest, as->mrm.ofs);
  else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
    emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
  else
    emit_mrm(as, XO_LEA, dest, RID_MRM);
}

/* -- Loads and stores ---------------------------------------------------- */

static void asm_fxload(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
  x86Op xo;
  if (ir->o == IR_FLOAD)
    asm_fusefref(as, ir, RSET_GPR);
  else
    asm_fusexref(as, ir->op1, RSET_GPR);
  /* ir->op2 is ignored -- unaligned loads are ok on x86. */
  switch (irt_type(ir->t)) {
  case IRT_I8: xo = XO_MOVSXb; break;
  case IRT_U8: xo = XO_MOVZXb; break;
  case IRT_I16: xo = XO_MOVSXw; break;
  case IRT_U16: xo = XO_MOVZXw; break;
  case IRT_NUM: xo = XMM_MOVRM(as); break;
  case IRT_FLOAT: xo = XO_MOVSS; break;
  default:
    if (LJ_64 && irt_is64(ir->t))
      dest |= REX_64;
    else
      lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
    xo = XO_MOV;
    break;
  }
  emit_mrm(as, xo, dest, RID_MRM);
}

static void asm_fxstore(ASMState *as, IRIns *ir)
{
  RegSet allow = RSET_GPR;
  Reg src = RID_NONE, osrc = RID_NONE;
  int32_t k = 0;
  if (ir->r == RID_SINK)
    return;
  /* The IRT_I16/IRT_U16 stores should never be simplified for constant
  ** values since mov word [mem], imm16 has a length-changing prefix.
  */
  if (irt_isi16(ir->t) || irt_isu16(ir->t) || irt_isfp(ir->t) ||
      !asm_isk32(as, ir->op2, &k)) {
    RegSet allow8 = irt_isfp(ir->t) ? RSET_FPR :
                    (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
    src = osrc = ra_alloc1(as, ir->op2, allow8);
    if (!LJ_64 && !rset_test(allow8, src)) {  /* Already in wrong register. */
      rset_clear(allow, osrc);
      src = ra_scratch(as, allow8);
    }
    rset_clear(allow, src);
  }
  if (ir->o == IR_FSTORE) {
    asm_fusefref(as, IR(ir->op1), allow);
  } else {
    asm_fusexref(as, ir->op1, allow);
    if (LJ_32 && ir->o == IR_HIOP) as->mrm.ofs += 4;
  }
  if (ra_hasreg(src)) {
    x86Op xo;
    switch (irt_type(ir->t)) {
    case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
    case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
    case IRT_NUM: xo = XO_MOVSDto; break;
    case IRT_FLOAT: xo = XO_MOVSSto; break;
#if LJ_64
    case IRT_LIGHTUD: lua_assert(0);  /* NYI: mask 64 bit lightuserdata. */
#endif
    default:
      if (LJ_64 && irt_is64(ir->t))
        src |= REX_64;
      else
        lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
      xo = XO_MOVto;
      break;
    }
    emit_mrm(as, xo, src, RID_MRM);
    if (!LJ_64 && src != osrc) {
      ra_noweak(as, osrc);
      emit_rr(as, XO_MOV, src, osrc);
    }
  } else {
    if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
      emit_i8(as, k);
      emit_mrm(as, XO_MOVmib, 0, RID_MRM);
    } else {
      lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) ||
                 irt_isaddr(ir->t));
      emit_i32(as, k);
      emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM);
    }
  }
}

#if LJ_64
static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
{
  if (ra_used(ir) || typecheck) {
    Reg dest = ra_dest(as, ir, RSET_GPR);
    if (typecheck) {
      Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, dest));
      asm_guardcc(as, CC_NE);
      emit_i8(as, -2);
      emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
      emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
      emit_rr(as, XO_MOV, tmp|REX_64, dest);
    }
    return dest;
  } else {
    return RID_NONE;
  }
}
#endif

static void asm_ahuvload(ASMState *as, IRIns *ir)
{
  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
             (LJ_DUALNUM && irt_isint(ir->t)));
#if LJ_64
  if (irt_islightud(ir->t)) {
    Reg dest = asm_load_lightud64(as, ir, 1);
    if (ra_hasreg(dest)) {
      asm_fuseahuref(as, ir->op1, RSET_GPR);
      emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
    }
    return;
  } else
#endif
  if (ra_used(ir)) {
    RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
  } else {
    asm_fuseahuref(as, ir->op1, RSET_GPR);
  }
  /* Always do the type check, even if the load result is unused. */
  as->mrm.ofs += 4;
  asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE);
  if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
    lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
    emit_u32(as, LJ_TISNUM);
    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
  } else {
    emit_i8(as, irt_toitype(ir->t));
    emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
  }
}

static void asm_ahustore(ASMState *as, IRIns *ir)
{
  if (ir->r == RID_SINK)
    return;
  if (irt_isnum(ir->t)) {
    Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    emit_mrm(as, XO_MOVSDto, src, RID_MRM);
#if LJ_64
  } else if (irt_islightud(ir->t)) {
    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
    asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
    emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
#endif
  } else {
    IRIns *irr = IR(ir->op2);
    RegSet allow = RSET_GPR;
    Reg src = RID_NONE;
    if (!irref_isk(ir->op2)) {
      src = ra_alloc1(as, ir->op2, allow);
      rset_clear(allow, src);
    }
    asm_fuseahuref(as, ir->op1, allow);
    if (ra_hasreg(src)) {
      emit_mrm(as, XO_MOVto, src, RID_MRM);
    } else if (!irt_ispri(irr->t)) {
      lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t)));
      emit_i32(as, irr->i);
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
    }
    as->mrm.ofs += 4;
    emit_i32(as, (int32_t)irt_toitype(ir->t));
    emit_mrm(as, XO_MOVmi, 0, RID_MRM);
  }
}

static void asm_sload(ASMState *as, IRIns *ir)
{
  int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
  IRType1 t = ir->t;
  Reg base;
  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
  lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
  lua_assert(LJ_DUALNUM ||
             !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)));
  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
    Reg left = ra_scratch(as, RSET_FPR);
    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
#if LJ_64
  } else if (irt_islightud(t)) {
    Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
    if (ra_hasreg(dest)) {
      base = ra_alloc1(as, REF_BASE, RSET_GPR);
      emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
    }
    return;
#endif
  } else if (ra_used(ir)) {
    RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
    if ((ir->op2 & IRSLOAD_CONVERT)) {
      t.irt = irt_isint(t) ? IRT_NUM : IRT_INT;  /* Check for original type. */
      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTSD2SI, dest, base, ofs);
    } else if (irt_isnum(t)) {
      emit_rmro(as, XMM_MOVRM(as), dest, base, ofs);
    } else {
      emit_rmro(as, XO_MOV, dest, base, ofs);
    }
  } else {
    if (!(ir->op2 & IRSLOAD_TYPECHECK))
      return;  /* No type check: avoid base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
  }
  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
    /* Need type check, even if the load result is unused. */
    asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
    if (LJ_64 && irt_type(t) >= IRT_NUM) {
      lua_assert(irt_isinteger(t) || irt_isnum(t));
      emit_u32(as, LJ_TISNUM);
      emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
    } else {
      emit_i8(as, irt_toitype(t));
      emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
    }
  }
}

/* -- Allocations --------------------------------------------------------- */

#if LJ_HASFFI
static void asm_cnew(ASMState *as, IRIns *ir)
{
  CTState *cts = ctype_ctsG(J2G(as->J));
  CTypeID ctypeid = (CTypeID)IR(ir->op1)->i;
  CTSize sz = (ir->o == IR_CNEWI || ir->op2 == REF_NIL) ?
              lj_ctype_size(cts, ctypeid) : (CTSize)IR(ir->op2)->i;
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
  IRRef args[2];
  lua_assert(sz != CTSIZE_INVALID);

  args[0] = ASMREF_L;     /* lua_State *L */
  args[1] = ASMREF_TMP1;  /* MSize size   */
  as->gcsteps++;
  asm_setupresult(as, ir, ci);  /* GCcdata * */

  /* Initialize immutable cdata object. */
  if (ir->o == IR_CNEWI) {
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
#if LJ_64
    Reg r64 = sz == 8 ? REX_64 : 0;
    if (irref_isk(ir->op2)) {
      IRIns *irk = IR(ir->op2);
      uint64_t k = irk->o == IR_KINT64 ? ir_k64(irk)->u64 :
                                         (uint64_t)(uint32_t)irk->i;
      if (sz == 4 || checki32((int64_t)k)) {
        emit_i32(as, (int32_t)k);
        emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
      } else {
        emit_movtomro(as, RID_ECX + r64, RID_RET, sizeof(GCcdata));
        emit_loadu64(as, RID_ECX, k);
      }
    } else {
      Reg r = ra_alloc1(as, ir->op2, allow);
      emit_movtomro(as, r + r64, RID_RET, sizeof(GCcdata));
    }
#else
    int32_t ofs = sizeof(GCcdata);
    if (sz == 8) {
      ofs += 4; ir++;
      lua_assert(ir->o == IR_HIOP);
    }
    do {
      if (irref_isk(ir->op2)) {
        emit_movmroi(as, RID_RET, ofs, IR(ir->op2)->i);
      } else {
        Reg r = ra_alloc1(as, ir->op2, allow);
        emit_movtomro(as, r, RID_RET, ofs);
        rset_clear(allow, r);
      }
      if (ofs == sizeof(GCcdata)) break;
      ofs -= 4; ir--;
    } while (1);
#endif
    lua_assert(sz == 4 || sz == 8);
  }

  /* Combine initialization of marked, gct and ctypeid. */
  emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
  emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
           (int32_t)((~LJ_TCDATA<<8)+(ctypeid<<16)));
  emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
  emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);

  asm_gencall(as, ci, args);
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
}
#else
#define asm_cnew(as, ir)	((void)0)
#endif

/* -- Write barriers ------------------------------------------------------ */

static void asm_tbar(ASMState *as, IRIns *ir)
{
  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
  MCLabel l_end = emit_label(as);
  emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist));
  emit_setgl(as, tab, gc.grayagain);
  emit_getgl(as, tmp, gc.grayagain);
  emit_i8(as, ~LJ_GC_BLACK);
  emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
}
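
/* In machine-code order this reads: test the table's black bit, skip the
** barrier if it is clear, otherwise clear the black bit and link the table
** onto g->gc.grayagain -- the usual backward barrier for tables.
*/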

static void asm_obar(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
  IRRef args[2];
  MCLabel l_end;
  Reg obj;
  /* No need for other object barriers (yet). */
  lua_assert(IR(ir->op1)->o == IR_UREFC);
  ra_evictset(as, RSET_SCRATCH);
  l_end = emit_label(as);
  args[0] = ASMREF_TMP1;  /* global_State *g */
  args[1] = ir->op1;      /* TValue *tv      */
  asm_gencall(as, ci, args);
  emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
  obj = IR(ir->op1)->r;
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_WHITES);
  if (irref_isk(ir->op2)) {
    GCobj *vp = ir_kgc(IR(ir->op2));
    emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
  } else {
    Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
    emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
  }
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
            (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
}

/* -- FP/int arithmetic and logic operations ------------------------------ */

/* Load reference onto x87 stack. Force a spill to memory if needed. */
static void asm_x87load(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (ir->o == IR_KNUM) {
    cTValue *tv = ir_knum(ir);
    if (tvispzero(tv))  /* Use fldz only for +0. */
      emit_x87op(as, XI_FLDZ);
    else if (tvispone(tv))
      emit_x87op(as, XI_FLD1);
    else
      emit_rma(as, XO_FLDq, XOg_FLDq, tv);
  } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && !ra_used(ir) &&
             !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
    IRIns *iri = IR(ir->op1);
    emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
  } else {
    emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
  }
}

/* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
static int fpmjoin_pow(ASMState *as, IRIns *ir)
{
  IRIns *irp = IR(ir->op1);
  if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
    IRIns *irpp = IR(irp->op1);
    if (irpp == ir-2 && irpp->o == IR_FPMATH &&
        irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
      /* The modified regs must match with the *.dasc implementation. */
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
      IRIns *irx;
      if (ra_hasreg(ir->r))
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
      ra_evictset(as, drop);
      ra_destreg(as, ir, RID_XMM0);
      emit_call(as, lj_vm_pow_sse);
      irx = IR(irpp->op1);
      if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
        irx->r = RID_INIT;  /* Avoid allocating xmm1 for x. */
      ra_left(as, RID_XMM0, irpp->op1);
      ra_left(as, RID_XMM1, irp->op2);
      return 1;
    }
  }
  return 0;
}
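
/* This relies on pow(x, y) == exp2(y*log2(x)): when the LOG2, MUL and
** EXP2 instructions are still adjacent and otherwise unused, they are
** collapsed into one call to lj_vm_pow_sse with x in xmm0 and y in xmm1.
*/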

static void asm_fpmath(ASMState *as, IRIns *ir)
{
  IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER;
  if (fpm == IRFPM_SQRT) {
    Reg dest = ra_dest(as, ir, RSET_FPR);
    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
    emit_mrm(as, XO_SQRTSD, dest, left);
  } else if (fpm <= IRFPM_TRUNC) {
    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
      Reg dest = ra_dest(as, ir, RSET_FPR);
      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
      ** This is atrocious, but the alternatives are much worse.
      */
      /* Round down/up/trunc == 1001/1010/1011. */
      emit_i8(as, 0x09 + fpm);
      emit_mrm(as, XO_ROUNDSD, dest, left);
      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
        as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
      }
      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
    } else {  /* Call helper functions for SSE2 variant. */
      /* The modified regs must match with the *.dasc implementation. */
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
      if (ra_hasreg(ir->r))
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
      ra_evictset(as, drop);
      ra_destreg(as, ir, RID_XMM0);
      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
                    fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
      ra_left(as, RID_XMM0, ir->op1);
    }
  } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
    /* Rejoined to pow(). */
  } else {  /* Handle x87 ops. */
    int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
    Reg dest = ir->r;
    if (ra_hasreg(dest)) {
      ra_free(as, dest);
      ra_modified(as, dest);
      emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
    }
    emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
    switch (fpm) {  /* st0 = lj_vm_*(st0) */
    case IRFPM_EXP: emit_call(as, lj_vm_exp_x87); break;
    case IRFPM_EXP2: emit_call(as, lj_vm_exp2_x87); break;
    case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
    case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
    case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
    case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
      /* Note: the use of fyl2xp1 would be pointless here. When computing
      ** log(1.0+eps) the precision is already lost after 1.0 is added.
      ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
      */
      emit_x87op(as, XI_FYL2X); break;
    case IRFPM_OTHER:
      switch (ir->o) {
      case IR_ATAN2:
        emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
      case IR_LDEXP:
        emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
      default: lua_assert(0); break;
      }
      break;
    default: lua_assert(0); break;
    }
    asm_x87load(as, ir->op1);
    switch (fpm) {
    case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
    case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
    case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
    case IRFPM_OTHER:
      if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
      break;
    default: break;
    }
  }
}
static void asm_fppowi(ASMState *as, IRIns *ir)
{
  /* The modified regs must match with the *.dasc implementation. */
  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
  if (ra_hasreg(ir->r))
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
  ra_evictset(as, drop);
  ra_destreg(as, ir, RID_XMM0);
  emit_call(as, lj_vm_powi_sse);
  ra_left(as, RID_XMM0, ir->op1);
  ra_left(as, RID_EAX, ir->op2);
}
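/* As wired up above, lj_vm_powi_sse takes the FP base in xmm0 and the
** integer exponent in eax; the result is claimed back in xmm0 via
** ra_destreg().
*/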
#if LJ_64 && LJ_HASFFI
static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
{
  const CCallInfo *ci = &lj_ir_callinfo[id];
  IRRef args[2];
  args[0] = ir->op1;
  args[1] = ir->op2;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}
#endif
static void asm_intmod(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_vm_modi];
  IRRef args[2];
  args[0] = ir->op1;
  args[1] = ir->op2;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}
static int asm_swapops(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  lua_assert(ra_noreg(irr->r));
  if (!irm_iscomm(lj_ir_mode[ir->o]))
    return 0;  /* Can't swap non-commutative operations. */
  if (irref_isk(ir->op2))
    return 0;  /* Don't swap constants to the left. */
  if (ra_hasreg(irl->r))
    return 1;  /* Swap if left already has a register. */
  if (ra_samehint(ir->r, irr->r))
    return 1;  /* Swap if dest and right have matching hints. */
  if (as->curins > as->loopref) {  /* In variant part? */
    if (ir->op2 < as->loopref && !irt_isphi(irr->t))
      return 0;  /* Keep invariants on the right. */
    if (ir->op1 < as->loopref && !irt_isphi(irl->t))
      return 1;  /* Swap invariants to the right. */
  }
  if (opisfusableload(irl->o))
    return 1;  /* Swap fusable loads to the right. */
  return 0;  /* Otherwise don't swap. */
}
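/* Rationale for the heuristics above: two-operand x86 instructions overwrite
** their left operand and can only take a memory operand on the right. So
** an already-allocated (or hint-matching) ref is preferred on the left,
** while fusable loads and loop-invariant refs are kept on the right.
*/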
static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_FPR;
  Reg dest;
  Reg right = IR(rref)->r;
  if (ra_hasreg(right)) {
    rset_clear(allow, right);
    ra_noweak(as, right);
  }
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right)) {
    if (asm_swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseload(as, rref, rset_clear(allow, dest));
  }
  emit_mrm(as, xo, dest, right);
  ra_left(as, dest, lref);
}
static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_GPR;
  Reg dest, right;
  int32_t k = 0;
  if (as->flagmcp == as->mcp) {  /* Drop test r,r instruction. */
    MCode *p = as->mcp + ((LJ_64 && *as->mcp < XI_TESTb) ? 3 : 2);
    if ((p[1] & 15) < 14) {
      if ((p[1] & 15) >= 12) p[1] -= 4;  /* L <->S, NL <-> NS */
      as->flagmcp = NULL;
      as->mcp = p;
    }  /* else: cannot transform LE/NLE to cc without use of OF. */
  }
  right = IR(rref)->r;
  if (ra_hasreg(right)) {
    rset_clear(allow, right);
    ra_noweak(as, right);
  }
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right) && !asm_isk32(as, rref, &k)) {
    if (asm_swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseloadm(as, rref, rset_clear(allow, dest), irt_is64(ir->t));
  }
  if (irt_isguard(ir->t))  /* For IR_ADDOV etc. */
    asm_guardcc(as, CC_O);
  if (xa != XOg_X_IMUL) {
    if (ra_hasreg(right))
      emit_mrm(as, XO_ARITH(xa), REX_64IR(ir, dest), right);
    else
      emit_gri(as, XG_ARITHi(xa), REX_64IR(ir, dest), k);
  } else if (ra_hasreg(right)) {  /* IMUL r, mrm. */
    emit_mrm(as, XO_IMUL, REX_64IR(ir, dest), right);
  } else {  /* IMUL r, r, k. */
    /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */
    Reg left = asm_fuseloadm(as, lref, RSET_GPR, irt_is64(ir->t));
    x86Op xo;
    if (checki8(k)) { emit_i8(as, k); xo = XO_IMULi8;
    } else { emit_i32(as, k); xo = XO_IMULi; }
    emit_mrm(as, xo, REX_64IR(ir, dest), left);
    return;
  }
  ra_left(as, dest, lref);
}
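/* The as->flagmcp check above drops a test r,r that was emitted for a
** comparison of this op's result against zero: the arithmetic instruction
** sets ZF/SF itself, so the test is removed and a following jl/jnl is
** rewritten to js/jns. jle/jnle cannot be rewritten this way, since those
** conditions also depend on OF.
*/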
/* LEA is really a 4-operand ADD with an independent destination register,
** up to two source registers and an immediate. One register can be scaled
** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
** simple ADDs into a single instruction.
**
** Currently only a few common cases are supported:
** - 3-operand ADD: y = a+b; y = a+k   with a and b already allocated
** - Left ADD fusion:  y = (a+b)+k; y = (a+k)+b
** - Right ADD fusion: y = a+(b+k)
** The omitted variants have already been reduced by FOLD.
**
** There are more fusion opportunities, like gathering shifts or joining
** common references. But these are probably not worth the trouble, since
** array indexing is not decomposed and already makes use of all fields
** of the ModRM operand.
*/
static int asm_lea(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  RegSet allow = RSET_GPR;
  Reg dest;
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = 0;
  if (ra_hasreg(irl->r)) {
    rset_clear(allow, irl->r);
    ra_noweak(as, irl->r);
    as->mrm.base = irl->r;
    if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
      /* The PHI renaming logic does a better job in some cases. */
      if (ra_hasreg(ir->r) &&
          ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
           (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
        return 0;
      if (irref_isk(ir->op2)) {
        as->mrm.ofs = irr->i;
      } else {
        rset_clear(allow, irr->r);
        ra_noweak(as, irr->r);
        as->mrm.idx = irr->r;
      }
    } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
               irref_isk(irr->op2)) {
      Reg idx = ra_alloc1(as, irr->op1, allow);
      rset_clear(allow, idx);
      as->mrm.idx = (uint8_t)idx;
      as->mrm.ofs = IR(irr->op2)->i;
    } else {
      return 0;
    }
  } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
             (irref_isk(ir->op2) || irref_isk(irl->op2))) {
    Reg idx, base = ra_alloc1(as, irl->op1, allow);
    rset_clear(allow, base);
    as->mrm.base = (uint8_t)base;
    if (irref_isk(ir->op2)) {
      as->mrm.ofs = irr->i;
      idx = ra_alloc1(as, irl->op2, allow);
    } else {
      as->mrm.ofs = IR(irl->op2)->i;
      idx = ra_alloc1(as, ir->op2, allow);
    }
    rset_clear(allow, idx);
    as->mrm.idx = (uint8_t)idx;
  } else {
    return 0;
  }
  dest = ra_dest(as, ir, allow);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
  return 1;  /* Success. */
}
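/* Illustration of the left ADD fusion handled above: for y = (a+k)+b with
** a and b already in registers (say eax and ecx), the address is folded
** into the ModRM operand and a single "lea y, [eax+ecx+k]" is emitted
** instead of two adds. The register names here are only examples.
*/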
static void asm_add(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_ADDSD);
  else if ((as->flags & JIT_F_LEA_AGU) || as->flagmcp == as->mcp ||
           irt_is64(ir->t) || !asm_lea(as, ir))
    asm_intarith(as, ir, XOg_ADD);
}
static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  emit_rr(as, XO_GROUP3, REX_64IR(ir, xg), dest);
  ra_left(as, dest, ir->op1);
}
static void asm_min_max(ASMState *as, IRIns *ir, int cc)
{
  Reg right, dest = ra_dest(as, ir, RSET_GPR);
  IRRef lref = ir->op1, rref = ir->op2;
  if (irref_isk(rref)) { lref = rref; rref = ir->op1; }
  right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, dest));
  emit_rr(as, XO_CMOV + (cc<<24), REX_64IR(ir, dest), right);
  emit_rr(as, XO_CMP, REX_64IR(ir, dest), right);
  ra_left(as, dest, lref);
}
static void asm_bitswap(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
                    REX_64IR(ir, 0), dest, 0, as->mcp, 1);
  ra_left(as, dest, ir->op1);
}
static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
{
  IRRef rref = ir->op2;
  IRIns *irr = IR(rref);
  Reg dest;
  if (irref_isk(rref)) {  /* Constant shifts. */
    int shift;
    dest = ra_dest(as, ir, RSET_GPR);
    shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
    switch (shift) {
    case 0: break;
    case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
    default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
    }
  } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
    Reg right;
    dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
    if (dest == RID_ECX) {
      dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX));
      emit_rr(as, XO_MOV, RID_ECX, dest);
    }
    right = irr->r;
    if (ra_noreg(right))
      right = ra_allocref(as, rref, RID2RSET(RID_ECX));
    else if (right != RID_ECX)
      ra_scratch(as, RID2RSET(RID_ECX));
    emit_rr(as, XO_SHIFTcl, REX_64IR(ir, xs), dest);
    ra_noweak(as, right);
    if (right != RID_ECX)
      emit_rr(as, XO_MOV, RID_ECX, right);
  }
  ra_left(as, dest, ir->op1);
  /*
  ** Note: avoid using the flags resulting from a shift or rotate!
  ** All of them cause a partial flag stall, except for r,1 shifts
  ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
  */
}
/* -- Comparisons --------------------------------------------------------- */

/* Virtual flags for unordered FP comparisons. */
#define VCC_U	0x1000		/* Unordered. */
#define VCC_P	0x2000		/* Needs extra CC_P branch. */
#define VCC_S	0x4000		/* Swap avoids CC_P branch. */
#define VCC_PS	(VCC_P|VCC_S)

/* Map of comparisons to flags. ORDER IR. */
#define COMPFLAGS(ci, cin, cu, cf)	((ci)+((cu)<<4)+((cin)<<8)+(cf))
static const uint16_t asm_compmap[IR_ABC+1] = {
  /*                 signed non-eq unsigned flags */
  /* LT  */ COMPFLAGS(CC_GE, CC_G,  CC_AE, VCC_PS),
  /* GE  */ COMPFLAGS(CC_L,  CC_L,  CC_B,  0),
  /* LE  */ COMPFLAGS(CC_G,  CC_G,  CC_A,  VCC_PS),
  /* GT  */ COMPFLAGS(CC_LE, CC_L,  CC_BE, 0),
  /* ULT */ COMPFLAGS(CC_AE, CC_A,  CC_AE, VCC_U),
  /* UGE */ COMPFLAGS(CC_B,  CC_B,  CC_B,  VCC_U|VCC_PS),
  /* ULE */ COMPFLAGS(CC_A,  CC_A,  CC_A,  VCC_U),
  /* UGT */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS),
  /* EQ  */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
  /* NE  */ COMPFLAGS(CC_E,  CC_E,  CC_E,  VCC_U|VCC_P),
  /* ABC */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS)  /* Same as UGT. */
};
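/* Each asm_compmap entry packs four fields: the low nibble holds the
** (negated) condition used for the integer guard branch, bits 4-7 the
** unsigned variant (used for FP compares and the loword compare via
** cc >> 4), bits 8-11 the non-equality variant for the hiword compare
** (cc >> 8), and the top bits the VCC_* flags defined above.
*/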
/* FP and integer comparisons. */
static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
{
  if (irt_isnum(ir->t)) {
    IRRef lref = ir->op1;
    IRRef rref = ir->op2;
    Reg left, right;
    MCLabel l_around;
    /*
    ** An extra CC_P branch is required to preserve ordered/unordered
    ** semantics for FP comparisons. This can be avoided by swapping
    ** the operands and inverting the condition (except for EQ and UNE).
    ** So always try to swap if possible.
    **
    ** Another option would be to swap operands to achieve better memory
    ** operand fusion. But it's unlikely that this outweighs the cost
    ** of the extra branches.
    */
    if (cc & VCC_S) {  /* Swap? */
      IRRef tmp = lref; lref = rref; rref = tmp;
      cc ^= (VCC_PS|(5<<4));  /* A <-> B, AE <-> BE, PS <-> none */
    }
    left = ra_alloc1(as, lref, RSET_FPR);
    right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
    l_around = emit_label(as);
    asm_guardcc(as, cc >> 4);
    if (cc & VCC_P) {  /* Extra CC_P branch required? */
      if (!(cc & VCC_U)) {
        asm_guardcc(as, CC_P);  /* Branch to exit for ordered comparisons. */
      } else if (l_around != as->invmcp) {
        emit_sjcc(as, CC_P, l_around);  /* Branch around for unordered. */
      } else {
        /* Patched to mcloop by asm_loop_fixup. */
        as->loopinv = 2;
        if (as->realign)
          emit_sjcc(as, CC_P, as->mcp);
        else
          emit_jcc(as, CC_P, as->mcp);
      }
    }
    emit_mrm(as, XO_UCOMISD, left, right);
  } else {
    IRRef lref = ir->op1, rref = ir->op2;
    IROp leftop = (IROp)(IR(lref)->o);
    Reg r64 = REX_64IR(ir, 0);
    int32_t imm = 0;
    lua_assert(irt_is64(ir->t) || irt_isint(ir->t) ||
               irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t));
    /* Swap constants (only for ABC) and fusable loads to the right. */
    if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
      if ((cc & 0xc) == 0xc) cc ^= 0x53;  /* L <-> G, LE <-> GE */
      else if ((cc & 0xa) == 0x2) cc ^= 0x55;  /* A <-> B, AE <-> BE */
      lref = ir->op2; rref = ir->op1;
    }
    if (asm_isk32(as, rref, &imm)) {
      IRIns *irl = IR(lref);
      /* Check whether we can use test ins. Not for unsigned, since CF=0. */
      int usetest = (imm == 0 && (cc & 0xa) != 0x2);
      if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
        /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
        Reg right, left = RID_NONE;
        RegSet allow = RSET_GPR;
        if (!asm_isk32(as, irl->op2, &imm)) {
          left = ra_alloc1(as, irl->op2, allow);
          rset_clear(allow, left);
        } else {  /* Try to Fuse IRT_I8/IRT_U8 loads, too. See below. */
          IRIns *irll = IR(irl->op1);
          if (opisfusableload((IROp)irll->o) &&
              (irt_isi8(irll->t) || irt_isu8(irll->t))) {
            IRType1 origt = irll->t;  /* Temporarily flip types. */
            irll->t.irt = (irll->t.irt & ~IRT_TYPE) | IRT_INT;
            as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
            right = asm_fuseload(as, irl->op1, RSET_GPR);
            as->curins++;
            irll->t = origt;
            if (right != RID_MRM) goto test_nofuse;
            /* Fusion succeeded, emit test byte mrm, imm8. */
            asm_guardcc(as, cc);
            emit_i8(as, (imm & 0xff));
            emit_mrm(as, XO_GROUP3b, XOg_TEST, RID_MRM);
            return;
          }
        }
        as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
        right = asm_fuseloadm(as, irl->op1, allow, r64);
        as->curins++;  /* Undo the above. */
      test_nofuse:
        asm_guardcc(as, cc);
        if (ra_noreg(left)) {
          emit_i32(as, imm);
          emit_mrm(as, XO_GROUP3, r64 + XOg_TEST, right);
        } else {
          emit_mrm(as, XO_TEST, r64 + left, right);
        }
      } else {
        Reg left;
        if (opisfusableload((IROp)irl->o) &&
            ((irt_isu8(irl->t) && checku8(imm)) ||
             ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
             (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
          /* Only the IRT_INT case is fused by asm_fuseload.
          ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
          ** are handled here.
          ** Note that cmp word [mem], imm16 should not be generated,
          ** since it has a length-changing prefix. Compares of a word
          ** against a sign-extended imm8 are ok, however.
          */
          IRType1 origt = irl->t;  /* Temporarily flip types. */
          irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
          left = asm_fuseload(as, lref, RSET_GPR);
          irl->t = origt;
          if (left == RID_MRM) {  /* Fusion succeeded? */
            if (irt_isu8(irl->t) || irt_isu16(irl->t))
              cc >>= 4;  /* Need unsigned compare. */
            asm_guardcc(as, cc);
            emit_i8(as, imm);
            emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
                         XO_ARITHib : XO_ARITHiw8, r64 + XOg_CMP, RID_MRM);
            return;
          }  /* Otherwise handle register case as usual. */
        } else {
          left = asm_fuseloadm(as, lref,
                               irt_isu8(ir->t) ? RSET_GPR8 : RSET_GPR, r64);
        }
        asm_guardcc(as, cc);
        if (usetest && left != RID_MRM) {
          /* Use test r,r instead of cmp r,0. */
          x86Op xo = XO_TEST;
          if (irt_isu8(ir->t)) {
            lua_assert(ir->o == IR_EQ || ir->o == IR_NE);
            xo = XO_TESTb;
            if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) {
              if (LJ_64) {
                left |= FORCE_REX;
              } else {
                emit_i32(as, 0xff);
                emit_mrm(as, XO_GROUP3, XOg_TEST, left);
                return;
              }
            }
          }
          emit_rr(as, xo, r64 + left, left);
          if (irl+1 == ir)  /* Referencing previous ins? */
            as->flagmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
        } else {
          emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
        }
      }
    } else {
      Reg left = ra_alloc1(as, lref, RSET_GPR);
      Reg right = asm_fuseloadm(as, rref, rset_exclude(RSET_GPR, left), r64);
      asm_guardcc(as, cc);
      emit_mrm(as, XO_CMP, r64 + left, right);
    }
  }
}
#if LJ_32 && LJ_HASFFI
/* 64 bit integer comparisons in 32 bit mode. */
static void asm_comp_int64(ASMState *as, IRIns *ir)
{
  uint32_t cc = asm_compmap[(ir-1)->o];
  RegSet allow = RSET_GPR;
  Reg lefthi = RID_NONE, leftlo = RID_NONE;
  Reg righthi = RID_NONE, rightlo = RID_NONE;
  MCLabel l_around;
  x86ModRM mrm;

  as->curins--;  /* Skip loword ins. Avoids failing in noconflict(), too. */

  /* Allocate/fuse hiword operands. */
  if (irref_isk(ir->op2)) {
    lefthi = asm_fuseload(as, ir->op1, allow);
  } else {
    lefthi = ra_alloc1(as, ir->op1, allow);
    rset_clear(allow, lefthi);
    righthi = asm_fuseload(as, ir->op2, allow);
    if (righthi == RID_MRM) {
      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
    } else {
      rset_clear(allow, righthi);
    }
  }
  mrm = as->mrm;  /* Save state for hiword instruction. */

  /* Allocate/fuse loword operands. */
  if (irref_isk((ir-1)->op2)) {
    leftlo = asm_fuseload(as, (ir-1)->op1, allow);
  } else {
    leftlo = ra_alloc1(as, (ir-1)->op1, allow);
    rset_clear(allow, leftlo);
    rightlo = asm_fuseload(as, (ir-1)->op2, allow);
  }

  /* All register allocations must be performed _before_ this point. */
  l_around = emit_label(as);
  as->invmcp = as->flagmcp = NULL;  /* Cannot use these optimizations. */

  /* Loword comparison and branch. */
  asm_guardcc(as, cc >> 4);  /* Always use unsigned compare for loword. */
  if (ra_noreg(rightlo)) {
    int32_t imm = IR((ir-1)->op2)->i;
    if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
      emit_rr(as, XO_TEST, leftlo, leftlo);
    else
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
  } else {
    emit_mrm(as, XO_CMP, leftlo, rightlo);
  }

  /* Hiword comparison and branches. */
  if ((cc & 15) != CC_NE)
    emit_sjcc(as, CC_NE, l_around);  /* Hiword unequal: skip loword compare. */
  if ((cc & 15) != CC_E)
    asm_guardcc(as, cc >> 8);  /* Hiword compare without equality check. */
  as->mrm = mrm;  /* Restore state. */
  if (ra_noreg(righthi)) {
    int32_t imm = IR(ir->op2)->i;
    if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
      emit_rr(as, XO_TEST, lefthi, lefthi);
    else
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
  } else {
    emit_mrm(as, XO_CMP, lefthi, righthi);
  }
}
#endif
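/* Runtime order of the code emitted above (emission is backwards): the
** hiword compare exits the trace if it alone decides the result (cc >> 8),
** skips the loword compare entirely if the hiwords differ, and otherwise
** falls through to an unsigned loword compare guarded with cc >> 4.
*/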
/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */

/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
static void asm_hiop(ASMState *as, IRIns *ir)
{
#if LJ_32 && LJ_HASFFI
  /* HIOP is marked as a store because it needs its own DCE logic. */
  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
  if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
    if (usehi || uselo) {
      if (irt_isfp(ir->t))
        asm_conv_fp_int64(as, ir);
      else
        asm_conv_int64_fp(as, ir);
    }
    as->curins--;  /* Always skip the CONV. */
    return;
  } else if ((ir-1)->o <= IR_NE) {  /* 64 bit integer comparisons. ORDER IR. */
    asm_comp_int64(as, ir);
    return;
  } else if ((ir-1)->o == IR_XSTORE) {
    if ((ir-1)->r != RID_SINK)
      asm_fxstore(as, ir);
    return;
  }
  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
  switch ((ir-1)->o) {
  case IR_ADD:
    as->flagmcp = NULL;
    as->curins--;
    asm_intarith(as, ir, XOg_ADC);
    asm_intarith(as, ir-1, XOg_ADD);
    break;
  case IR_SUB:
    as->flagmcp = NULL;
    as->curins--;
    asm_intarith(as, ir, XOg_SBB);
    asm_intarith(as, ir-1, XOg_SUB);
    break;
  case IR_NEG: {
    Reg dest = ra_dest(as, ir, RSET_GPR);
    emit_rr(as, XO_GROUP3, XOg_NEG, dest);
    emit_i8(as, 0);
    emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
    ra_left(as, dest, ir->op1);
    as->curins--;
    asm_neg_not(as, ir-1, XOg_NEG);
    break;
    }
  case IR_CALLN:
  case IR_CALLXS:
    if (!uselo)
      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
    break;
  case IR_CNEWI:
    /* Nothing to do here. Handled by CNEWI itself. */
    break;
  default: lua_assert(0); break;
  }
#else
  UNUSED(as); UNUSED(ir); lua_assert(0);  /* Unused on x64 or without FFI. */
#endif
}
2348 /* Check Lua stack size for overflow. Use exit handler as fallback. */
2349 static void asm_stack_check(ASMState
*as
, BCReg topslot
,
2350 IRIns
*irp
, RegSet allow
, ExitNo exitno
)
2352 /* Try to get an unused temp. register, otherwise spill/restore eax. */
2353 Reg pbase
= irp
? irp
->r
: RID_BASE
;
2354 Reg r
= allow
? rset_pickbot(allow
) : RID_EAX
;
2355 emit_jcc(as
, CC_B
, exitstub_addr(as
->J
, exitno
));
2356 if (allow
== RSET_EMPTY
) /* Restore temp. register. */
2357 emit_rmro(as
, XO_MOV
, r
|REX_64
, RID_ESP
, 0);
2360 emit_gri(as
, XG_ARITHi(XOg_CMP
), r
, (int32_t)(8*topslot
));
2361 if (ra_hasreg(pbase
) && pbase
!= r
)
2362 emit_rr(as
, XO_ARITH(XOg_SUB
), r
, pbase
);
2364 emit_rmro(as
, XO_ARITH(XOg_SUB
), r
, RID_NONE
,
2365 ptr2addr(&J2G(as
->J
)->jit_base
));
2366 emit_rmro(as
, XO_MOV
, r
, r
, offsetof(lua_State
, maxstack
));
2367 emit_getgl(as
, r
, jit_L
);
2368 if (allow
== RSET_EMPTY
) /* Spill temp. register. */
2369 emit_rmro(as
, XO_MOVto
, r
|REX_64
, RID_ESP
, 0);
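/* Since instructions are emitted in reverse, the check above runs as:
** load L from g->jit_L, load L->maxstack, subtract BASE (or g->jit_base),
** compare the remaining space against 8*topslot bytes and branch to the
** exit stub if it is too small. When no free register is available, eax
** is spilled to [esp] around the check.
*/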
/* Restore Lua stack from on-trace state. */
static void asm_stack_restore(ASMState *as, SnapShot *snap)
{
  SnapEntry *map = &as->T->snapmap[snap->mapofs];
  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1];
  MSize n, nent = snap->nent;
  /* Store the value of all modified slots to the Lua stack. */
  for (n = 0; n < nent; n++) {
    SnapEntry sn = map[n];
    BCReg s = snap_slot(sn);
    int32_t ofs = 8*((int32_t)s-1);
    IRRef ref = snap_ref(sn);
    IRIns *ir = IR(ref);
    if ((sn & SNAP_NORESTORE))
      continue;
    if (irt_isnum(ir->t)) {
      Reg src = ra_alloc1(as, ref, RSET_FPR);
      emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
    } else {
      lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) ||
                 (LJ_DUALNUM && irt_isinteger(ir->t)));
      if (!irref_isk(ref)) {
        Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
        emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
      } else if (!irt_ispri(ir->t)) {
        emit_movmroi(as, RID_BASE, ofs, ir->i);
      }
      if ((sn & (SNAP_CONT|SNAP_FRAME))) {
        if (s != 0)  /* Do not overwrite link to previous frame. */
          emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
      } else {
        if (!(LJ_64 && irt_islightud(ir->t)))
          emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
      }
    }
    checkmclim(as);
  }
  lua_assert(map + nent == flinks);
}
/* -- GC handling --------------------------------------------------------- */

/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
  IRRef args[2];
  MCLabel l_end;
  Reg tmp;
  ra_evictset(as, RSET_SCRATCH);
  l_end = emit_label(as);
  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
  emit_rr(as, XO_TEST, RID_RET, RID_RET);
  args[0] = ASMREF_TMP1;  /* global_State *g */
  args[1] = ASMREF_TMP2;  /* MSize steps     */
  asm_gencall(as, ci, args);
  tmp = ra_releasetmp(as, ASMREF_TMP1);
  emit_loada(as, tmp, J2G(as->J));
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps);
  /* Jump around GC step if GC total < GC threshold. */
  emit_sjcc(as, CC_B, l_end);
  emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold);
  emit_getgl(as, tmp, gc.total);
}
/* -- Loop handling ------------------------------------------------------- */

/* Fixup the loop branch. */
static void asm_loop_fixup(ASMState *as)
{
  MCode *p = as->mctop;
  MCode *target = as->mcp;
  if (as->realign) {  /* Realigned loops use short jumps. */
    as->realign = NULL;  /* Stop another retry. */
    lua_assert(((intptr_t)target & 15) == 0);
    if (as->loopinv) {  /* Inverted loop branch? */
      p -= 5;
      p[0] = XI_JMP;
      lua_assert(target - p >= -128);
      p[-1] = (MCode)(target - p);  /* Patch sjcc. */
      if (as->loopinv == 2)
        p[-3] = (MCode)(target - p + 2);  /* Patch opt. short jp. */
    } else {
      lua_assert(target - p >= -128);
      p[-1] = (MCode)(int8_t)(target - p);  /* Patch short jmp. */
      p[-2] = XI_JMPs;
    }
  } else {
    MCode *newloop;
    p[-5] = XI_JMP;
    if (as->loopinv) {  /* Inverted loop branch? */
      /* asm_guardcc already inverted the jcc and patched the jmp. */
      p -= 5;
      newloop = target+4;
      *(int32_t *)(p-4) = (int32_t)(target - p);  /* Patch jcc. */
      if (as->loopinv == 2) {
        *(int32_t *)(p-10) = (int32_t)(target - p + 6);  /* Patch opt. jp. */
        newloop = target+8;
      }
    } else {  /* Otherwise just patch jmp. */
      *(int32_t *)(p-4) = (int32_t)(target - p);
      newloop = target+3;
    }
    /* Realign small loops and shorten the loop branch. */
    if (newloop >= p - 128) {
      as->realign = newloop;  /* Force a retry and remember alignment. */
      as->curins = as->stopins;  /* Abort asm_trace now. */
      as->T->nins = as->orignins;  /* Remove any added renames. */
    }
  }
}
/* -- Head of trace ------------------------------------------------------- */

/* Coalesce BASE register for a root trace. */
static void asm_head_root_base(ASMState *as)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (r != RID_BASE)
      emit_rr(as, XO_MOV, r, RID_BASE);
  }
}

/* Coalesce or reload BASE register for a side trace. */
static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (irp->r == r) {
      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
    } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
      rset_clear(allow, irp->r);
      emit_rr(as, XO_MOV, r, irp->r);  /* Move from coalesced parent reg. */
    } else {
      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
    }
  }
  return allow;
}
2526 /* Fixup the tail code. */
2527 static void asm_tail_fixup(ASMState
*as
, TraceNo lnk
)
2529 /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
2530 MCode
*p
= as
->mctop
;
2532 int32_t spadj
= as
->T
->spadjust
;
2534 p
-= ((as
->flags
& JIT_F_LEA_AGU
) ? 7 : 6) + (LJ_64
? 1 : 0);
2537 /* Patch stack adjustment. */
2538 if (checki8(spadj
)) {
2544 *(int32_t *)p1
= spadj
;
2546 if ((as
->flags
& JIT_F_LEA_AGU
)) {
2550 p1
[-3] = (MCode
)XI_LEA
;
2551 p1
[-2] = MODRM(checki8(spadj
) ? XM_OFS8
: XM_OFS32
, RID_ESP
, RID_ESP
);
2552 p1
[-1] = MODRM(XM_SCALE1
, RID_ESP
, RID_ESP
);
2557 p1
[-2] = (MCode
)(checki8(spadj
) ? XI_ARITHi8
: XI_ARITHi
);
2558 p1
[-1] = MODRM(XM_REG
, XOg_ADD
, RID_ESP
);
2561 /* Patch exit branch. */
2562 target
= lnk
? traceref(as
->J
, lnk
)->mcode
: (MCode
*)lj_vm_exit_interp
;
2563 *(int32_t *)(p
-4) = jmprel(p
, target
);
2565 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
2566 for (q
= as
->mctop
-1; q
>= p
; q
--)
/* Prepare tail of code. */
static void asm_tail_prep(ASMState *as)
{
  MCode *p = as->mctop;
  /* Realign and leave room for backwards loop branch or exit branch. */
  if (as->realign) {
    int i = ((int)(intptr_t)as->realign) & 15;
    /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
    while (i-- > 0)
      *--p = XI_NOP;
    as->mctop = p;
    p -= (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
  } else {
    p -= 5;  /* Space for exit branch (near jmp). */
  }
  if (as->loopref) {
    as->invmcp = as->mcp = p;
  } else {
    /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
    as->mcp = p - (((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0));
    as->invmcp = NULL;
  }
}
/* -- Instruction dispatch ------------------------------------------------ */

/* Assemble a single instruction. */
static void asm_ir(ASMState *as, IRIns *ir)
{
  switch ((IROp)ir->o) {
  /* Miscellaneous ops. */
  case IR_LOOP: asm_loop(as); break;
  case IR_NOP: case IR_XBAR: lua_assert(!ra_used(ir)); break;
  case IR_USE:
    ra_alloc1(as, ir->op1, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); break;
  case IR_PHI: asm_phi(as, ir); break;
  case IR_HIOP: asm_hiop(as, ir); break;
  case IR_GCSTEP: asm_gcstep(as, ir); break;

  /* Guarded assertions. */
  case IR_LT: case IR_GE: case IR_LE: case IR_GT:
  case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
  case IR_EQ: case IR_NE: case IR_ABC:
    asm_comp(as, ir, asm_compmap[ir->o]);
    break;

  case IR_RETF: asm_retf(as, ir); break;

  /* Bit ops. */
  case IR_BNOT: asm_neg_not(as, ir, XOg_NOT); break;
  case IR_BSWAP: asm_bitswap(as, ir); break;

  case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
  case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
  case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;

  case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
  case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
  case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
  case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
  case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;

  /* Arithmetic ops. */
  case IR_ADD: asm_add(as, ir); break;
  case IR_SUB:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_SUBSD);
    else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
      asm_intarith(as, ir, XOg_SUB);
    break;
  case IR_MUL:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_MULSD);
    else
      asm_intarith(as, ir, XOg_X_IMUL);
    break;
  case IR_DIV:
#if LJ_64 && LJ_HASFFI
    if (!irt_isnum(ir->t))
      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
                                             IRCALL_lj_carith_divu64);
    else
#endif
      asm_fparith(as, ir, XO_DIVSD);
    break;
  case IR_MOD:
#if LJ_64 && LJ_HASFFI
    if (!irt_isint(ir->t))
      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
                                             IRCALL_lj_carith_modu64);
    else
#endif
      asm_intmod(as, ir);
    break;

  case IR_NEG:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_XORPS);
    else
      asm_neg_not(as, ir, XOg_NEG);
    break;
  case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;

  case IR_MIN:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_MINSD);
    else
      asm_min_max(as, ir, CC_G);
    break;
  case IR_MAX:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_MAXSD);
    else
      asm_min_max(as, ir, CC_L);
    break;

  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
    asm_fpmath(as, ir);
    break;
  case IR_POW:
#if LJ_64 && LJ_HASFFI
    if (!irt_isnum(ir->t))
      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
                                             IRCALL_lj_carith_powu64);
    else
#endif
      asm_fppowi(as, ir);
    break;

  /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
  case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
  case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
  case IR_MULOV: asm_intarith(as, ir, XOg_X_IMUL); break;

  /* Memory references. */
  case IR_AREF: asm_aref(as, ir); break;
  case IR_HREF: asm_href(as, ir); break;
  case IR_HREFK: asm_hrefk(as, ir); break;
  case IR_NEWREF: asm_newref(as, ir); break;
  case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
  case IR_FREF: asm_fref(as, ir); break;
  case IR_STRREF: asm_strref(as, ir); break;

  /* Loads and stores. */
  case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
    asm_ahuvload(as, ir);
    break;
  case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
  case IR_SLOAD: asm_sload(as, ir); break;

  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
  case IR_FSTORE: case IR_XSTORE: asm_fxstore(as, ir); break;

  /* Allocations. */
  case IR_SNEW: case IR_XSNEW: asm_snew(as, ir); break;
  case IR_TNEW: asm_tnew(as, ir); break;
  case IR_TDUP: asm_tdup(as, ir); break;
  case IR_CNEW: case IR_CNEWI: asm_cnew(as, ir); break;

  /* Write barriers. */
  case IR_TBAR: asm_tbar(as, ir); break;
  case IR_OBAR: asm_obar(as, ir); break;

  /* Type conversions. */
  case IR_TOBIT: asm_tobit(as, ir); break;
  case IR_CONV: asm_conv(as, ir); break;
  case IR_TOSTR: asm_tostr(as, ir); break;
  case IR_STRTO: asm_strto(as, ir); break;

  /* Calls. */
  case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
  case IR_CALLXS: asm_callx(as, ir); break;
  case IR_CARG: break;

  default:
    setintV(&as->J->errinfo, ir->o);
    lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
    break;
  }
}
/* -- Trace setup --------------------------------------------------------- */

/* Ensure there are enough stack slots for call arguments. */
static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  IRRef args[CCI_NARGS_MAX*2];
  int nslots;
  asm_collectargs(as, ir, ci, args);
  nslots = asm_count_call_slots(as, ci, args);
  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
    as->evenspill = nslots;
#if LJ_64
  return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
#else
  return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
#endif
}
/* Target-specific setup. */
static void asm_setup_target(ASMState *as)
{
  asm_exitstub_setup(as, as->T->nsnap);
}
/* -- Trace patching ------------------------------------------------------ */

static const uint8_t map_op1[256] = {
0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x20,
0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,
0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
#if LJ_64
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14,
#else
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
#endif
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
0x51,0x51,0x92,0x92,0x10,0x10,0x12,0x11,0x45,0x86,0x52,0x93,0x51,0x51,0x51,0x51,
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
0x93,0x86,0x93,0x93,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x47,0x51,0x51,0x51,0x51,0x51,
#if LJ_64
0x59,0x59,0x59,0x59,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
#else
0x55,0x55,0x55,0x55,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
#endif
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05,
0x93,0x93,0x53,0x51,0x70,0x71,0x93,0x86,0x54,0x51,0x53,0x51,0x51,0x52,0x51,0x51,
0x92,0x92,0x92,0x92,0x52,0x52,0x51,0x51,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x45,0x45,0x47,0x52,0x51,0x51,0x51,0x51,
0x10,0x51,0x10,0x10,0x51,0x51,0x63,0x66,0x51,0x51,0x51,0x51,0x51,0x51,0x92,0x92
};

static const uint8_t map_op2[256] = {
0x93,0x93,0x93,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x51,0x52,0x51,0x93,0x52,0x94,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x34,0x51,0x35,0x51,0x51,0x51,0x51,0x51,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x94,0x54,0x54,0x54,0x93,0x93,0x93,0x52,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x52,0x52,0x52,0x93,0x94,0x93,0x51,0x51,0x52,0x52,0x52,0x93,0x94,0x93,0x93,0x93,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x94,0x93,0x93,0x93,0x93,0x93,
0x93,0x93,0x94,0x93,0x94,0x94,0x94,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x52
};
static uint32_t asm_x86_inslen(const uint8_t* p)
{
  uint32_t result = 0;
  uint32_t prefixes = 0;
  uint32_t x = map_op1[*p];
  for (;;) {
    switch (x >> 4) {
    case 0: return result + x + (prefixes & 4);
    case 1: prefixes |= x; x = map_op1[*++p]; result++; break;
    case 2: x = map_op2[*++p]; break;
    case 3: p++; goto mrm;
    case 4: result -= (prefixes & 2);  /* fallthrough */
    case 5: return result + (x & 15);
    case 6:  /* Group 3. */
      if (p[1] & 0x38) x = 2;
      else if ((prefixes & 2) && (x == 0x66)) x = 4;
      goto mrm;
    case 7: /* VEX c4/c5. */
      if (LJ_32 && p[1] < 0xc0) {
        x = map_op1[*++p];
        break;  /* Treat as LES/LDS. */
      }
      if (x == 0x70) {  /* 3-byte VEX (c4). */
        x = *++p & 0x1f;
        result++;
        if (x >= 2) {
          p += 2;
          result += 2;
          goto mrm;
        }
      }
      p++;
      result++;
      x = map_op2[*++p];
      break;
    case 8: result -= (prefixes & 2);  /* fallthrough */
    case 9: mrm:  /* ModR/M and possibly SIB. */
      result += (x & 15);
      x = *++p;
      switch (x >> 6) {
      case 0: if ((x & 7) == 5) return result + 4; break;
      case 1: result++; break;
      case 2: result += 4; break;
      case 3: return result;
      }
      if ((x & 7) == 4) {
        result++;
        if (x < 0x40 && (p[1] & 7) == 5) result += 4;
      }
      return result;
    }
  }
}
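/* Rough layout of the tables consumed above: the high nibble of each entry
** selects the decoder action in asm_x86_inslen() (0 = fixed length, 1 =
** prefix byte, 2 = 0F escape, 6 = group 3, 7 = VEX, 8/9 = ModR/M follows,
** etc.), while the low nibble carries the remaining byte count or the
** prefix flag bits.
*/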
/* Patch exit jumps of existing machine code to a new target. */
void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
{
  MCode *p = T->mcode;
  MCode *mcarea = lj_mcode_patch(J, p, 0);
  MSize len = T->szmcode;
  MCode *px = exitstub_addr(J, exitno) - 6;
  MCode *pe = p+len-6;
  uint32_t stateaddr = u32ptr(&J2G(J)->vmstate);
  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
    *(int32_t *)(p+len-4) = jmprel(p+len, target);
  /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
  for (; p < pe; p += asm_x86_inslen(p))
    if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi)
      break;
  lua_assert(p < pe);
  for (; p < pe; p += asm_x86_inslen(p))
    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px)
      *(int32_t *)(p+2) = jmprel(p+6, target);
  lj_mcode_sync(T->mcode, T->mcode + T->szmcode);
  lj_mcode_patch(J, mcarea, 1);
}