Minor fixes for x64 interpreter.
[luajit-2.0/celess22.git] / src / lj_asm.c
blob eb14b0e5940bebea2ba7c370d505110895e6f87c
1 /*
2 ** IR assembler (SSA IR -> machine code).
3 ** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
4 */
6 #define lj_asm_c
7 #define LUA_CORE
9 #include "lj_obj.h"
11 #if LJ_HASJIT
13 #include "lj_gc.h"
14 #include "lj_str.h"
15 #include "lj_tab.h"
16 #include "lj_frame.h"
17 #include "lj_ir.h"
18 #include "lj_jit.h"
19 #include "lj_iropt.h"
20 #include "lj_mcode.h"
21 #include "lj_iropt.h"
22 #include "lj_trace.h"
23 #include "lj_snap.h"
24 #include "lj_asm.h"
25 #include "lj_dispatch.h"
26 #include "lj_vm.h"
27 #include "lj_target.h"
29 /* -- Assembler state and common macros ----------------------------------- */
31 /* Assembler state. */
32 typedef struct ASMState {
33 RegCost cost[RID_MAX]; /* Reference and blended allocation cost for regs. */
35 MCode *mcp; /* Current MCode pointer (grows down). */
36 MCode *mclim; /* Lower limit for MCode memory + red zone. */
38 IRIns *ir; /* Copy of pointer to IR instructions/constants. */
39 jit_State *J; /* JIT compiler state. */
41 x86ModRM mrm; /* Fused x86 address operand. */
43 RegSet freeset; /* Set of free registers. */
44 RegSet modset; /* Set of registers modified inside the loop. */
45 RegSet phiset; /* Set of PHI registers. */
47 uint32_t flags; /* Copy of JIT compiler flags. */
48 int loopinv; /* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */
50 int32_t evenspill; /* Next even spill slot. */
51 int32_t oddspill; /* Next odd spill slot (or 0). */
53 IRRef curins; /* Reference of current instruction. */
54 IRRef stopins; /* Stop assembly before hitting this instruction. */
55 IRRef orignins; /* Original T->nins. */
57 IRRef snapref; /* Current snapshot is active after this reference. */
58 IRRef snaprename; /* Rename highwater mark for snapshot check. */
59 SnapNo snapno; /* Current snapshot number. */
60 SnapNo loopsnapno; /* Loop snapshot number. */
62 Trace *T; /* Trace to assemble. */
63 Trace *parent; /* Parent trace (or NULL). */
65 IRRef fuseref; /* Fusion limit (loopref, 0 or FUSE_DISABLED). */
66 IRRef sectref; /* Section base reference (loopref or 0). */
67 IRRef loopref; /* Reference of LOOP instruction (or 0). */
69 BCReg topslot; /* Number of slots for stack check (unless 0). */
70 MSize gcsteps; /* Accumulated number of GC steps (per section). */
72 MCode *mcbot; /* Bottom of reserved MCode. */
73 MCode *mctop; /* Top of generated MCode. */
74 MCode *mcloop; /* Pointer to loop MCode (or NULL). */
75 MCode *invmcp; /* Points to invertible loop branch (or NULL). */
76 MCode *testmcp; /* Pending opportunity to remove test r,r. */
77 MCode *realign; /* Realign loop if not NULL. */
79 IRRef1 phireg[RID_MAX]; /* PHI register references. */
80 uint16_t parentmap[LJ_MAX_JSLOTS]; /* Parent slot to RegSP map. */
81 } ASMState;
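/* Note on the MCode pointers above: machine code is generated backwards,
** from the last IR instruction towards the first, so as->mcp starts at
** as->mctop and grows *down* towards as->mclim/as->mcbot. This is why the
** emit_* helpers below pre-decrement the pointer and why "emitted before"
** means "executed after" throughout this file.
*/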
83 #define IR(ref) (&as->ir[(ref)])
85 #define ASMREF_TMP1 REF_TRUE /* Temp. register. */
86 #define ASMREF_TMP2 REF_FALSE /* Temp. register. */
87 #define ASMREF_L REF_NIL /* Stores register for L. */
89 /* Check for variant to invariant references. */
90 #define iscrossref(as, ref) ((ref) < as->sectref)
92 /* Inhibit memory op fusion from variant to invariant references. */
93 #define FUSE_DISABLED (~(IRRef)0)
94 #define mayfuse(as, ref) ((ref) > as->fuseref)
95 #define neverfuse(as) (as->fuseref == FUSE_DISABLED)
96 #define opisfusableload(o) \
97 ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \
98 (o) == IR_FLOAD || (o) == IR_SLOAD || (o) == IR_XLOAD)
100 /* Instruction selection for XMM moves. */
101 #define XMM_MOVRR(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS)
102 #define XMM_MOVRM(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD)
104 /* Sparse limit checks using a red zone before the actual limit. */
105 #define MCLIM_REDZONE 64
106 #define checkmclim(as) \
107 if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as)
109 static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as)
111 lj_mcode_limiterr(as->J, (size_t)(as->mctop - as->mcp + 4*MCLIM_REDZONE));
114 /* -- Emit x86 instructions ----------------------------------------------- */
116 #define MODRM(mode, r1, r2) ((MCode)((mode)+(((r1)&7)<<3)+((r2)&7)))
118 #if LJ_64
119 #define REXRB(p, rr, rb) \
120 { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
121 if (rex != 0x40) *--(p) = rex; }
122 #define FORCE_REX 0x200
123 #define REX_64 (FORCE_REX|0x080000)
124 #else
125 #define REXRB(p, rr, rb) ((void)0)
126 #define FORCE_REX 0
127 #define REX_64 0
128 #endif
130 #define emit_i8(as, i) (*--as->mcp = (MCode)(i))
131 #define emit_i32(as, i) (*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4)
133 #define emit_x87op(as, xo) \
134 (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2)
136 /* op */
137 static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
138 MCode *p, int delta)
140 int n = (int8_t)xo;
141 #if defined(__GNUC__)
142 if (__builtin_constant_p(xo) && n == -2)
143 p[delta-2] = (MCode)(xo >> 24);
144 else if (__builtin_constant_p(xo) && n == -3)
145 *(uint16_t *)(p+delta-3) = (uint16_t)(xo >> 16);
146 else
147 #endif
148 *(uint32_t *)(p+delta-5) = (uint32_t)xo;
149 p += n + delta;
150 #if LJ_64
152 uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1);
153 if (rex != 0x40) {
154 rex |= (rr >> 16);
155 if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); }
156 *--p = (MCode)rex;
159 #else
160 UNUSED(rr); UNUSED(rb); UNUSED(rx);
161 #endif
162 return p;
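/* A minimal standalone sketch of the scheme above, under the same
** little-endian assumption: the low byte of the packed opcode constant is
** its negative length (-2/-3/-4 for 1/2/3 opcode bytes), so a blind 4-byte
** store followed by "p += n" leaves exactly the opcode bytes in front of
** the already-written ModRM byte. The DEMO_* names are hypothetical and do
** not correspond to LuaJIT's real x86Op values.
*/
#if 0  /* illustration only, not part of the assembler */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical packed opcode: opcode byte in bits 31-24, length tag -2. */
#define DEMO_OP1(b)  ((uint32_t)(((uint32_t)(b) << 24) | 0xfe))

static uint8_t *demo_emit_op(uint32_t xo, uint8_t *p)
{
  int n = (int8_t)xo;                  /* -2: one opcode byte */
  *(uint32_t *)(void *)(p - 5) = xo;   /* blind little-endian 4-byte store */
  return p + n;                        /* now points at the opcode byte */
}

int main(void)
{
  uint8_t buf[32], *p = buf + sizeof(buf);  /* grows down, like as->mcp */
  p[-1] = 0xc0 | (0 << 3) | 1;              /* ModRM for mov eax, ecx */
  p = demo_emit_op(DEMO_OP1(0x8b), p);      /* 0x8b = MOV r32, r/m32 */
  printf("%02x %02x\n", p[0], p[1]);        /* prints: 8b c1 */
  return 0;
}
#endif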
165 /* op + modrm */
166 #define emit_opm(xo, mode, rr, rb, p, delta) \
167 (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
168 emit_op((xo), (rr), (rb), 0, (p), (delta)))
170 /* op + modrm + sib */
171 #define emit_opmx(xo, mode, scale, rr, rb, rx, p) \
172 (p[-1] = MODRM((scale), (rx), (rb)), \
173 p[-2] = MODRM((mode), (rr), RID_ESP), \
174 emit_op((xo), (rr), (rb), (rx), (p), -1))
176 /* op r1, r2 */
177 static void emit_rr(ASMState *as, x86Op xo, Reg r1, Reg r2)
179 MCode *p = as->mcp;
180 as->mcp = emit_opm(xo, XM_REG, r1, r2, p, 0);
183 #if LJ_64 && defined(LUA_USE_ASSERT)
184 /* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */
185 static int32_t ptr2addr(void *p)
187 lua_assert((uintptr_t)p < (uintptr_t)0x80000000);
188 return i32ptr(p);
190 #else
191 #define ptr2addr(p) (i32ptr((p)))
192 #endif
194 /* op r, [addr] */
195 static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr)
197 MCode *p = as->mcp;
198 *(int32_t *)(p-4) = ptr2addr(addr);
199 #if LJ_64
200 p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
201 as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5);
202 #else
203 as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4);
204 #endif
207 /* op r, [base+ofs] */
208 static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs)
210 MCode *p = as->mcp;
211 x86Mode mode;
212 if (ra_hasreg(rb)) {
213 if (ofs == 0 && (rb&7) != RID_EBP) {
214 mode = XM_OFS0;
215 } else if (checki8(ofs)) {
216 *--p = (MCode)ofs;
217 mode = XM_OFS8;
218 } else {
219 p -= 4;
220 *(int32_t *)p = ofs;
221 mode = XM_OFS32;
223 if ((rb&7) == RID_ESP)
224 *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
225 } else {
226 *(int32_t *)(p-4) = ofs;
227 #if LJ_64
228 p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
229 p -= 5;
230 rb = RID_ESP;
231 #else
232 p -= 4;
233 rb = RID_EBP;
234 #endif
235 mode = XM_OFS0;
237 as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
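/* Worked examples for the mode selection above (standard ModRM rules):
**   [eax]          -> XM_OFS0, no displacement
**   [ebp]          -> XM_OFS8 with disp8 = 0, since mod=00 with rm=EBP
**                     would mean [disp32]; hence the (rb&7) != RID_EBP check
**   [eax+0x10]     -> XM_OFS8, one displacement byte
**   [eax+0x12345]  -> XM_OFS32, four displacement bytes
**   [esp+ofs]      -> needs the extra SIB byte, because rm=ESP selects SIB
**                     addressing
**   no base reg    -> absolute [disp32]; on x64 this is encoded via an SIB
**                     byte with base=EBP so it stays a plain 32-bit absolute
**                     address instead of becoming RIP-relative
*/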
240 /* op r, [base+idx*scale+ofs] */
241 static void emit_rmrxo(ASMState *as, x86Op xo, Reg rr, Reg rb, Reg rx,
242 x86Mode scale, int32_t ofs)
244 MCode *p = as->mcp;
245 x86Mode mode;
246 if (ofs == 0 && (rb&7) != RID_EBP) {
247 mode = XM_OFS0;
248 } else if (checki8(ofs)) {
249 mode = XM_OFS8;
250 *--p = (MCode)ofs;
251 } else {
252 mode = XM_OFS32;
253 p -= 4;
254 *(int32_t *)p = ofs;
256 as->mcp = emit_opmx(xo, mode, scale, rr, rb, rx, p);
259 /* op r, i */
260 static void emit_gri(ASMState *as, x86Group xg, Reg rb, int32_t i)
262 MCode *p = as->mcp;
263 if (checki8(i)) {
264 p -= 3;
265 p[2] = (MCode)i;
266 p[0] = (MCode)(xg >> 16);
267 } else {
268 p -= 6;
269 *(int32_t *)(p+2) = i;
270 p[0] = (MCode)(xg >> 8);
272 p[1] = MODRM(XM_REG, xg, rb);
273 REXRB(p, 0, rb);
274 as->mcp = p;
277 /* op [base+ofs], i */
278 static void emit_gmroi(ASMState *as, x86Group xg, Reg rb, int32_t ofs,
279 int32_t i)
281 x86Op xo;
282 if (checki8(i)) {
283 emit_i8(as, i);
284 xo = (x86Op)(((xg >> 16) << 24)+0xfe);
285 } else {
286 emit_i32(as, i);
287 xo = (x86Op)(((xg >> 8) << 24)+0xfe);
289 emit_rmro(as, xo, (Reg)xg, rb, ofs);
292 #define emit_shifti(as, xg, r, i) \
293 (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r)))
295 /* op r, rm/mrm */
296 static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb)
298 MCode *p = as->mcp;
299 x86Mode mode = XM_REG;
300 if (rb == RID_MRM) {
301 rb = as->mrm.base;
302 if (rb == RID_NONE) {
303 rb = RID_EBP;
304 mode = XM_OFS0;
305 p -= 4;
306 *(int32_t *)p = as->mrm.ofs;
307 if (as->mrm.idx != RID_NONE)
308 goto mrmidx;
309 #if LJ_64
310 *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
311 rb = RID_ESP;
312 #endif
313 } else {
314 if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) {
315 mode = XM_OFS0;
316 } else if (checki8(as->mrm.ofs)) {
317 *--p = (MCode)as->mrm.ofs;
318 mode = XM_OFS8;
319 } else {
320 p -= 4;
321 *(int32_t *)p = as->mrm.ofs;
322 mode = XM_OFS32;
324 if (as->mrm.idx != RID_NONE) {
325 mrmidx:
326 as->mcp = emit_opmx(xo, mode, as->mrm.scale, rr, rb, as->mrm.idx, p);
327 return;
329 if ((rb&7) == RID_ESP)
330 *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
333 as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
336 static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
338 if (ofs) {
339 if ((as->flags & JIT_F_LEA_AGU))
340 emit_rmro(as, XO_LEA, r, r, ofs);
341 else
342 emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs);
346 /* -- Emit moves ---------------------------------------------------------- */
348 /* Generic move between two regs. */
349 static void emit_movrr(ASMState *as, Reg r1, Reg r2)
351 emit_rr(as, r1 < RID_MAX_GPR ? XO_MOV : XMM_MOVRR(as), r1, r2);
354 /* Generic move from [base+ofs]. */
355 static void emit_movrmro(ASMState *as, Reg rr, Reg rb, int32_t ofs)
357 emit_rmro(as, rr < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), rr, rb, ofs);
360 /* mov [base+ofs], i */
361 static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
363 emit_i32(as, i);
364 emit_rmro(as, XO_MOVmi, 0, base, ofs);
367 /* mov [base+ofs], r */
368 #define emit_movtomro(as, r, base, ofs) \
369 emit_rmro(as, XO_MOVto, (r), (base), (ofs))
371 /* Get/set global_State fields. */
372 #define emit_opgl(as, xo, r, field) \
373 emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field)
374 #define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r), field)
375 #define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r), field)
376 #define emit_setgli(as, field, i) \
377 (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field))
379 /* mov r, i / xor r, r */
380 static void emit_loadi(ASMState *as, Reg r, int32_t i)
382 if (i == 0) {
383 emit_rr(as, XO_ARITH(XOg_XOR), r, r);
384 } else {
385 MCode *p = as->mcp;
386 *(int32_t *)(p-4) = i;
387 p[-5] = (MCode)(XI_MOVri+(r&7));
388 p -= 5;
389 REXRB(p, 0, r);
390 as->mcp = p;
394 /* mov r, addr */
395 #define emit_loada(as, r, addr) \
396 emit_loadi(as, (r), ptr2addr((addr)))
398 /* movsd r, [&tv->n] / xorps r, r */
399 static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
401 if (tvispzero(tv)) /* Use xor only for +0. */
402 emit_rr(as, XO_XORPS, r, r);
403 else
404 emit_rma(as, XMM_MOVRM(as), r, &tv->n);
407 /* -- Emit branches ------------------------------------------------------- */
409 /* Label for short jumps. */
410 typedef MCode *MCLabel;
412 /* jcc short target */
413 static void emit_sjcc(ASMState *as, int cc, MCLabel target)
415 MCode *p = as->mcp;
416 p[-1] = (MCode)(int8_t)(target-p);
417 p[-2] = (MCode)(XI_JCCs+(cc&15));
418 as->mcp = p - 2;
421 /* jcc short (pending target) */
422 static MCLabel emit_sjcc_label(ASMState *as, int cc)
424 MCode *p = as->mcp;
425 p[-1] = 0;
426 p[-2] = (MCode)(XI_JCCs+(cc&15));
427 as->mcp = p - 2;
428 return p;
431 /* Fixup jcc short target. */
432 static void emit_sfixup(ASMState *as, MCLabel source)
434 source[-1] = (MCode)(as->mcp-source);
437 /* Return label pointing to current PC. */
438 #define emit_label(as) ((as)->mcp)
440 /* jcc target */
441 static void emit_jcc(ASMState *as, int cc, MCode *target)
443 MCode *p = as->mcp;
444 int32_t addr = (int32_t)(target - p);
445 *(int32_t *)(p-4) = addr;
446 p[-5] = (MCode)(XI_JCCn+(cc&15));
447 p[-6] = 0x0f;
448 as->mcp = p - 6;
451 /* call target */
452 static void emit_call_(ASMState *as, MCode *target)
454 MCode *p = as->mcp;
455 *(int32_t *)(p-4) = (int32_t)(target - p);
456 p[-5] = XI_CALL;
457 as->mcp = p - 5;
460 #define emit_call(as, f) emit_call_(as, (MCode *)(void *)(f))
462 /* -- Register allocator debugging ---------------------------------------- */
464 /* #define LUAJIT_DEBUG_RA */
466 #ifdef LUAJIT_DEBUG_RA
468 #include <stdio.h>
469 #include <stdarg.h>
471 #define RIDNAME(name) #name,
472 static const char *const ra_regname[] = {
473 GPRDEF(RIDNAME)
474 FPRDEF(RIDNAME)
475 "mrm",
476 NULL
478 #undef RIDNAME
480 static char ra_dbg_buf[65536];
481 static char *ra_dbg_p;
482 static char *ra_dbg_merge;
483 static MCode *ra_dbg_mcp;
485 static void ra_dstart(void)
487 ra_dbg_p = ra_dbg_buf;
488 ra_dbg_merge = NULL;
489 ra_dbg_mcp = NULL;
492 static void ra_dflush(void)
494 fwrite(ra_dbg_buf, 1, (size_t)(ra_dbg_p-ra_dbg_buf), stdout);
495 ra_dstart();
498 static void ra_dprintf(ASMState *as, const char *fmt, ...)
500 char *p;
501 va_list argp;
502 va_start(argp, fmt);
503 p = ra_dbg_mcp == as->mcp ? ra_dbg_merge : ra_dbg_p;
504 ra_dbg_mcp = NULL;
505 p += sprintf(p, "%08x \e[36m%04d ", (uintptr_t)as->mcp, as->curins-REF_BIAS);
506 for (;;) {
507 const char *e = strchr(fmt, '$');
508 if (e == NULL) break;
509 memcpy(p, fmt, (size_t)(e-fmt));
510 p += e-fmt;
511 if (e[1] == 'r') {
512 Reg r = va_arg(argp, Reg) & RID_MASK;
513 if (r <= RID_MAX) {
514 const char *q;
515 for (q = ra_regname[r]; *q; q++)
516 *p++ = *q >= 'A' && *q <= 'Z' ? *q + 0x20 : *q;
517 } else {
518 *p++ = '?';
519 lua_assert(0);
521 } else if (e[1] == 'f' || e[1] == 'i') {
522 IRRef ref;
523 if (e[1] == 'f')
524 ref = va_arg(argp, IRRef);
525 else
526 ref = va_arg(argp, IRIns *) - as->ir;
527 if (ref >= REF_BIAS)
528 p += sprintf(p, "%04d", ref - REF_BIAS);
529 else
530 p += sprintf(p, "K%03d", REF_BIAS - ref);
531 } else if (e[1] == 's') {
532 uint32_t slot = va_arg(argp, uint32_t);
533 p += sprintf(p, "[esp+0x%x]", sps_scale(slot));
534 } else {
535 lua_assert(0);
537 fmt = e+2;
539 va_end(argp);
540 while (*fmt)
541 *p++ = *fmt++;
542 *p++ = '\e'; *p++ = '['; *p++ = 'm'; *p++ = '\n';
543 if (p > ra_dbg_buf+sizeof(ra_dbg_buf)-256) {
544 fwrite(ra_dbg_buf, 1, (size_t)(p-ra_dbg_buf), stdout);
545 p = ra_dbg_buf;
547 ra_dbg_p = p;
550 #define RA_DBG_START() ra_dstart()
551 #define RA_DBG_FLUSH() ra_dflush()
552 #define RA_DBG_REF() \
553 do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \
554 ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0)
555 #define RA_DBGX(x) ra_dprintf x
557 #else
558 #define RA_DBG_START() ((void)0)
559 #define RA_DBG_FLUSH() ((void)0)
560 #define RA_DBG_REF() ((void)0)
561 #define RA_DBGX(x) ((void)0)
562 #endif
564 /* -- Register allocator -------------------------------------------------- */
566 #define ra_free(as, r) rset_set(as->freeset, (r))
567 #define ra_modified(as, r) rset_set(as->modset, (r))
569 #define ra_used(ir) (ra_hasreg((ir)->r) || ra_hasspill((ir)->s))
571 /* Setup register allocator. */
572 static void ra_setup(ASMState *as)
574 /* Initially all regs (except the stack pointer) are free for use. */
575 as->freeset = RSET_ALL;
576 as->modset = RSET_EMPTY;
577 as->phiset = RSET_EMPTY;
578 memset(as->phireg, 0, sizeof(as->phireg));
579 memset(as->cost, 0, sizeof(as->cost));
580 as->cost[RID_ESP] = REGCOST(~0u, 0u);
583 /* Rematerialize constants. */
584 static Reg ra_rematk(ASMState *as, IRIns *ir)
586 Reg r = ir->r;
587 lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
588 ra_free(as, r);
589 ra_modified(as, r);
590 ir->r = RID_INIT; /* Do not keep any hint. */
591 RA_DBGX((as, "remat $i $r", ir, r));
592 if (ir->o == IR_KNUM) {
593 emit_loadn(as, r, ir_knum(ir));
594 } else if (ir->o == IR_BASE) {
595 ra_sethint(ir->r, RID_BASE); /* Restore BASE register hint. */
596 emit_getgl(as, r, jit_base);
597 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
598 lua_assert(irt_isnil(ir->t));
599 emit_getgl(as, r, jit_L);
600 } else {
601 lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
602 ir->o == IR_KPTR || ir->o == IR_KNULL);
603 emit_loadi(as, r, ir->i);
605 return r;
608 /* Force a spill. Allocate a new spill slot if needed. */
609 static int32_t ra_spill(ASMState *as, IRIns *ir)
611 int32_t slot = ir->s;
612 if (!ra_hasspill(slot)) {
613 if (irt_isnum(ir->t)) {
614 slot = as->evenspill;
615 as->evenspill += 2;
616 } else if (as->oddspill) {
617 slot = as->oddspill;
618 as->oddspill = 0;
619 } else {
620 slot = as->evenspill;
621 as->oddspill = slot+1;
622 as->evenspill += 2;
624 if (as->evenspill > 256)
625 lj_trace_err(as->J, LJ_TRERR_SPILLOV);
626 ir->s = (uint8_t)slot;
628 return sps_scale(slot);
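/* Example of the even/odd pairing above (raw slot indices, before
** sps_scale() turns them into byte offsets); assume evenspill is 2 and
** oddspill is 0 when the first spill happens:
**   spill a number   -> slots 2+3, evenspill becomes 4
**   spill an int/ref -> slot 4, slot 5 remembered in oddspill
**   spill an int/ref -> slot 5 (the remembered odd slot), oddspill reset
**   spill a number   -> slots 6+7
** Numbers thus always occupy an even/odd pair, while 4-byte values fill the
** odd holes first. More than 256 slots aborts the trace with
** LJ_TRERR_SPILLOV.
*/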
631 /* Release the temporarily allocated register in ASMREF_TMP1/ASMREF_TMP2. */
632 static Reg ra_releasetmp(ASMState *as, IRRef ref)
634 IRIns *ir = IR(ref);
635 Reg r = ir->r;
636 lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
637 ra_free(as, r);
638 ra_modified(as, r);
639 ir->r = RID_INIT;
640 return r;
643 /* Restore a register (marked as free). Rematerialize or force a spill. */
644 static Reg ra_restore(ASMState *as, IRRef ref)
646 IRIns *ir = IR(ref);
647 if (irref_isk(ref) || ref == REF_BASE) {
648 return ra_rematk(as, ir);
649 } else {
650 Reg r = ir->r;
651 lua_assert(ra_hasreg(r));
652 ra_free(as, r);
653 ra_modified(as, r);
654 ra_sethint(ir->r, r); /* Keep hint. */
655 RA_DBGX((as, "restore $i $r", ir, r));
656 emit_movrmro(as, r, RID_ESP, ra_spill(as, ir)); /* Force a spill. */
657 return r;
661 /* Save a register to a spill slot. */
662 static LJ_AINLINE void ra_save(ASMState *as, IRIns *ir, Reg r)
664 RA_DBGX((as, "save $i $r", ir, r));
665 emit_rmro(as, r < RID_MAX_GPR ? XO_MOVto : XO_MOVSDto,
666 r, RID_ESP, sps_scale(ir->s));
669 #define MINCOST(r) \
670 if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \
671 cost = as->cost[r]
673 /* Evict the register with the lowest cost, forcing a restore. */
674 static Reg ra_evict(ASMState *as, RegSet allow)
676 RegCost cost = ~(RegCost)0;
677 if (allow < RID2RSET(RID_MAX_GPR)) {
678 MINCOST(RID_EAX);MINCOST(RID_ECX);MINCOST(RID_EDX);MINCOST(RID_EBX);
679 MINCOST(RID_EBP);MINCOST(RID_ESI);MINCOST(RID_EDI);
680 #if LJ_64
681 MINCOST(RID_R8D);MINCOST(RID_R9D);MINCOST(RID_R10D);MINCOST(RID_R11D);
682 MINCOST(RID_R12D);MINCOST(RID_R13D);MINCOST(RID_R14D);MINCOST(RID_R15D);
683 #endif
684 } else {
685 MINCOST(RID_XMM0);MINCOST(RID_XMM1);MINCOST(RID_XMM2);MINCOST(RID_XMM3);
686 MINCOST(RID_XMM4);MINCOST(RID_XMM5);MINCOST(RID_XMM6);MINCOST(RID_XMM7);
687 #if LJ_64
688 MINCOST(RID_XMM8);MINCOST(RID_XMM9);MINCOST(RID_XMM10);MINCOST(RID_XMM11);
689 MINCOST(RID_XMM12);MINCOST(RID_XMM13);MINCOST(RID_XMM14);MINCOST(RID_XMM15);
690 #endif
692 lua_assert(allow != RSET_EMPTY);
693 lua_assert(regcost_ref(cost) >= as->T->nk && regcost_ref(cost) < as->T->nins);
694 return ra_restore(as, regcost_ref(cost));
697 /* Pick any register (marked as free). Evict on-demand. */
698 static LJ_AINLINE Reg ra_pick(ASMState *as, RegSet allow)
700 RegSet pick = as->freeset & allow;
701 if (!pick)
702 return ra_evict(as, allow);
703 else
704 return rset_picktop(pick);
707 /* Get a scratch register (marked as free). */
708 static LJ_AINLINE Reg ra_scratch(ASMState *as, RegSet allow)
710 Reg r = ra_pick(as, allow);
711 ra_modified(as, r);
712 RA_DBGX((as, "scratch $r", r));
713 return r;
716 /* Evict all registers from a set (if not free). */
717 static void ra_evictset(ASMState *as, RegSet drop)
719 as->modset |= drop;
720 drop &= ~as->freeset;
721 while (drop) {
722 Reg r = rset_picktop(drop);
723 ra_restore(as, regcost_ref(as->cost[r]));
724 rset_clear(drop, r);
725 checkmclim(as);
729 /* Allocate a register for ref from the allowed set of registers.
730 ** Note: this function assumes the ref does NOT have a register yet!
731 ** Picks an optimal register, sets the cost and marks the register as non-free.
733 static Reg ra_allocref(ASMState *as, IRRef ref, RegSet allow)
735 IRIns *ir = IR(ref);
736 RegSet pick = as->freeset & allow;
737 Reg r;
738 lua_assert(ra_noreg(ir->r));
739 if (pick) {
740 /* First check register hint from propagation or PHI. */
741 if (ra_hashint(ir->r)) {
742 r = ra_gethint(ir->r);
743 if (rset_test(pick, r)) /* Use hint register if possible. */
744 goto found;
745 /* Rematerialization is cheaper than missing a hint. */
746 if (rset_test(allow, r) && irref_isk(regcost_ref(as->cost[r]))) {
747 ra_rematk(as, IR(regcost_ref(as->cost[r])));
748 goto found;
750 RA_DBGX((as, "hintmiss $f $r", ref, r));
752 /* Invariants should preferably get unmodified registers. */
753 if (ref < as->loopref && !irt_isphi(ir->t)) {
754 if ((pick & ~as->modset))
755 pick &= ~as->modset;
756 r = rset_pickbot(pick); /* Reduce conflicts with inverse allocation. */
757 } else {
758 r = rset_picktop(pick);
760 } else {
761 r = ra_evict(as, allow);
763 found:
764 RA_DBGX((as, "alloc $f $r", ref, r));
765 ir->r = (uint8_t)r;
766 rset_clear(as->freeset, r);
767 as->cost[r] = REGCOST_REF_T(ref, irt_t(ir->t));
768 return r;
771 /* Allocate a register on-demand. */
772 static LJ_INLINE Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow)
774 Reg r = IR(ref)->r;
775 /* Note: allow is ignored if the register is already allocated. */
776 if (ra_noreg(r)) r = ra_allocref(as, ref, allow);
777 return r;
780 /* Rename register allocation and emit move. */
781 static void ra_rename(ASMState *as, Reg down, Reg up)
783 IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]);
784 IR(ref)->r = (uint8_t)up;
785 as->cost[down] = 0;
786 lua_assert((down < RID_MAX_GPR) == (up < RID_MAX_GPR));
787 lua_assert(!rset_test(as->freeset, down) && rset_test(as->freeset, up));
788 rset_set(as->freeset, down); /* 'down' is free ... */
789 rset_clear(as->freeset, up); /* ... and 'up' is now allocated. */
790 RA_DBGX((as, "rename $f $r $r", regcost_ref(as->cost[up]), down, up));
791 emit_movrr(as, down, up); /* Backwards code generation needs inverse move. */
792 if (!ra_hasspill(IR(ref)->s)) { /* Add the rename to the IR. */
793 lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno);
794 ren = tref_ref(lj_ir_emit(as->J));
795 as->ir = as->T->ir; /* The IR may have been reallocated. */
796 IR(ren)->r = (uint8_t)down;
797 IR(ren)->s = SPS_NONE;
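/* Since code is generated backwards, renaming 'down' -> 'up' means the code
** emitted so far (which executes *later*) keeps reading 'down', while code
** emitted from now on (executing earlier) uses 'up'; the inverse
** "mov down, up" bridges the two at runtime. The IR_RENAME record (old
** register in ->r, snapshot number in op2) lets exit restoration pick the
** correct register on either side of the rename; no record is added when
** the value already has a spill slot.
*/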
801 /* Pick a destination register (marked as free).
802 ** Caveat: allow is ignored if there's already a destination register.
803 ** Use ra_destreg() to get a specific register.
805 static Reg ra_dest(ASMState *as, IRIns *ir, RegSet allow)
807 Reg dest = ir->r;
808 if (ra_hasreg(dest)) {
809 ra_free(as, dest);
810 ra_modified(as, dest);
811 } else {
812 dest = ra_scratch(as, allow);
814 if (LJ_UNLIKELY(ra_hasspill(ir->s))) ra_save(as, ir, dest);
815 return dest;
818 /* Force a specific destination register (marked as free). */
819 static void ra_destreg(ASMState *as, IRIns *ir, Reg r)
821 Reg dest = ra_dest(as, ir, RID2RSET(r));
822 if (dest != r) {
823 ra_scratch(as, RID2RSET(r));
824 emit_movrr(as, dest, r);
828 /* Propagate dest register to left reference. Emit moves as needed.
829 ** This is a required fixup step for all 2-operand machine instructions.
831 static void ra_left(ASMState *as, Reg dest, IRRef lref)
833 IRIns *ir = IR(lref);
834 Reg left = ir->r;
835 if (ra_noreg(left)) {
836 if (irref_isk(lref)) {
837 if (ir->o == IR_KNUM) {
838 cTValue *tv = ir_knum(ir);
839 /* FP remat needs a load except for +0. Still better than eviction. */
840 if (tvispzero(tv) || !(as->freeset & RSET_FPR)) {
841 emit_loadn(as, dest, tv);
842 return;
844 } else {
845 lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
846 ir->o == IR_KPTR || ir->o == IR_KNULL);
847 emit_loadi(as, dest, ir->i);
848 return;
851 if (!ra_hashint(left) && !iscrossref(as, lref))
852 ra_sethint(ir->r, dest); /* Propagate register hint. */
853 left = ra_allocref(as, lref, dest < RID_MAX_GPR ? RSET_GPR : RSET_FPR);
855 /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */
856 if (dest != left) {
857 /* Use register renaming if dest is the PHI reg. */
858 if (irt_isphi(ir->t) && as->phireg[dest] == lref) {
859 ra_modified(as, left);
860 ra_rename(as, left, dest);
861 } else {
862 emit_movrr(as, dest, left);
867 /* -- Exit stubs ---------------------------------------------------------- */
869 /* Generate an exit stub group at the bottom of the reserved MCode memory. */
870 static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
872 ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
873 MCode *mxp = as->mcbot;
874 MCode *mxpstart = mxp;
875 if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
876 asm_mclimit(as);
877 /* Push low byte of exitno for each exit stub. */
878 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
879 for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
880 *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
881 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
883 /* Push the high byte of the exitno for each exit stub group. */
884 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
885 /* Store DISPATCH in ExitInfo->dispatch. Account for the two push ops. */
886 *mxp++ = XI_MOVmi;
887 *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
888 *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
889 *mxp++ = 2*sizeof(void *);
890 *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
891 /* Jump to exit handler which fills in the ExitState. */
892 *mxp++ = XI_JMP; mxp += 4;
893 *((int32_t *)(mxp-4)) = (int32_t)((MCode *)lj_vm_exit_handler - mxp);
894 /* Commit the code for this group (even if assembly fails later on). */
895 lj_mcode_commitbot(as->J, mxp);
896 as->mcbot = mxp;
897 as->mclim = as->mcbot + MCLIM_REDZONE;
898 return mxpstart;
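/* Resulting layout of one stub group (derived from the emission above),
** viewed per stub in execution order:
**
**   stub 0:   push groupofs        ; 2 bytes
**             jmp short -> common  ; 2 bytes (it precedes stub 1's push)
**   stub 1:   push groupofs+1
**             jmp short -> common
**   ...
**   stub N-1: push groupofs+N-1    ; falls through into the common tail
**   common:   push (group*EXITSTUBS_PER_GROUP)>>8
**             mov dword [esp+2*sizeof(void*)], DISPATCH
**             jmp lj_vm_exit_handler
**
** Every stub pushes the low and high byte of its exit number before the
** common tail, and stub entry points are a fixed (2+2) bytes apart, which
** is what lets exitstub_addr() compute a stub address from the exit number
** alone.
*/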
901 /* Setup all needed exit stubs. */
902 static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
904 ExitNo i;
905 if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
906 lj_trace_err(as->J, LJ_TRERR_SNAPOV);
907 for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
908 if (as->J->exitstubgroup[i] == NULL)
909 as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
912 /* -- Snapshot and guard handling ----------------------------------------- */
914 /* Can we rematerialize a KNUM instead of forcing a spill? */
915 static int asm_snap_canremat(ASMState *as)
917 Reg r;
918 for (r = RID_MIN_FPR; r < RID_MAX_FPR; r++)
919 if (irref_isk(regcost_ref(as->cost[r])))
920 return 1;
921 return 0;
924 /* Allocate registers or spill slots for refs escaping to a snapshot. */
925 static void asm_snap_alloc(ASMState *as)
927 SnapShot *snap = &as->T->snap[as->snapno];
928 IRRef2 *map = &as->T->snapmap[snap->mapofs];
929 BCReg s, nslots = snap->nslots;
930 for (s = 0; s < nslots; s++) {
931 IRRef ref = snap_ref(map[s]);
932 if (!irref_isk(ref)) {
933 IRIns *ir = IR(ref);
934 if (!ra_used(ir) && ir->o != IR_FRAME) {
935 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
936 /* Not a var-to-invar ref and got a free register (or a remat)? */
937 if ((!iscrossref(as, ref) || irt_isphi(ir->t)) &&
938 ((as->freeset & allow) ||
939 (allow == RSET_FPR && asm_snap_canremat(as)))) {
940 ra_allocref(as, ref, allow); /* Allocate a register. */
941 checkmclim(as);
942 RA_DBGX((as, "snapreg $f $r", ref, ir->r));
943 } else {
944 ra_spill(as, ir); /* Otherwise force a spill slot. */
945 RA_DBGX((as, "snapspill $f $s", ref, ir->s));
952 /* All guards for a snapshot use the same exitno. This is currently the
953 ** same as the snapshot number. Since the exact origin of the exit cannot
954 ** be determined, all guards for the same snapshot must exit with the same
955 ** RegSP mapping.
956 ** A renamed ref which has been used in a prior guard for the same snapshot
957 ** would cause an inconsistency. The easy way out is to force a spill slot.
959 static int asm_snap_checkrename(ASMState *as, IRRef ren)
961 SnapShot *snap = &as->T->snap[as->snapno];
962 IRRef2 *map = &as->T->snapmap[snap->mapofs];
963 BCReg s, nslots = snap->nslots;
964 for (s = 0; s < nslots; s++) {
965 IRRef ref = snap_ref(map[s]);
966 if (ref == ren) {
967 IRIns *ir = IR(ref);
968 ra_spill(as, ir); /* Register renamed, so force a spill slot. */
969 RA_DBGX((as, "snaprensp $f $s", ref, ir->s));
970 return 1; /* Found. */
973 return 0; /* Not found. */
976 /* Prepare snapshot for next guard instruction. */
977 static void asm_snap_prep(ASMState *as)
979 if (as->curins < as->snapref) {
980 do {
981 lua_assert(as->snapno != 0);
982 as->snapno--;
983 as->snapref = as->T->snap[as->snapno].ref;
984 } while (as->curins < as->snapref);
985 asm_snap_alloc(as);
986 as->snaprename = as->T->nins;
987 } else {
988 /* Process any renames above the highwater mark. */
989 for (; as->snaprename < as->T->nins; as->snaprename++) {
990 IRIns *ir = IR(as->snaprename);
991 if (asm_snap_checkrename(as, ir->op1))
992 ir->op2 = REF_BIAS-1; /* Kill rename. */
997 /* Emit conditional branch to exit for guard.
998 ** It's important to emit this *after* all registers have been allocated,
999 ** because rematerializations may invalidate the flags.
1001 static void asm_guardcc(ASMState *as, int cc)
1003 MCode *target = exitstub_addr(as->J, as->snapno);
1004 MCode *p = as->mcp;
1005 if (LJ_UNLIKELY(p == as->invmcp)) {
1006 as->loopinv = 1;
1007 *(int32_t *)(p+1) = target - (p+5);
1008 target = p;
1009 cc ^= 1;
1010 if (as->realign) {
1011 emit_sjcc(as, cc, target);
1012 return;
1015 emit_jcc(as, cc, target);
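/* Loop branch inversion: when the guard lands directly in front of the
** loop's backward jmp (p == as->invmcp), that jmp is repointed at the exit
** stub and the guard is emitted with the inverted condition, to be
** retargeted at the loop head by the later loop fixup. The hot path (guard
** holds, loop continues) then takes a single taken jcc instead of a
** fall-through jcc plus a taken jmp.
*/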
1018 /* -- Memory operand fusion ----------------------------------------------- */
1020 /* Arch-specific field offsets. */
1021 static const uint8_t field_ofs[IRFL__MAX+1] = {
1022 #define FLOFS(name, ofs) (uint8_t)(ofs),
1023 IRFLDEF(FLOFS)
1024 #undef FLOFS
1028 /* Limit linear search to this distance. Avoids O(n^2) behavior. */
1029 #define CONFLICT_SEARCH_LIM 15
1031 /* Check if there's no conflicting instruction between curins and ref. */
1032 static int noconflict(ASMState *as, IRRef ref, IROp conflict)
1034 IRIns *ir = as->ir;
1035 IRRef i = as->curins;
1036 if (i > ref + CONFLICT_SEARCH_LIM)
1037 return 0; /* Give up, ref is too far away. */
1038 while (--i > ref)
1039 if (ir[i].o == conflict)
1040 return 0; /* Conflict found. */
1041 return 1; /* Ok, no conflict. */
1044 /* Fuse array reference into memory operand. */
1045 static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
1047 IRIns *irb = IR(ir->op1);
1048 IRIns *ira, *irx;
1049 lua_assert(ir->o == IR_AREF);
1050 lua_assert(irb->o == IR_FLOAD && irb->op2 == IRFL_TAB_ARRAY);
1051 ira = IR(irb->op1);
1052 if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
1053 noconflict(as, irb->op1, IR_NEWREF)) {
1054 /* We can avoid the FLOAD of t->array for colocated arrays. */
1055 as->mrm.base = (uint8_t)ra_alloc1(as, irb->op1, allow); /* Table obj. */
1056 as->mrm.ofs = -(int32_t)(ira->op1*sizeof(TValue)); /* Ofs to colo array. */
1057 } else {
1058 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); /* Array base. */
1059 as->mrm.ofs = 0;
1061 irx = IR(ir->op2);
1062 if (irref_isk(ir->op2)) {
1063 as->mrm.ofs += 8*irx->i;
1064 as->mrm.idx = RID_NONE;
1065 } else {
1066 rset_clear(allow, as->mrm.base);
1067 as->mrm.scale = XM_SCALE8;
1068 /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
1069 ** Doesn't help much without ABCelim, but reduces register pressure.
1071 if (mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
1072 irx->o == IR_ADD && irref_isk(irx->op2)) {
1073 as->mrm.ofs += 8*IR(irx->op2)->i;
1074 as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
1075 } else {
1076 as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
1081 /* Fuse array/hash/upvalue reference into memory operand.
1082 ** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
1083 ** pass the final allow mask, excluding any GPRs used for other inputs.
1084 ** In particular: 2-operand GPR instructions need to call ra_dest() first!
1086 static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
1088 IRIns *ir = IR(ref);
1089 if (ra_noreg(ir->r)) {
1090 switch ((IROp)ir->o) {
1091 case IR_AREF:
1092 if (mayfuse(as, ref)) {
1093 asm_fusearef(as, ir, allow);
1094 return;
1096 break;
1097 case IR_HREFK:
1098 if (mayfuse(as, ref)) {
1099 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
1100 as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
1101 as->mrm.idx = RID_NONE;
1102 return;
1104 break;
1105 case IR_UREFC:
1106 if (irref_isk(ir->op1)) {
1107 GCfunc *fn = ir_kfunc(IR(ir->op1));
1108 GCupval *uv = &gcref(fn->l.uvptr[ir->op2])->uv;
1109 as->mrm.ofs = ptr2addr(&uv->tv);
1110 as->mrm.base = as->mrm.idx = RID_NONE;
1111 return;
1113 break;
1114 default:
1115 lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO);
1116 break;
1119 as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
1120 as->mrm.ofs = 0;
1121 as->mrm.idx = RID_NONE;
1124 /* Fuse FLOAD/FREF reference into memory operand. */
1125 static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
1127 lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
1128 as->mrm.ofs = field_ofs[ir->op2];
1129 as->mrm.idx = RID_NONE;
1130 if (irref_isk(ir->op1)) {
1131 as->mrm.ofs += IR(ir->op1)->i;
1132 as->mrm.base = RID_NONE;
1133 } else {
1134 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
1138 /* Fuse string reference into memory operand. */
1139 static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
1141 IRIns *irr;
1142 lua_assert(ir->o == IR_STRREF);
1143 as->mrm.base = as->mrm.idx = RID_NONE;
1144 as->mrm.scale = XM_SCALE1;
1145 as->mrm.ofs = sizeof(GCstr);
1146 if (irref_isk(ir->op1)) {
1147 as->mrm.ofs += IR(ir->op1)->i;
1148 } else {
1149 Reg r = ra_alloc1(as, ir->op1, allow);
1150 rset_clear(allow, r);
1151 as->mrm.base = (uint8_t)r;
1153 irr = IR(ir->op2);
1154 if (irref_isk(ir->op2)) {
1155 as->mrm.ofs += irr->i;
1156 } else {
1157 Reg r;
1158 /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
1159 if (mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
1160 as->mrm.ofs += IR(irr->op2)->i;
1161 r = ra_alloc1(as, irr->op1, allow);
1162 } else {
1163 r = ra_alloc1(as, ir->op2, allow);
1165 if (as->mrm.base == RID_NONE)
1166 as->mrm.base = (uint8_t)r;
1167 else
1168 as->mrm.idx = (uint8_t)r;
1172 static void asm_fusexref(ASMState *as, IRIns *ir, RegSet allow)
1174 if (ir->o == IR_KPTR) {
1175 as->mrm.ofs = ir->i;
1176 as->mrm.base = as->mrm.idx = RID_NONE;
1177 } else {
1178 lua_assert(ir->o == IR_STRREF);
1179 asm_fusestrref(as, ir, allow);
1183 /* Fuse load into memory operand. */
1184 static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
1186 IRIns *ir = IR(ref);
1187 if (ra_hasreg(ir->r)) {
1188 if (allow != RSET_EMPTY) return ir->r; /* Fast path. */
1189 fusespill:
1190 /* Force a spill if only memory operands are allowed (asm_x87load). */
1191 as->mrm.base = RID_ESP;
1192 as->mrm.ofs = ra_spill(as, ir);
1193 as->mrm.idx = RID_NONE;
1194 return RID_MRM;
1196 if (ir->o == IR_KNUM) {
1197 RegSet avail = as->freeset & ~as->modset & RSET_FPR;
1198 lua_assert(allow != RSET_EMPTY);
1199 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */
1200 as->mrm.ofs = ptr2addr(ir_knum(ir));
1201 as->mrm.base = as->mrm.idx = RID_NONE;
1202 return RID_MRM;
1204 } else if (mayfuse(as, ref)) {
1205 RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
1206 if (ir->o == IR_SLOAD) {
1207 if (!irt_isint(ir->t) && !(ir->op2 & IRSLOAD_PARENT)) {
1208 as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
1209 as->mrm.ofs = 8*((int32_t)ir->op1-1);
1210 as->mrm.idx = RID_NONE;
1211 return RID_MRM;
1213 } else if (ir->o == IR_FLOAD) {
1214 /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
1215 if ((irt_isint(ir->t) || irt_isaddr(ir->t)) &&
1216 noconflict(as, ref, IR_FSTORE)) {
1217 asm_fusefref(as, ir, xallow);
1218 return RID_MRM;
1220 } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
1221 if (noconflict(as, ref, ir->o + IRDELTA_L2S)) {
1222 asm_fuseahuref(as, ir->op1, xallow);
1223 return RID_MRM;
1225 } else if (ir->o == IR_XLOAD) {
1226 /* Generic fusion is only ok for 32 bit operand (but see asm_comp).
1227 ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
1229 if (irt_isint(ir->t) || irt_isaddr(ir->t)) {
1230 asm_fusexref(as, IR(ir->op1), xallow);
1231 return RID_MRM;
1235 if (!(as->freeset & allow) &&
1236 (allow == RSET_EMPTY || ra_hasspill(ir->s) || ref < as->loopref))
1237 goto fusespill;
1238 return ra_allocref(as, ref, allow);
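/* Fusion example: for "x = y + a[i]" the ALOAD of a[i] would normally need
** its own register, but if the load is not already live in a register,
** lies inside the fusion window (mayfuse) and is not crossed by a
** conflicting store (noconflict), asm_fuseload() returns RID_MRM with
** as->mrm describing [array+idx*8(+ofs)], and emit_mrm() folds it straight
** into the arithmetic op, e.g. "addsd xmm0, [eax+ecx*8]" instead of a
** separate load into a scratch register.
*/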
1241 /* -- Calls --------------------------------------------------------------- */
1243 /* Generate a call to a C function. */
1244 static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
1246 RegSet allow = RSET_ALL;
1247 uint32_t n, nargs = CCI_NARGS(ci);
1248 int32_t ofs = 0;
1249 lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL))); /* Avoid stack adj. */
1250 emit_call(as, ci->func);
1251 for (n = 0; n < nargs; n++) { /* Setup args. */
1252 #if LJ_64
1253 #error "NYI: 64 bit mode call argument setup"
1254 #endif
1255 IRIns *ir = IR(args[n]);
1256 if (irt_isnum(ir->t)) {
1257 if ((ofs & 4) && irref_isk(args[n])) {
1258 /* Split stores for unaligned FP consts. */
1259 emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
1260 emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
1261 } else {
1262 Reg r;
1263 if ((allow & RSET_FPR) == RSET_EMPTY)
1264 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
1265 r = ra_alloc1(as, args[n], allow & RSET_FPR);
1266 allow &= ~RID2RSET(r);
1267 emit_rmro(as, XO_MOVSDto, r, RID_ESP, ofs);
1269 ofs += 8;
1270 } else {
1271 if ((ci->flags & CCI_FASTCALL) && n < 2) {
1272 Reg r = n == 0 ? RID_ECX : RID_EDX;
1273 if (args[n] < ASMREF_TMP1) {
1274 emit_loadi(as, r, ir->i);
1275 } else {
1276 lua_assert(rset_test(as->freeset, r)); /* Must have been evicted. */
1277 allow &= ~RID2RSET(r);
1278 if (ra_hasreg(ir->r))
1279 emit_movrr(as, r, ir->r);
1280 else
1281 ra_allocref(as, args[n], RID2RSET(r));
1283 } else {
1284 if (args[n] < ASMREF_TMP1) {
1285 emit_movmroi(as, RID_ESP, ofs, ir->i);
1286 } else {
1287 Reg r;
1288 if ((allow & RSET_GPR) == RSET_EMPTY)
1289 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
1290 r = ra_alloc1(as, args[n], allow & RSET_GPR);
1291 allow &= ~RID2RSET(r);
1292 emit_movtomro(as, r, RID_ESP, ofs);
1294 ofs += 4;
1300 /* Setup result reg/sp for call. Evict scratch regs. */
1301 static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
1303 RegSet drop = RSET_SCRATCH;
1304 if ((ci->flags & CCI_NOFPRCLOBBER))
1305 drop &= ~RSET_FPR;
1306 if (ra_hasreg(ir->r))
1307 rset_clear(drop, ir->r); /* Dest reg handled below. */
1308 ra_evictset(as, drop); /* Evictions must be performed first. */
1309 if (ra_used(ir)) {
1310 if (irt_isnum(ir->t)) {
1311 int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
1312 #if LJ_64
1313 if ((ci->flags & CCI_CASTU64)) {
1314 Reg dest = ir->r;
1315 if (ra_hasreg(dest)) {
1316 ra_free(as, dest);
1317 ra_modified(as, dest);
1318 emit_rr(as, XO_MOVD, dest|REX_64, RID_RET); /* Really MOVQ. */
1319 } else {
1320 emit_movrmro(as, RID_RET, RID_ESP, ofs);
1322 } else {
1323 ra_destreg(as, ir, RID_FPRET);
1325 #else
1326 /* Number result is in x87 st0 for x86 calling convention. */
1327 Reg dest = ir->r;
1328 if (ra_hasreg(dest)) {
1329 ra_free(as, dest);
1330 ra_modified(as, dest);
1331 emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
1333 if ((ci->flags & CCI_CASTU64)) {
1334 emit_movtomro(as, RID_RET, RID_ESP, ofs);
1335 emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
1336 } else {
1337 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
1339 #endif
1340 } else {
1341 lua_assert(!irt_ispri(ir->t));
1342 ra_destreg(as, ir, RID_RET);
1347 /* Collect arguments from CALL* and ARG instructions. */
1348 static void asm_collectargs(ASMState *as, IRIns *ir,
1349 const CCallInfo *ci, IRRef *args)
1351 uint32_t n = CCI_NARGS(ci);
1352 lua_assert(n <= CCI_NARGS_MAX);
1353 if ((ci->flags & CCI_L)) { *args++ = ASMREF_L; n--; }
1354 while (n-- > 1) {
1355 ir = IR(ir->op1);
1356 lua_assert(ir->o == IR_CARG);
1357 args[n] = ir->op2;
1359 args[0] = ir->op1;
1360 lua_assert(IR(ir->op1)->o != IR_CARG);
1363 static void asm_call(ASMState *as, IRIns *ir)
1365 IRRef args[CCI_NARGS_MAX];
1366 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
1367 asm_collectargs(as, ir, ci, args);
1368 asm_setupresult(as, ir, ci);
1369 asm_gencall(as, ci, args);
1372 /* -- Type conversions ---------------------------------------------------- */
1374 static void asm_tonum(ASMState *as, IRIns *ir)
1376 Reg dest = ra_dest(as, ir, RSET_FPR);
1377 Reg left = asm_fuseload(as, ir->op1, RSET_GPR);
1378 emit_mrm(as, XO_CVTSI2SD, dest, left);
1379 if (!(as->flags & JIT_F_SPLIT_XMM))
1380 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
1383 static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
1385 Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
1386 Reg dest = ra_dest(as, ir, RSET_GPR);
1387 asm_guardcc(as, CC_P);
1388 asm_guardcc(as, CC_NE);
1389 emit_rr(as, XO_UCOMISD, left, tmp);
1390 emit_rr(as, XO_CVTSI2SD, tmp, dest);
1391 if (!(as->flags & JIT_F_SPLIT_XMM))
1392 emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
1393 emit_rr(as, XO_CVTTSD2SI, dest, left);
1394 /* Can't fuse since left is needed twice. */
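/* asm_tointg implements a checked double->int conversion, roughly:
**   int32_t i = (int32_t)n;         // cvttsd2si (truncating)
**   if ((lua_Number)i != n) exit;   // cvtsi2sd + ucomisd, CC_NE guard
** with the CC_P guard catching the unordered (NaN) case, so only doubles
** whose value is exactly a 32-bit integer pass.
*/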
1397 static void asm_toint(ASMState *as, IRIns *ir)
1399 Reg dest = ra_dest(as, ir, RSET_GPR);
1400 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
1401 emit_mrm(as, XO_CVTSD2SI, dest, left);
1404 static void asm_tobit(ASMState *as, IRIns *ir)
1406 Reg dest = ra_dest(as, ir, RSET_GPR);
1407 Reg tmp = ra_noreg(IR(ir->op1)->r) ?
1408 ra_alloc1(as, ir->op1, RSET_FPR) :
1409 ra_scratch(as, RSET_FPR);
1410 Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
1411 emit_rr(as, XO_MOVDto, tmp, dest);
1412 emit_mrm(as, XO_ADDSD, tmp, right);
1413 ra_left(as, tmp, ir->op1);
1416 static void asm_strto(ASMState *as, IRIns *ir)
1418 /* Force a spill slot for the destination register (if any). */
1419 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_tonum];
1420 IRRef args[2];
1421 RegSet drop = RSET_SCRATCH;
1422 if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
1423 rset_set(drop, ir->r); /* WIN64 doesn't spill all FPRs. */
1424 ra_evictset(as, drop);
1425 asm_guardcc(as, CC_E);
1426 emit_rr(as, XO_TEST, RID_RET, RID_RET);
1427 args[0] = ir->op1;
1428 args[1] = ASMREF_TMP1;
1429 asm_gencall(as, ci, args);
1430 /* Store the result to the spill slot or slots SPS_TEMP1/2. */
1431 emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
1432 RID_ESP, sps_scale(ir->s));
1435 static void asm_tostr(ASMState *as, IRIns *ir)
1437 IRIns *irl = IR(ir->op1);
1438 IRRef args[2];
1439 args[0] = ASMREF_L;
1440 as->gcsteps++;
1441 if (irt_isnum(irl->t)) {
1442 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
1443 args[1] = ASMREF_TMP1;
1444 asm_setupresult(as, ir, ci);
1445 asm_gencall(as, ci, args);
1446 emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
1447 RID_ESP, ra_spill(as, irl));
1448 } else {
1449 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
1450 args[1] = ir->op1;
1451 asm_setupresult(as, ir, ci);
1452 asm_gencall(as, ci, args);
1456 /* -- Memory references --------------------------------------------------- */
1458 static void asm_aref(ASMState *as, IRIns *ir)
1460 Reg dest = ra_dest(as, ir, RSET_GPR);
1461 asm_fusearef(as, ir, RSET_GPR);
1462 if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
1463 emit_mrm(as, XO_LEA, dest, RID_MRM);
1464 else if (as->mrm.base != dest)
1465 emit_rr(as, XO_MOV, dest, as->mrm.base);
1468 /* Must match with hashkey() and hashrot() in lj_tab.c. */
1469 static uint32_t ir_khash(IRIns *ir)
1471 uint32_t lo, hi;
1472 if (irt_isstr(ir->t)) {
1473 return ir_kstr(ir)->hash;
1474 } else if (irt_isnum(ir->t)) {
1475 lo = ir_knum(ir)->u32.lo;
1476 hi = ir_knum(ir)->u32.hi & 0x7fffffff;
1477 } else if (irt_ispri(ir->t)) {
1478 lua_assert(!irt_isnil(ir->t));
1479 return irt_type(ir->t)-IRT_FALSE;
1480 } else {
1481 lua_assert(irt_isgcv(ir->t));
1482 lo = u32ptr(ir_kgc(ir));
1483 hi = lo - 0x04c11db7;
1485 lo ^= hi; hi = lj_rol(hi, 14);
1486 lo -= hi; hi = lj_rol(hi, 5);
1487 hi ^= lo; hi -= lj_rol(lo, 27);
1488 return hi;
1491 /* Merge NE(HREF, niltv) check. */
1492 static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
1494 /* Assumes nothing else generates NE of HREF. */
1495 if (ir[1].o == IR_NE && ir[1].op1 == as->curins) {
1496 if (LJ_64 && *as->mcp != XI_ARITHi)
1497 as->mcp += 7+6;
1498 else
1499 as->mcp += 6+6; /* Kill cmp reg, imm32 + jz exit. */
1500 return as->mcp + *(int32_t *)(as->mcp-4); /* Return exit address. */
1502 return NULL;
1505 /* Inlined hash lookup. Specialized for key type and for const keys.
1506 ** The equivalent C code is:
1507 ** Node *n = hashkey(t, key);
1508 ** do {
1509 ** if (lj_obj_equal(&n->key, key)) return &n->val;
1510 ** } while ((n = nextnode(n)));
1511 ** return niltv(L);
1513 static void asm_href(ASMState *as, IRIns *ir)
1515 MCode *nilexit = merge_href_niltv(as, ir); /* Do this before any restores. */
1516 RegSet allow = RSET_GPR;
1517 Reg dest = ra_dest(as, ir, allow);
1518 Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
1519 Reg key = RID_NONE, tmp = RID_NONE;
1520 IRIns *irkey = IR(ir->op2);
1521 int isk = irref_isk(ir->op2);
1522 IRType1 kt = irkey->t;
1523 uint32_t khash;
1524 MCLabel l_end, l_loop, l_next;
1526 if (!isk) {
1527 rset_clear(allow, tab);
1528 key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
1529 if (!irt_isstr(kt))
1530 tmp = ra_scratch(as, rset_exclude(allow, key));
1533 /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
1534 l_end = emit_label(as);
1535 if (nilexit)
1536 emit_jcc(as, CC_E, nilexit); /* XI_JMP is not found by lj_asm_patchexit. */
1537 else
1538 emit_loada(as, dest, niltvg(J2G(as->J)));
1540 /* Follow hash chain until the end. */
1541 l_loop = emit_sjcc_label(as, CC_NZ);
1542 emit_rr(as, XO_TEST, dest, dest);
1543 emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
1544 l_next = emit_label(as);
1546 /* Type and value comparison. */
1547 emit_sjcc(as, CC_E, l_end);
1548 if (irt_isnum(kt)) {
1549 if (isk) {
1550 /* Assumes -0.0 is already canonicalized to +0.0. */
1551 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
1552 (int32_t)ir_knum(irkey)->u32.lo);
1553 emit_sjcc(as, CC_NE, l_next);
1554 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
1555 (int32_t)ir_knum(irkey)->u32.hi);
1556 } else {
1557 emit_sjcc(as, CC_P, l_next);
1558 emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
1559 emit_sjcc(as, CC_A, l_next);
1560 /* The type check avoids NaN penalties and complaints from Valgrind. */
1561 emit_i8(as, ~IRT_NUM);
1562 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
1564 } else {
1565 if (!irt_ispri(kt)) {
1566 lua_assert(irt_isaddr(kt));
1567 if (isk)
1568 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
1569 ptr2addr(ir_kgc(irkey)));
1570 else
1571 emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
1572 emit_sjcc(as, CC_NE, l_next);
1574 lua_assert(!irt_isnil(kt));
1575 emit_i8(as, ~irt_type(kt));
1576 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
1578 emit_sfixup(as, l_loop);
1579 checkmclim(as);
1581 /* Load main position relative to tab->node into dest. */
1582 khash = isk ? ir_khash(irkey) : 1;
1583 if (khash == 0) {
1584 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
1585 } else {
1586 emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
1587 if ((as->flags & JIT_F_PREFER_IMUL)) {
1588 emit_i8(as, sizeof(Node));
1589 emit_rr(as, XO_IMULi8, dest, dest);
1590 } else {
1591 emit_shifti(as, XOg_SHL, dest, 3);
1592 emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
1594 if (isk) {
1595 emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
1596 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
1597 } else if (irt_isstr(kt)) {
1598 emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
1599 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
1600 } else { /* Must match with hashrot() in lj_tab.c. */
1601 emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
1602 emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
1603 emit_shifti(as, XOg_ROL, tmp, 27);
1604 emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
1605 emit_shifti(as, XOg_ROL, dest, 5);
1606 emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
1607 emit_shifti(as, XOg_ROL, dest, 14);
1608 emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
1609 if (irt_isnum(kt)) {
1610 emit_rmro(as, XO_ARITH(XOg_AND), dest, RID_ESP, ra_spill(as, irkey)+4);
1611 emit_loadi(as, dest, 0x7fffffff);
1612 emit_rr(as, XO_MOVDto, key, tmp);
1613 } else {
1614 emit_rr(as, XO_MOV, tmp, key);
1615 emit_rmro(as, XO_LEA, dest, key, -0x04c11db7);
1621 static void asm_hrefk(ASMState *as, IRIns *ir)
1623 IRIns *kslot = IR(ir->op2);
1624 IRIns *irkey = IR(kslot->op1);
1625 int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
1626 Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
1627 Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
1628 MCLabel l_exit;
1629 lua_assert(ofs % sizeof(Node) == 0);
1630 if (ra_hasreg(dest)) {
1631 if (ofs != 0) {
1632 if (dest == node && !(as->flags & JIT_F_LEA_AGU))
1633 emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
1634 else
1635 emit_rmro(as, XO_LEA, dest, node, ofs);
1636 } else if (dest != node) {
1637 emit_rr(as, XO_MOV, dest, node);
1640 asm_guardcc(as, CC_NE);
1641 l_exit = emit_label(as);
1642 if (irt_isnum(irkey->t)) {
1643 /* Assumes -0.0 is already canonicalized to +0.0. */
1644 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1645 ofs + (int32_t)offsetof(Node, key.u32.lo),
1646 (int32_t)ir_knum(irkey)->u32.lo);
1647 emit_sjcc(as, CC_NE, l_exit);
1648 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1649 ofs + (int32_t)offsetof(Node, key.u32.hi),
1650 (int32_t)ir_knum(irkey)->u32.hi);
1651 } else {
1652 if (!irt_ispri(irkey->t)) {
1653 lua_assert(irt_isgcv(irkey->t));
1654 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1655 ofs + (int32_t)offsetof(Node, key.gcr),
1656 ptr2addr(ir_kgc(irkey)));
1657 emit_sjcc(as, CC_NE, l_exit);
1659 lua_assert(!irt_isnil(irkey->t));
1660 emit_i8(as, ~irt_type(irkey->t));
1661 emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
1662 ofs + (int32_t)offsetof(Node, key.it));
1666 static void asm_newref(ASMState *as, IRIns *ir)
1668 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
1669 IRRef args[3];
1670 IRIns *irkey;
1671 Reg tmp;
1672 args[0] = ASMREF_L;
1673 args[1] = ir->op1;
1674 args[2] = ASMREF_TMP1;
1675 asm_setupresult(as, ir, ci);
1676 asm_gencall(as, ci, args);
1677 tmp = ra_releasetmp(as, ASMREF_TMP1);
1678 irkey = IR(ir->op2);
1679 if (irt_isnum(irkey->t)) {
1680 /* For numbers use the constant itself or a spill slot as a TValue. */
1681 if (irref_isk(ir->op2))
1682 emit_loada(as, tmp, ir_knum(irkey));
1683 else
1684 emit_rmro(as, XO_LEA, tmp, RID_ESP, ra_spill(as, irkey));
1685 } else {
1686 /* Otherwise use g->tmptv to hold the TValue. */
1687 if (!irref_isk(ir->op2)) {
1688 Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
1689 emit_movtomro(as, src, tmp, 0);
1690 } else if (!irt_ispri(irkey->t)) {
1691 emit_movmroi(as, tmp, 0, irkey->i);
1693 emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
1694 emit_loada(as, tmp, &J2G(as->J)->tmptv);
1698 static void asm_uref(ASMState *as, IRIns *ir)
1700 /* NYI: Check that UREFO is still open and not aliasing a slot. */
1701 if (ra_used(ir)) {
1702 Reg dest = ra_dest(as, ir, RSET_GPR);
1703 if (irref_isk(ir->op1)) {
1704 GCfunc *fn = ir_kfunc(IR(ir->op1));
1705 TValue **v = &gcref(fn->l.uvptr[ir->op2])->uv.v;
1706 emit_rma(as, XO_MOV, dest, v);
1707 } else {
1708 Reg uv = ra_scratch(as, RSET_GPR);
1709 Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
1710 if (ir->o == IR_UREFC) {
1711 emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
1712 asm_guardcc(as, CC_NE);
1713 emit_i8(as, 1);
1714 emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
1715 } else {
1716 emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
1718 emit_rmro(as, XO_MOV, uv, func,
1719 (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)ir->op2);
1724 static void asm_fref(ASMState *as, IRIns *ir)
1726 Reg dest = ra_dest(as, ir, RSET_GPR);
1727 asm_fusefref(as, ir, RSET_GPR);
1728 emit_mrm(as, XO_LEA, dest, RID_MRM);
1731 static void asm_strref(ASMState *as, IRIns *ir)
1733 Reg dest = ra_dest(as, ir, RSET_GPR);
1734 asm_fusestrref(as, ir, RSET_GPR);
1735 if (as->mrm.base == RID_NONE)
1736 emit_loadi(as, dest, as->mrm.ofs);
1737 else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
1738 emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
1739 else
1740 emit_mrm(as, XO_LEA, dest, RID_MRM);
1743 /* -- Loads and stores ---------------------------------------------------- */
1745 static void asm_fxload(ASMState *as, IRIns *ir)
1747 Reg dest = ra_dest(as, ir, RSET_GPR);
1748 x86Op xo;
1749 if (ir->o == IR_FLOAD)
1750 asm_fusefref(as, ir, RSET_GPR);
1751 else
1752 asm_fusexref(as, IR(ir->op1), RSET_GPR);
1753 /* ir->op2 is ignored -- unaligned loads are ok on x86. */
1754 switch (irt_type(ir->t)) {
1755 case IRT_I8: xo = XO_MOVSXb; break;
1756 case IRT_U8: xo = XO_MOVZXb; break;
1757 case IRT_I16: xo = XO_MOVSXw; break;
1758 case IRT_U16: xo = XO_MOVZXw; break;
1759 default:
1760 lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
1761 xo = XO_MOV;
1762 break;
1764 emit_mrm(as, xo, dest, RID_MRM);
1767 static void asm_fstore(ASMState *as, IRIns *ir)
1769 RegSet allow = RSET_GPR;
1770 Reg src = RID_NONE;
1771 /* The IRT_I16/IRT_U16 stores should never be simplified for constant
1772 ** values since mov word [mem], imm16 has a length-changing prefix.
1774 if (!irref_isk(ir->op2) || irt_isi16(ir->t) || irt_isu16(ir->t)) {
1775 RegSet allow8 = (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
1776 src = ra_alloc1(as, ir->op2, allow8);
1777 rset_clear(allow, src);
1779 asm_fusefref(as, IR(ir->op1), allow);
1780 if (ra_hasreg(src)) {
1781 x86Op xo;
1782 switch (irt_type(ir->t)) {
1783 case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
1784 case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
1785 default:
1786 lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
1787 xo = XO_MOVto;
1788 break;
1790 emit_mrm(as, xo, src, RID_MRM);
1791 } else {
1792 if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
1793 emit_i8(as, IR(ir->op2)->i);
1794 emit_mrm(as, XO_MOVmib, 0, RID_MRM);
1795 } else {
1796 lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
1797 emit_i32(as, IR(ir->op2)->i);
1798 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1803 static void asm_ahuload(ASMState *as, IRIns *ir)
1805 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
1806 lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t));
1807 if (ra_used(ir)) {
1808 Reg dest = ra_dest(as, ir, allow);
1809 asm_fuseahuref(as, ir->op1, RSET_GPR);
1810 emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
1811 } else {
1812 asm_fuseahuref(as, ir->op1, RSET_GPR);
1814 /* Always do the type check, even if the load result is unused. */
1815 asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE);
1816 emit_i8(as, ~irt_type(ir->t));
1817 as->mrm.ofs += 4;
1818 emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
1821 static void asm_ahustore(ASMState *as, IRIns *ir)
1823 if (irt_isnum(ir->t)) {
1824 Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
1825 asm_fuseahuref(as, ir->op1, RSET_GPR);
1826 emit_mrm(as, XO_MOVSDto, src, RID_MRM);
1827 } else {
1828 IRIns *irr = IR(ir->op2);
1829 RegSet allow = RSET_GPR;
1830 Reg src = RID_NONE;
1831 if (!irref_isk(ir->op2)) {
1832 src = ra_alloc1(as, ir->op2, allow);
1833 rset_clear(allow, src);
1835 asm_fuseahuref(as, ir->op1, allow);
1836 if (ra_hasreg(src)) {
1837 emit_mrm(as, XO_MOVto, src, RID_MRM);
1838 } else if (!irt_ispri(irr->t)) {
1839 lua_assert(irt_isaddr(ir->t));
1840 emit_i32(as, irr->i);
1841 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1843 as->mrm.ofs += 4;
1844 emit_i32(as, (int32_t)~irt_type(ir->t));
1845 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1849 static void asm_sload(ASMState *as, IRIns *ir)
1851 int32_t ofs = 8*((int32_t)ir->op1-1);
1852 IRType1 t = ir->t;
1853 Reg base;
1854 lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */
1855 if (irt_isint(t)) {
1856 Reg left = ra_scratch(as, RSET_FPR);
1857 asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */
1858 base = ra_alloc1(as, REF_BASE, RSET_GPR);
1859 emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
1860 t.irt = IRT_NUM; /* Continue with a regular number type check. */
1861 } else if (ra_used(ir)) {
1862 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
1863 Reg dest = ra_dest(as, ir, allow);
1864 lua_assert(irt_isnum(ir->t) || irt_isaddr(ir->t));
1865 base = ra_alloc1(as, REF_BASE, RSET_GPR);
1866 emit_movrmro(as, dest, base, ofs);
1867 } else {
1868 if (!irt_isguard(ir->t))
1869 return; /* No type check: avoid base alloc. */
1870 base = ra_alloc1(as, REF_BASE, RSET_GPR);
1872 if (irt_isguard(ir->t)) {
1873 /* Need type check, even if the load result is unused. */
1874 asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE);
1875 emit_i8(as, ~irt_type(t));
1876 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
1880 /* -- Allocations --------------------------------------------------------- */
1882 static void asm_snew(ASMState *as, IRIns *ir)
1884 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_new];
1885 IRRef args[3];
1886 args[0] = ASMREF_L;
1887 args[1] = ir->op1;
1888 args[2] = ir->op2;
1889 as->gcsteps++;
1890 asm_setupresult(as, ir, ci);
1891 asm_gencall(as, ci, args);
1894 static void asm_tnew(ASMState *as, IRIns *ir)
1896 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_new1];
1897 IRRef args[2];
1898 args[0] = ASMREF_L;
1899 args[1] = ASMREF_TMP1;
1900 as->gcsteps++;
1901 asm_setupresult(as, ir, ci);
1902 asm_gencall(as, ci, args);
1903 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1 | (ir->op2 << 24));
1906 static void asm_tdup(ASMState *as, IRIns *ir)
1908 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_dup];
1909 IRRef args[2];
1910 args[0] = ASMREF_L;
1911 args[1] = ir->op1;
1912 as->gcsteps++;
1913 asm_setupresult(as, ir, ci);
1914 asm_gencall(as, ci, args);
1917 /* -- Write barriers ------------------------------------------------------ */
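/* Editor's note: asm_tbar below implements the backward barrier for table
** stores. Read bottom-up (code is generated backwards), it emits roughly:
**   test byte [tab+marked], LJ_GC_BLACK
**   jz .done                        ; table not black -> nothing to do
**   and byte [tab+marked], ~LJ_GC_BLACK
**   mov tmp, [g->gc.grayagain]
**   mov [g->gc.grayagain], tab      ; push table onto the grayagain list
**   mov [tab+gclist], tmp
** .done:
*/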
1919 static void asm_tbar(ASMState *as, IRIns *ir)
1921 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
1922 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
1923 MCLabel l_end = emit_label(as);
1924 emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist));
1925 emit_setgl(as, tab, gc.grayagain);
1926 emit_getgl(as, tmp, gc.grayagain);
1927 emit_i8(as, ~LJ_GC_BLACK);
1928 emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
1929 emit_sjcc(as, CC_Z, l_end);
1930 emit_i8(as, LJ_GC_BLACK);
1931 emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
1934 static void asm_obar(ASMState *as, IRIns *ir)
1936 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
1937 IRRef args[2];
1938 MCLabel l_end;
1939 Reg obj;
1940 /* No need for other object barriers (yet). */
1941 lua_assert(IR(ir->op1)->o == IR_UREFC);
1942 l_end = emit_label(as);
1943 args[0] = ASMREF_TMP1;
1944 args[1] = ir->op1;
1945 asm_gencall(as, ci, args);
1946 emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
1947 obj = IR(ir->op1)->r;
1948 emit_sjcc(as, CC_Z, l_end);
1949 emit_i8(as, LJ_GC_WHITES);
1950 if (irref_isk(ir->op2)) {
1951 GCobj *vp = ir_kgc(IR(ir->op2));
1952 emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
1953 } else {
1954 Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
1955 emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
1957 emit_sjcc(as, CC_Z, l_end);
1958 emit_i8(as, LJ_GC_BLACK);
1959 emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
1960 (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
1963 /* -- FP/int arithmetic and logic operations ------------------------------ */
1965 /* Load reference onto x87 stack. Force a spill to memory if needed. */
1966 static void asm_x87load(ASMState *as, IRRef ref)
1968 IRIns *ir = IR(ref);
1969 if (ir->o == IR_KNUM) {
1970 cTValue *tv = ir_knum(ir);
1971 if (tvispzero(tv)) /* Use fldz only for +0. */
1972 emit_x87op(as, XI_FLDZ);
1973 else if (tvispone(tv))
1974 emit_x87op(as, XI_FLD1);
1975 else
1976 emit_rma(as, XO_FLDq, XOg_FLDq, tv);
1977 } else if (ir->o == IR_TONUM && !ra_used(ir) &&
1978 !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
1979 IRIns *iri = IR(ir->op1);
1980 emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
1981 } else {
1982 emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
1986 /* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
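/* Editor's note: as the comment above implies, x^y is split earlier into
** exp2(y*log2(x)). The matcher below therefore looks for the exact shape
** EXP2(MUL(LOG2(x), y)), with the MUL and LOG2 otherwise unused, and
** collapses the chain into one call to lj_vm_pow_sse, with x in xmm0,
** y in xmm1 and the result in xmm0.
*/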
1987 static int fpmjoin_pow(ASMState *as, IRIns *ir)
1989 IRIns *irp = IR(ir->op1);
1990 if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
1991 IRIns *irpp = IR(irp->op1);
1992 if (irpp == ir-2 && irpp->o == IR_FPMATH &&
1993 irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
1994 /* The modified regs must match with the *.dasc implementation. */
1995 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
1996 IRIns *irx;
1997 if (ra_hasreg(ir->r))
1998 rset_clear(drop, ir->r); /* Dest reg handled below. */
1999 ra_evictset(as, drop);
2000 ra_destreg(as, ir, RID_XMM0);
2001 emit_call(as, lj_vm_pow_sse);
2002 irx = IR(irpp->op1);
2003 if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
2004 irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
2005 ra_left(as, RID_XMM0, irpp->op1);
2006 ra_left(as, RID_XMM1, irp->op2);
2007 return 1;
2010 return 0;
2013 static void asm_fpmath(ASMState *as, IRIns *ir)
2015 IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER;
2016 if (fpm == IRFPM_SQRT) {
2017 Reg dest = ra_dest(as, ir, RSET_FPR);
2018 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
2019 emit_mrm(as, XO_SQRTSD, dest, left);
2020 } else if (fpm <= IRFPM_TRUNC) {
2021 if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */
2022 Reg dest = ra_dest(as, ir, RSET_FPR);
2023 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
2024 /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
2025 ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
2026 ** This is atrocious, but the alternatives are much worse. */
2028 /* Round down/up/trunc == 1001/1010/1011. */
2029 emit_i8(as, 0x09 + fpm);
2030 emit_mrm(as, XO_ROUNDSD, dest, left);
2031 if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
2032 as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
2034 *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
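/* Editor's note: ROUNDSD xmm, xmm/m64, imm8 encodes as 66 0F 3A 0B /r ib.
** Since x86Op holds at most three opcode bytes, 0F 3A 0B is emitted as if it
** were a complete opcode, the 0x66 prefix is prepended by hand above, and on
** x64 a REX byte that ended up after the leading 0F is swapped back in front
** of it, giving the legal order 66 [REX] 0F 3A 0B.
*/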
2035 } else { /* Call helper functions for SSE2 variant. */
2036 /* The modified regs must match with the *.dasc implementation. */
2037 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
2038 if (ra_hasreg(ir->r))
2039 rset_clear(drop, ir->r); /* Dest reg handled below. */
2040 ra_evictset(as, drop);
2041 ra_destreg(as, ir, RID_XMM0);
2042 emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
2043 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
2044 ra_left(as, RID_XMM0, ir->op1);
2046 } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
2047 /* Rejoined to pow(). */
2048 } else { /* Handle x87 ops. */
2049 int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
2050 Reg dest = ir->r;
2051 if (ra_hasreg(dest)) {
2052 ra_free(as, dest);
2053 ra_modified(as, dest);
2054 emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
2056 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
2057 switch (fpm) { /* st0 = lj_vm_*(st0) */
2058 case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
2059 case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
2060 case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
2061 case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
2062 case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
2063 case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
2064 /* Note: the use of fyl2xp1 would be pointless here. When computing
2065 ** log(1.0+eps) the precision is already lost after 1.0 is added.
2066 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense. */
2068 emit_x87op(as, XI_FYL2X); break;
2069 case IRFPM_OTHER:
2070 switch (ir->o) {
2071 case IR_ATAN2:
2072 emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
2073 case IR_LDEXP:
2074 emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
2075 default: lua_assert(0); break;
2077 break;
2078 default: lua_assert(0); break;
2080 asm_x87load(as, ir->op1);
2081 switch (fpm) {
2082 case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
2083 case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
2084 case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
2085 case IRFPM_OTHER:
2086 if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
2087 break;
2088 default: break;
2093 static void asm_powi(ASMState *as, IRIns *ir)
2095 /* The modified regs must match with the *.dasc implementation. */
2096 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
2097 if (ra_hasreg(ir->r))
2098 rset_clear(drop, ir->r); /* Dest reg handled below. */
2099 ra_evictset(as, drop);
2100 ra_destreg(as, ir, RID_XMM0);
2101 emit_call(as, lj_vm_powi_sse);
2102 ra_left(as, RID_XMM0, ir->op1);
2103 ra_left(as, RID_EAX, ir->op2);
2106 /* Find out whether swapping operands might be beneficial. */
2107 static int swapops(ASMState *as, IRIns *ir)
2109 IRIns *irl = IR(ir->op1);
2110 IRIns *irr = IR(ir->op2);
2111 lua_assert(ra_noreg(irr->r));
2112 if (!irm_iscomm(lj_ir_mode[ir->o]))
2113 return 0; /* Can't swap non-commutative operations. */
2114 if (irref_isk(ir->op2))
2115 return 0; /* Don't swap constants to the left. */
2116 if (ra_hasreg(irl->r))
2117 return 1; /* Swap if left already has a register. */
2118 if (ra_samehint(ir->r, irr->r))
2119 return 1; /* Swap if dest and right have matching hints. */
2120 if (ir->op1 < as->loopref && !irt_isphi(irl->t) &&
2121 !(ir->op2 < as->loopref && !irt_isphi(irr->t)))
2122 return 1; /* Swap invariants to the right. */
2123 if (opisfusableload(irl->o))
2124 return 1; /* Swap fusable loads to the right. */
2125 return 0; /* Otherwise don't swap. */
2128 static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
2130 IRRef lref = ir->op1;
2131 IRRef rref = ir->op2;
2132 RegSet allow = RSET_FPR;
2133 Reg dest;
2134 Reg right = IR(rref)->r;
2135 if (ra_hasreg(right))
2136 rset_clear(allow, right);
2137 dest = ra_dest(as, ir, allow);
2138 if (lref == rref) {
2139 right = dest;
2140 } else if (ra_noreg(right)) {
2141 if (swapops(as, ir)) {
2142 IRRef tmp = lref; lref = rref; rref = tmp;
2144 right = asm_fuseload(as, rref, rset_clear(allow, dest));
2146 emit_mrm(as, xo, dest, right);
2147 ra_left(as, dest, lref);
2150 static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
2152 IRRef lref = ir->op1;
2153 IRRef rref = ir->op2;
2154 RegSet allow = RSET_GPR;
2155 Reg dest, right;
2156 if (as->testmcp == as->mcp) { /* Drop test r,r instruction. */
2157 as->testmcp = NULL;
2158 as->mcp += (LJ_64 && *as->mcp != XI_TEST) ? 3 : 2;
2160 right = IR(rref)->r;
2161 if (ra_hasreg(right))
2162 rset_clear(allow, right);
2163 dest = ra_dest(as, ir, allow);
2164 if (lref == rref) {
2165 right = dest;
2166 } else if (ra_noreg(right) && !irref_isk(rref)) {
2167 if (swapops(as, ir)) {
2168 IRRef tmp = lref; lref = rref; rref = tmp;
2170 right = asm_fuseload(as, rref, rset_clear(allow, dest));
2171 /* Note: fuses only with IR_FLOAD for now. */
2173 if (irt_isguard(ir->t)) /* For IR_ADDOV etc. */
2174 asm_guardcc(as, CC_O);
2175 if (ra_hasreg(right))
2176 emit_mrm(as, XO_ARITH(xa), dest, right);
2177 else
2178 emit_gri(as, XG_ARITHi(xa), dest, IR(ir->op2)->i);
2179 ra_left(as, dest, lref);
2182 /* LEA is really a 4-operand ADD with an independent destination register,
2183 ** up to two source registers and an immediate. One register can be scaled
2184 ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
2185 ** instructions.
2187 ** Currently only a few common cases are supported:
2188 ** - 3-operand ADD: y = a+b; y = a+k with a and b already allocated
2189 ** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b
2190 ** - Right ADD fusion: y = a+(b+k)
2191 ** The omitted variants have already been reduced by FOLD.
2193 ** There are more fusion opportunities, like gathering shifts or joining
2194 ** common references. But these are probably not worth the trouble, since
2195 ** array indexing is not decomposed and already makes use of all fields
2196 ** of the ModRM operand. */
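/* Editor's note, worked examples for the supported shapes (illustrative
** register names only):
**   y = a+b       ->  lea y, [a+b]
**   y = a+k       ->  lea y, [a+k]
**   y = (a+k)+b   ->  lea y, [a+b+k]    ; left ADD fusion
**   y = a+(b+k)   ->  lea y, [a+b+k]    ; right ADD fusion
** Unlike add, lea leaves the flags alone and allows a destination distinct
** from both sources, which is what saves the extra mov.
*/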
2198 static int asm_lea(ASMState *as, IRIns *ir)
2200 IRIns *irl = IR(ir->op1);
2201 IRIns *irr = IR(ir->op2);
2202 RegSet allow = RSET_GPR;
2203 Reg dest;
2204 as->mrm.base = as->mrm.idx = RID_NONE;
2205 as->mrm.scale = XM_SCALE1;
2206 as->mrm.ofs = 0;
2207 if (ra_hasreg(irl->r)) {
2208 rset_clear(allow, irl->r);
2209 as->mrm.base = irl->r;
2210 if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
2211 /* The PHI renaming logic does a better job in some cases. */
2212 if (ra_hasreg(ir->r) &&
2213 ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
2214 (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
2215 return 0;
2216 if (irref_isk(ir->op2)) {
2217 as->mrm.ofs = irr->i;
2218 } else {
2219 rset_clear(allow, irr->r);
2220 as->mrm.idx = irr->r;
2222 } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
2223 irref_isk(irr->op2)) {
2224 Reg idx = ra_alloc1(as, irr->op1, allow);
2225 rset_clear(allow, idx);
2226 as->mrm.idx = (uint8_t)idx;
2227 as->mrm.ofs = IR(irr->op2)->i;
2228 } else {
2229 return 0;
2231 } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
2232 (irref_isk(ir->op2) || irref_isk(irl->op2))) {
2233 Reg idx, base = ra_alloc1(as, irl->op1, allow);
2234 rset_clear(allow, base);
2235 as->mrm.base = (uint8_t)base;
2236 if (irref_isk(ir->op2)) {
2237 as->mrm.ofs = irr->i;
2238 idx = ra_alloc1(as, irl->op2, allow);
2239 } else {
2240 as->mrm.ofs = IR(irl->op2)->i;
2241 idx = ra_alloc1(as, ir->op2, allow);
2243 rset_clear(allow, idx);
2244 as->mrm.idx = (uint8_t)idx;
2245 } else {
2246 return 0;
2248 dest = ra_dest(as, ir, allow);
2249 emit_mrm(as, XO_LEA, dest, RID_MRM);
2250 return 1; /* Success. */
2253 static void asm_add(ASMState *as, IRIns *ir)
2255 if (irt_isnum(ir->t))
2256 asm_fparith(as, ir, XO_ADDSD);
2257 else if ((as->flags & JIT_F_LEA_AGU) || as->testmcp == as->mcp ||
2258 !asm_lea(as, ir))
2259 asm_intarith(as, ir, XOg_ADD);
2262 static void asm_bitnot(ASMState *as, IRIns *ir)
2264 Reg dest = ra_dest(as, ir, RSET_GPR);
2265 emit_rr(as, XO_GROUP3, XOg_NOT, dest);
2266 ra_left(as, dest, ir->op1);
2269 static void asm_bitswap(ASMState *as, IRIns *ir)
2271 Reg dest = ra_dest(as, ir, RSET_GPR);
2272 MCode *p = as->mcp;
2273 p[-1] = (MCode)(XI_BSWAP+(dest&7));
2274 p[-2] = 0x0f;
2275 p -= 2;
2276 REXRB(p, 0, dest);
2277 as->mcp = p;
2278 ra_left(as, dest, ir->op1);
2281 static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
2283 IRRef rref = ir->op2;
2284 IRIns *irr = IR(rref);
2285 Reg dest;
2286 if (irref_isk(rref)) { /* Constant shifts. */
2287 int shift;
2288 dest = ra_dest(as, ir, RSET_GPR);
2289 shift = irr->i & 31; /* Handle shifts of 0..31 bits. */
2290 switch (shift) {
2291 case 0: return;
2292 case 1: emit_rr(as, XO_SHIFT1, (Reg)xs, dest); break;
2293 default: emit_shifti(as, xs, dest, shift); break;
2295 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */
2296 RegSet allow = rset_exclude(RSET_GPR, RID_ECX);
2297 Reg right = irr->r;
2298 if (ra_noreg(right)) {
2299 right = ra_allocref(as, rref, RID2RSET(RID_ECX));
2300 } else if (right != RID_ECX) {
2301 rset_clear(allow, right);
2302 ra_scratch(as, RID2RSET(RID_ECX));
2304 dest = ra_dest(as, ir, allow);
2305 emit_rr(as, XO_SHIFTcl, (Reg)xs, dest);
2306 if (right != RID_ECX)
2307 emit_rr(as, XO_MOV, RID_ECX, right);
2309 ra_left(as, dest, ir->op1);
2311 /* Note: avoid using the flags resulting from a shift or rotate!
2312 ** All of them cause a partial flag stall, except for r,1 shifts
2313 ** (but not rotates). And a shift count of 0 leaves the flags unmodified. */
2317 /* -- Comparisons --------------------------------------------------------- */
2319 /* Virtual flags for unordered FP comparisons. */
2320 #define VCC_U 0x100 /* Unordered. */
2321 #define VCC_P 0x200 /* Needs extra CC_P branch. */
2322 #define VCC_S 0x400 /* Swap avoids CC_P branch. */
2323 #define VCC_PS (VCC_P|VCC_S)
2325 static void asm_comp_(ASMState *as, IRIns *ir, int cc)
2327 if (irt_isnum(ir->t)) {
2328 IRRef lref = ir->op1;
2329 IRRef rref = ir->op2;
2330 Reg left, right;
2331 MCLabel l_around;
2333 /* An extra CC_P branch is required to preserve ordered/unordered
2334 ** semantics for FP comparisons. This can be avoided by swapping
2335 ** the operands and inverting the condition (except for EQ and UNE).
2336 ** So always try to swap if possible.
2338 ** Another option would be to swap operands to achieve better memory
2339 ** operand fusion. But it's unlikely that this outweighs the cost
2340 ** of the extra branches. */
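/* Editor's note, an informal example: to guard x < y one would compare
** ucomisd x, y and exit on CC_AE ("not below"). But an unordered result
** (NaN) sets CF=ZF=PF=1, looks like "below" and would not take the exit,
** so a separate CC_P exit is needed. Swapping to ucomisd y, x and exiting
** on CC_BE folds the unordered case into the same branch, since CF=ZF=1
** already satisfies "below or equal".
*/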
2342 if (cc & VCC_S) { /* Swap? */
2343 IRRef tmp = lref; lref = rref; rref = tmp;
2344 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */
2346 left = ra_alloc1(as, lref, RSET_FPR);
2347 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
2348 l_around = emit_label(as);
2349 asm_guardcc(as, cc >> 4);
2350 if (cc & VCC_P) { /* Extra CC_P branch required? */
2351 if (!(cc & VCC_U)) {
2352 asm_guardcc(as, CC_P); /* Branch to exit for ordered comparisons. */
2353 } else if (l_around != as->invmcp) {
2354 emit_sjcc(as, CC_P, l_around); /* Branch around for unordered. */
2355 } else {
2356 /* Patched to mcloop by asm_loop_fixup. */
2357 as->loopinv = 2;
2358 if (as->realign)
2359 emit_sjcc(as, CC_P, as->mcp);
2360 else
2361 emit_jcc(as, CC_P, as->mcp);
2364 emit_mrm(as, XO_UCOMISD, left, right);
2365 } else {
2366 IRRef lref = ir->op1, rref = ir->op2;
2367 IROp leftop = (IROp)(IR(lref)->o);
2368 lua_assert(irt_isint(ir->t) || (irt_isaddr(ir->t) && (cc & 0xe) == CC_E));
2369 /* Swap constants (only for ABC) and fusable loads to the right. */
2370 if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
2371 if ((cc & 0xc) == 0xc) cc ^= 3; /* L <-> G, LE <-> GE */
2372 else if ((cc & 0xa) == 0x2) cc ^= 5; /* A <-> B, AE <-> BE */
2373 lref = ir->op2; rref = ir->op1;
2375 if (irref_isk(rref)) {
2376 IRIns *irl = IR(lref);
2377 int32_t imm = IR(rref)->i;
2378 /* Check whether we can use test ins. Not for unsigned, since CF=0. */
2379 int usetest = (imm == 0 && (cc & 0xa) != 0x2);
2380 if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
2381 /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
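/* Editor's note: e.g. a guard on band(x, 7) == 0 becomes
**   test reg_or_mem, 7
**   jnz exit
** instead of materializing the AND result and comparing it against zero.
*/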
2382 Reg right, left = RID_NONE;
2383 RegSet allow = RSET_GPR;
2384 if (!irref_isk(irl->op2)) {
2385 left = ra_alloc1(as, irl->op2, allow);
2386 rset_clear(allow, left);
2388 right = asm_fuseload(as, irl->op1, allow);
2389 asm_guardcc(as, cc);
2390 if (irref_isk(irl->op2)) {
2391 emit_i32(as, IR(irl->op2)->i);
2392 emit_mrm(as, XO_GROUP3, XOg_TEST, right);
2393 } else {
2394 emit_mrm(as, XO_TEST, left, right);
2396 } else {
2397 Reg left;
2398 if (opisfusableload((IROp)irl->o) &&
2399 ((irt_isu8(irl->t) && checku8(imm)) ||
2400 ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
2401 (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
2402 /* Only the IRT_INT case is fused by asm_fuseload.
2403 ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
2404 ** are handled here.
2405 ** Note that cmp word [mem], imm16 should not be generated,
2406 ** since it has a length-changing prefix. Compares of a word
2407 ** against a sign-extended imm8 are ok, however. */
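/* Editor's note, encoding examples for the cases handled here:
**   cmp byte [mem], imm8    ; 80 /7 ib               -- fine
**   cmp word [mem], imm8    ; 66 83 /7 ib, sign-ext  -- fine
**   cmp word [mem], imm16   ; 66 81 /7 iw            -- avoided (LCP stall)
** The 0x66 prefix combined with a 16-bit immediate changes the instruction
** length, which is why the conditions above only accept imm8-sized values.
*/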
2409 IRType1 origt = irl->t; /* Temporarily flip types. */
2410 irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
2411 left = asm_fuseload(as, lref, RSET_GPR);
2412 irl->t = origt;
2413 if (left == RID_MRM) { /* Fusion succeeded? */
2414 asm_guardcc(as, cc);
2415 emit_i8(as, imm);
2416 emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
2417 XO_ARITHib : XO_ARITHiw8, XOg_CMP, RID_MRM);
2418 return;
2419 } /* Otherwise handle register case as usual. */
2420 } else {
2421 left = asm_fuseload(as, lref, RSET_GPR);
2423 asm_guardcc(as, cc);
2424 if (usetest && left != RID_MRM) {
2425 /* Use test r,r instead of cmp r,0. */
2426 if (irl+1 == ir) /* Referencing previous ins? */
2427 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */
2428 emit_rr(as, XO_TEST, left, left);
2429 } else {
2430 x86Op xo;
2431 if (checki8(imm)) {
2432 emit_i8(as, imm);
2433 xo = XO_ARITHi8;
2434 } else {
2435 emit_i32(as, imm);
2436 xo = XO_ARITHi;
2438 emit_mrm(as, xo, XOg_CMP, left);
2441 } else {
2442 Reg left = ra_alloc1(as, lref, RSET_GPR);
2443 Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
2444 asm_guardcc(as, cc);
2445 emit_mrm(as, XO_CMP, left, right);
2450 #define asm_comp(as, ir, ci, cf, cu) \
2451 asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
2453 /* -- GC handling --------------------------------------------------------- */
2455 /* Sync all live GC values to Lua stack slots. */
2456 static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base)
2458 /* Some care must be taken when allocating registers here, since this is
2459 ** not part of the fast path. All scratch registers are evicted in the
2460 ** fast path, so it's easiest to force allocation from scratch registers
2461 ** only. This avoids register allocation state unification. */
2463 RegSet allow = rset_exclude(RSET_SCRATCH & RSET_GPR, base);
2464 IRRef2 *map = &as->T->snapmap[snap->mapofs];
2465 BCReg s, nslots = snap->nslots;
2466 for (s = 0; s < nslots; s++) {
2467 IRRef ref = snap_ref(map[s]);
2468 if (!irref_isk(ref)) {
2469 IRIns *ir = IR(ref);
2470 if (ir->o == IR_FRAME) {
2471 /* NYI: sync the frame, bump base, set topslot, clear new slots. */
2472 lj_trace_err(as->J, LJ_TRERR_NYIGCF);
2473 } else if (irt_isgcv(ir->t) &&
2474 !(ir->o == IR_SLOAD && ir->op1 < nslots && map[ir->op1] == 0)) {
2475 Reg src = ra_alloc1(as, ref, allow);
2476 int32_t ofs = 8*(int32_t)(s-1);
2477 emit_movtomro(as, src, base, ofs);
2478 emit_movmroi(as, base, ofs+4, irt_toitype(ir->t));
2479 checkmclim(as);
2485 /* Check GC threshold and do one or more GC steps. */
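/* Editor's note: in forward order the generated check amounts to roughly
**   mov tmp, [g->gc.total]
**   cmp tmp, [g->gc.threshold]
**   jb .no_gc
**   ...sync GC values to stack slots, set L->base, store PC in C frame...
**   call lj_gc_step_jit
** .no_gc:
** i.e. the slow path is skipped entirely while below the GC threshold.
*/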
2486 static void asm_gc_check(ASMState *as, SnapShot *snap)
2488 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
2489 IRRef args[2];
2490 MCLabel l_end;
2491 Reg base, lstate, tmp;
2492 RegSet drop = RSET_SCRATCH;
2493 if (ra_hasreg(IR(REF_BASE)->r)) /* Stack may be reallocated by the GC. */
2494 drop |= RID2RSET(IR(REF_BASE)->r); /* Need to evict BASE, too. */
2495 ra_evictset(as, drop);
2496 l_end = emit_label(as);
2497 args[0] = ASMREF_L;
2498 args[1] = ASMREF_TMP1;
2499 asm_gencall(as, ci, args);
2500 tmp = ra_releasetmp(as, ASMREF_TMP1);
2501 emit_loadi(as, tmp, (int32_t)as->gcsteps);
2502 /* We don't know spadj yet, so get the C frame from L->cframe. */
2503 emit_movmroi(as, tmp, CFRAME_OFS_PC,
2504 (int32_t)as->T->snapmap[snap->mapofs+snap->nslots]);
2505 emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
2506 lstate = IR(ASMREF_L)->r;
2507 emit_movrmro(as, tmp, lstate, offsetof(lua_State, cframe));
2508 /* It's ok if lstate is already in a non-scratch reg. But all allocations
2509 ** in the non-fast path must use a scratch reg. See comment above. */
2511 base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
2512 emit_movtomro(as, base, lstate, offsetof(lua_State, base));
2513 asm_gc_sync(as, snap, base);
2514 /* BASE/L get restored anyway, better do it inside the slow path. */
2515 if (as->parent || as->curins == as->loopref) ra_restore(as, REF_BASE);
2516 if (rset_test(RSET_SCRATCH, lstate) && ra_hasreg(IR(ASMREF_L)->r))
2517 ra_restore(as, ASMREF_L);
2518 /* Jump around GC step if GC total < GC threshold. */
2519 tmp = ra_scratch(as, RSET_SCRATCH & RSET_GPR);
2520 emit_sjcc(as, CC_B, l_end);
2521 emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold);
2522 emit_getgl(as, tmp, gc.total);
2523 as->gcsteps = 0;
2524 checkmclim(as);
2527 /* -- PHI and loop handling ----------------------------------------------- */
2529 /* Break a PHI cycle by renaming to a free register (evict if needed). */
2530 static void asm_phi_break(ASMState *as, RegSet blocked, RegSet blockedby,
2531 RegSet allow)
2533 RegSet candidates = blocked & allow;
2534 if (candidates) { /* If this register file has candidates. */
2535 /* Note: the set for ra_pick cannot be empty, since each register file
2536 ** has some registers never allocated to PHIs. */
2538 Reg down, up = ra_pick(as, ~blocked & allow); /* Get a free register. */
2539 if (candidates & ~blockedby) /* Optimize shifts, else it's a cycle. */
2540 candidates = candidates & ~blockedby;
2541 down = rset_picktop(candidates); /* Pick candidate PHI register. */
2542 ra_rename(as, down, up); /* And rename it to the free register. */
2546 /* PHI register shuffling.
2548 ** The allocator tries hard to preserve PHI register assignments across
2549 ** the loop body. Most of the time this loop does nothing, since there
2550 ** are no register mismatches.
2552 ** If a register mismatch is detected and ...
2553 ** - the register is currently free: rename it.
2554 ** - the register is blocked by an invariant: restore/remat and rename it.
2555 ** - Otherwise the register is used by another PHI, so mark it as blocked.
2557 ** The renames are order-sensitive, so just retry the loop if a register
2558 ** is marked as blocked, but has been freed in the meantime. A cycle is
2559 ** detected if all of the blocked registers are allocated. To break the
2560 ** cycle rename one of them to a free register and retry.
2562 ** Note that PHI spill slots are kept in sync and don't need to be shuffled. */
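/* Editor's note, a minimal cycle example: PHI a is assigned r1 but its left
** value currently lives in r2, while PHI b is assigned r2 but its left value
** lives in r1. Each register is blocked by the other PHI and no rename can
** make progress, so asm_phi_break (above) renames one blocked register to a
** free one; the next pass of the loop in asm_phi_shuffle then completes the
** shuffle normally.
*/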
2564 static void asm_phi_shuffle(ASMState *as)
2566 RegSet work;
2568 /* Find and resolve PHI register mismatches. */
2569 for (;;) {
2570 RegSet blocked = RSET_EMPTY;
2571 RegSet blockedby = RSET_EMPTY;
2572 RegSet phiset = as->phiset;
2573 while (phiset) { /* Check all left PHI operand registers. */
2574 Reg r = rset_picktop(phiset);
2575 IRIns *irl = IR(as->phireg[r]);
2576 Reg left = irl->r;
2577 if (r != left) { /* Mismatch? */
2578 if (!rset_test(as->freeset, r)) { /* PHI register blocked? */
2579 IRRef ref = regcost_ref(as->cost[r]);
2580 if (irt_ismarked(IR(ref)->t)) { /* Blocked by other PHI (w/reg)? */
2581 rset_set(blocked, r);
2582 if (ra_hasreg(left))
2583 rset_set(blockedby, left);
2584 left = RID_NONE;
2585 } else { /* Otherwise grab register from invariant. */
2586 ra_restore(as, ref);
2587 checkmclim(as);
2590 if (ra_hasreg(left)) {
2591 ra_rename(as, left, r);
2592 checkmclim(as);
2595 rset_clear(phiset, r);
2597 if (!blocked) break; /* Finished. */
2598 if (!(as->freeset & blocked)) { /* Break cycles if none are free. */
2599 asm_phi_break(as, blocked, blockedby, RSET_GPR);
2600 asm_phi_break(as, blocked, blockedby, RSET_FPR);
2601 checkmclim(as);
2602 } /* Else retry some more renames. */
2605 /* Restore/remat invariants whose registers are modified inside the loop. */
2606 work = as->modset & ~(as->freeset | as->phiset);
2607 while (work) {
2608 Reg r = rset_picktop(work);
2609 ra_restore(as, regcost_ref(as->cost[r]));
2610 rset_clear(work, r);
2611 checkmclim(as);
2614 /* Allocate and save all unsaved PHI regs and clear marks. */
2615 work = as->phiset;
2616 while (work) {
2617 Reg r = rset_picktop(work);
2618 IRRef lref = as->phireg[r];
2619 IRIns *ir = IR(lref);
2620 if (ra_hasspill(ir->s)) { /* Left PHI gained a spill slot? */
2621 irt_clearmark(ir->t); /* Handled here, so clear marker now. */
2622 ra_alloc1(as, lref, RID2RSET(r));
2623 ra_save(as, ir, r); /* Save to spill slot inside the loop. */
2624 checkmclim(as);
2626 rset_clear(work, r);
2630 /* Emit renames for left PHIs which are only spilled outside the loop. */
2631 static void asm_phi_fixup(ASMState *as)
2633 RegSet work = as->phiset;
2634 while (work) {
2635 Reg r = rset_picktop(work);
2636 IRRef lref = as->phireg[r];
2637 IRIns *ir = IR(lref);
2638 /* Left PHI gained a spill slot before the loop? */
2639 if (irt_ismarked(ir->t) && ra_hasspill(ir->s)) {
2640 IRRef ren;
2641 lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno);
2642 ren = tref_ref(lj_ir_emit(as->J));
2643 as->ir = as->T->ir; /* The IR may have been reallocated. */
2644 IR(ren)->r = (uint8_t)r;
2645 IR(ren)->s = SPS_NONE;
2647 irt_clearmark(ir->t); /* Always clear marker. */
2648 rset_clear(work, r);
2652 /* Setup right PHI reference. */
2653 static void asm_phi(ASMState *as, IRIns *ir)
2655 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
2656 RegSet afree = (as->freeset & allow);
2657 IRIns *irl = IR(ir->op1);
2658 IRIns *irr = IR(ir->op2);
2659 /* Spill slot shuffling is not implemented yet (but rarely needed). */
2660 if (ra_hasspill(irl->s) || ra_hasspill(irr->s))
2661 lj_trace_err(as->J, LJ_TRERR_NYIPHI);
2662 /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */
2663 if ((afree & (afree-1))) { /* Two or more free registers? */
2664 Reg r;
2665 if (ra_noreg(irr->r)) { /* Get a register for the right PHI. */
2666 r = ra_allocref(as, ir->op2, allow);
2667 } else { /* Duplicate right PHI, need a copy (rare). */
2668 r = ra_scratch(as, allow);
2669 emit_movrr(as, r, irr->r);
2671 ir->r = (uint8_t)r;
2672 rset_set(as->phiset, r);
2673 as->phireg[r] = (IRRef1)ir->op1;
2674 irt_setmark(irl->t); /* Marks left PHIs _with_ register. */
2675 if (ra_noreg(irl->r))
2676 ra_sethint(irl->r, r); /* Set register hint for left PHI. */
2677 } else { /* Otherwise allocate a spill slot. */
2678 /* This is overly restrictive, but it triggers only on synthetic code. */
2679 if (ra_hasreg(irl->r) || ra_hasreg(irr->r))
2680 lj_trace_err(as->J, LJ_TRERR_NYIPHI);
2681 ra_spill(as, ir);
2682 irl->s = irr->s = ir->s; /* Sync left/right PHI spill slots. */
2686 /* Fixup the loop branch. */
2687 static void asm_loop_fixup(ASMState *as)
2689 MCode *p = as->mctop;
2690 MCode *target = as->mcp;
2691 if (as->realign) { /* Realigned loops use short jumps. */
2692 as->realign = NULL; /* Stop another retry. */
2693 lua_assert(((intptr_t)target & 15) == 0);
2694 if (as->loopinv) { /* Inverted loop branch? */
2695 p -= 5;
2696 p[0] = XI_JMP;
2697 lua_assert(target - p >= -128);
2698 p[-1] = (MCode)(target - p); /* Patch sjcc. */
2699 if (as->loopinv == 2)
2700 p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */
2701 } else {
2702 lua_assert(target - p >= -128);
2703 p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */
2704 p[-2] = XI_JMPs;
2706 } else {
2707 MCode *newloop;
2708 p[-5] = XI_JMP;
2709 if (as->loopinv) { /* Inverted loop branch? */
2710 /* asm_guardcc already inverted the jcc and patched the jmp. */
2711 p -= 5;
2712 newloop = target+4;
2713 *(int32_t *)(p-4) = (int32_t)(target - p); /* Patch jcc. */
2714 if (as->loopinv == 2) {
2715 *(int32_t *)(p-10) = (int32_t)(target - p + 6); /* Patch opt. jp. */
2716 newloop = target+8;
2718 } else { /* Otherwise just patch jmp. */
2719 *(int32_t *)(p-4) = (int32_t)(target - p);
2720 newloop = target+3;
2722 /* Realign small loops and shorten the loop branch. */
2723 if (newloop >= p - 128) {
2724 as->realign = newloop; /* Force a retry and remember alignment. */
2725 as->curins = as->stopins; /* Abort asm_trace now. */
2726 as->T->nins = as->orignins; /* Remove any added renames. */
2731 /* Middle part of a loop. */
2732 static void asm_loop(ASMState *as)
2734 /* LOOP is a guard, so the snapno is up to date. */
2735 as->loopsnapno = as->snapno;
2736 if (as->gcsteps)
2737 asm_gc_check(as, &as->T->snap[as->loopsnapno]);
2738 /* LOOP marks the transition from the variant to the invariant part. */
2739 as->testmcp = as->invmcp = NULL;
2740 as->sectref = 0;
2741 if (!neverfuse(as)) as->fuseref = 0;
2742 asm_phi_shuffle(as);
2743 asm_loop_fixup(as);
2744 as->mcloop = as->mcp;
2745 RA_DBGX((as, "===== LOOP ====="));
2746 if (!as->realign) RA_DBG_FLUSH();
2749 /* -- Head of trace ------------------------------------------------------- */
2751 /* Rematerialize all remaining constants in registers. */
2752 static void asm_const_remat(ASMState *as)
2754 RegSet work = ~as->freeset & RSET_ALL;
2755 while (work) {
2756 Reg r = rset_pickbot(work);
2757 IRRef ref = regcost_ref(as->cost[r]);
2758 if (irref_isk(ref) || ref == REF_BASE) {
2759 ra_rematk(as, IR(ref));
2760 checkmclim(as);
2762 rset_clear(work, r);
2766 /* Head of a root trace. */
2767 static void asm_head_root(ASMState *as)
2769 int32_t spadj;
2770 emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
2771 spadj = sps_adjust(as->evenspill);
2772 as->T->spadjust = (uint16_t)spadj;
2773 emit_addptr(as, RID_ESP, -spadj);
2776 /* Handle BASE coalescing for a root trace. */
2777 static void asm_head_base(ASMState *as)
2779 IRIns *ir = IR(REF_BASE);
2780 Reg r = ir->r;
2781 lua_assert(!ra_hasspill(ir->s));
2782 if (ra_hasreg(r)) {
2783 ra_free(as, r);
2784 if (r != RID_BASE) {
2785 ra_scratch(as, RID2RSET(RID_BASE));
2786 emit_rr(as, XO_MOV, r, RID_BASE);
2791 /* Check Lua stack size for overflow at the start of a side trace.
2792 ** Stack overflow is rare, so let the regular exit handling fix this up.
2793 ** This is done in the context of the *parent* trace and parent exitno! */
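/* Editor's note: in forward order the check below is roughly
**   mov r, [g->jit_L]
**   mov r, [r+offsetof(lua_State, maxstack)]
**   sub r, [g->jit_base]
**   cmp r, 8*topslot
**   jb parent_exit          ; not enough stack -> let the regular exit grow it
** with an optional spill/restore of r around it when no temp. register is
** free.
*/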
2795 static void asm_checkstack(ASMState *as, RegSet allow)
2797 /* Try to get an unused temp. register, otherwise spill/restore eax. */
2798 Reg r = allow ? rset_pickbot(allow) : RID_EAX;
2799 emit_jcc(as, CC_B, exitstub_addr(as->J, as->J->exitno));
2800 if (allow == RSET_EMPTY) /* Restore temp. register. */
2801 emit_rmro(as, XO_MOV, r, RID_ESP, sps_scale(SPS_TEMP1));
2802 emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*as->topslot));
2803 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, ptr2addr(&J2G(as->J)->jit_base));
2804 emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack));
2805 emit_getgl(as, r, jit_L);
2806 if (allow == RSET_EMPTY) /* Spill temp. register. */
2807 emit_rmro(as, XO_MOVto, r, RID_ESP, sps_scale(SPS_TEMP1));
2810 /* Head of a side trace.
2812 ** The current simplistic algorithm requires that all slots inherited
2813 ** from the parent are live in a register between pass 2 and pass 3. This
2814 ** avoids the complexity of stack slot shuffling. But of course this may
2815 ** overflow the register set in some cases and cause the dreaded error:
2816 ** "NYI: register coalescing too complex". A refined algorithm is needed. */
2818 static void asm_head_side(ASMState *as)
2820 IRRef1 sloadins[RID_MAX];
2821 RegSet allow = RSET_ALL; /* Inverse of all coalesced registers. */
2822 RegSet live = RSET_EMPTY; /* Live parent registers. */
2823 int32_t spadj, spdelta;
2824 int pass2 = 0;
2825 int pass3 = 0;
2826 IRRef i;
2828 /* Scan all parent SLOADs and collect register dependencies. */
2829 for (i = as->curins; i > REF_BASE; i--) {
2830 IRIns *ir = IR(i);
2831 lua_assert((ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT)) ||
2832 ir->o == IR_FRAME);
2833 if (ir->o == IR_SLOAD) {
2834 RegSP rs = as->parentmap[ir->op1];
2835 if (ra_hasreg(ir->r)) {
2836 rset_clear(allow, ir->r);
2837 if (ra_hasspill(ir->s))
2838 ra_save(as, ir, ir->r);
2839 } else if (ra_hasspill(ir->s)) {
2840 irt_setmark(ir->t);
2841 pass2 = 1;
2843 if (ir->r == rs) { /* Coalesce matching registers right now. */
2844 ra_free(as, ir->r);
2845 } else if (ra_hasspill(regsp_spill(rs))) {
2846 if (ra_hasreg(ir->r))
2847 pass3 = 1;
2848 } else if (ra_used(ir)) {
2849 sloadins[rs] = (IRRef1)i;
2850 rset_set(live, rs); /* Block live parent register. */
2855 /* Calculate stack frame adjustment. */
2856 spadj = sps_adjust(as->evenspill);
2857 spdelta = spadj - (int32_t)as->parent->spadjust;
2858 if (spdelta < 0) { /* Don't shrink the stack frame. */
2859 spadj = (int32_t)as->parent->spadjust;
2860 spdelta = 0;
2862 as->T->spadjust = (uint16_t)spadj;
2864 /* Reload spilled target registers. */
2865 if (pass2) {
2866 for (i = as->curins; i > REF_BASE; i--) {
2867 IRIns *ir = IR(i);
2868 if (irt_ismarked(ir->t)) {
2869 RegSet mask;
2870 Reg r;
2871 RegSP rs;
2872 irt_clearmark(ir->t);
2873 rs = as->parentmap[ir->op1];
2874 if (!ra_hasspill(regsp_spill(rs)))
2875 ra_sethint(ir->r, rs); /* Hint may be gone, set it again. */
2876 else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s))
2877 continue; /* Same spill slot, do nothing. */
2878 mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow;
2879 if (mask == RSET_EMPTY)
2880 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
2881 r = ra_allocref(as, i, mask);
2882 ra_save(as, ir, r);
2883 rset_clear(allow, r);
2884 if (r == rs) { /* Coalesce matching registers right now. */
2885 ra_free(as, r);
2886 rset_clear(live, r);
2887 } else if (ra_hasspill(regsp_spill(rs))) {
2888 pass3 = 1;
2890 checkmclim(as);
2895 /* Store trace number and adjust stack frame relative to the parent. */
2896 emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
2897 emit_addptr(as, RID_ESP, -spdelta);
2899 /* Restore target registers from parent spill slots. */
2900 if (pass3) {
2901 RegSet work = ~as->freeset & RSET_ALL;
2902 while (work) {
2903 Reg r = rset_pickbot(work);
2904 IRIns *ir = IR(regcost_ref(as->cost[r]));
2905 RegSP rs = as->parentmap[ir->op1];
2906 rset_clear(work, r);
2907 if (ra_hasspill(regsp_spill(rs))) {
2908 int32_t ofs = sps_scale(regsp_spill(rs));
2909 ra_free(as, r);
2910 emit_movrmro(as, r, RID_ESP, ofs);
2911 checkmclim(as);
2916 /* Shuffle registers to match up target regs with parent regs. */
2917 for (;;) {
2918 RegSet work;
2920 /* Repeatedly coalesce free live registers by moving to their target. */
2921 while ((work = as->freeset & live) != RSET_EMPTY) {
2922 Reg rp = rset_pickbot(work);
2923 IRIns *ir = IR(sloadins[rp]);
2924 rset_clear(live, rp);
2925 rset_clear(allow, rp);
2926 ra_free(as, ir->r);
2927 emit_movrr(as, ir->r, rp);
2928 checkmclim(as);
2931 /* We're done if no live registers remain. */
2932 if (live == RSET_EMPTY)
2933 break;
2935 /* Break cycles by renaming one target to a temp. register. */
2936 if (live & RSET_GPR) {
2937 RegSet tmpset = as->freeset & ~live & allow & RSET_GPR;
2938 if (tmpset == RSET_EMPTY)
2939 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
2940 ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset));
2942 if (live & RSET_FPR) {
2943 RegSet tmpset = as->freeset & ~live & allow & RSET_FPR;
2944 if (tmpset == RSET_EMPTY)
2945 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
2946 ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset));
2948 checkmclim(as);
2949 /* Continue with coalescing to fix up the broken cycle(s). */
2952 /* Check Lua stack size if frames have been added. */
2953 if (as->topslot)
2954 asm_checkstack(as, allow & RSET_GPR);
2957 /* -- Tail of trace ------------------------------------------------------- */
2959 /* Sync Lua stack slots to match the last snapshot.
2960 ** Note: code generation is backwards, so this is best read bottom-up. */
2962 static void asm_tail_sync(ASMState *as)
2964 SnapShot *snap = &as->T->snap[as->T->nsnap-1]; /* Last snapshot. */
2965 BCReg s, nslots = snap->nslots;
2966 IRRef2 *map = &as->T->snapmap[snap->mapofs];
2967 IRRef2 *flinks = map + nslots + snap->nframelinks;
2968 BCReg newbase = 0;
2969 BCReg secondbase = ~(BCReg)0;
2970 BCReg topslot = 0;
2972 checkmclim(as);
2973 ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));
2975 /* Must check all frames to find topslot (outer can be larger than inner). */
2976 for (s = 0; s < nslots; s++) {
2977 IRRef ref = snap_ref(map[s]);
2978 if (!irref_isk(ref)) {
2979 IRIns *ir = IR(ref);
2980 if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
2981 GCfunc *fn = ir_kfunc(IR(ir->op2));
2982 if (isluafunc(fn)) {
2983 BCReg fs = s + funcproto(fn)->framesize;
2984 if (fs > topslot) topslot = fs;
2985 if (s != 0) {
2986 newbase = s;
2987 if (secondbase == ~(BCReg)0) secondbase = s;
2993 as->topslot = topslot; /* Used in asm_head_side(). */
2995 if (as->T->link == TRACE_INTERP) {
2996 /* Setup fixed registers for exit to interpreter. */
2997 emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
2998 emit_loadi(as, RID_PC, (int32_t)map[nslots]);
2999 } else if (newbase) {
3000 /* Save modified BASE for linking to trace with higher start frame. */
3001 emit_setgl(as, RID_BASE, jit_base);
3004 emit_addptr(as, RID_BASE, 8*(int32_t)newbase);
3006 /* Clear stack slots of newly added frames. */
3007 if (nslots <= topslot) {
3008 if (nslots < topslot) {
3009 for (s = nslots; s <= topslot; s++) {
3010 emit_movtomro(as, RID_EAX, RID_BASE, 8*(int32_t)s-4);
3011 checkmclim(as);
3013 emit_loadi(as, RID_EAX, LJ_TNIL);
3014 } else {
3015 emit_movmroi(as, RID_BASE, 8*(int32_t)nslots-4, LJ_TNIL);
3019 /* Store the value of all modified slots to the Lua stack. */
3020 for (s = 0; s < nslots; s++) {
3021 int32_t ofs = 8*((int32_t)s-1);
3022 IRRef ref = snap_ref(map[s]);
3023 if (ref) {
3024 IRIns *ir = IR(ref);
3025 /* No need to restore readonly slots and unmodified non-parent slots. */
3026 if (ir->o == IR_SLOAD && ir->op1 == s &&
3027 (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT)
3028 continue;
3029 if (irt_isnum(ir->t)) {
3030 Reg src = ra_alloc1(as, ref, RSET_FPR);
3031 emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
3032 } else if (ir->o == IR_FRAME) {
3033 emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2))));
3034 if (s != 0) /* Do not overwrite link to previous frame. */
3035 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks));
3036 } else {
3037 lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
3038 if (!irref_isk(ref)) {
3039 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
3040 emit_movtomro(as, src, RID_BASE, ofs);
3041 } else if (!irt_ispri(ir->t)) {
3042 emit_movmroi(as, RID_BASE, ofs, ir->i);
3044 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
3046 } else if (s > secondbase) {
3047 emit_movmroi(as, RID_BASE, ofs+4, LJ_TNIL);
3049 checkmclim(as);
3051 lua_assert(map + nslots == flinks-1);
3054 /* Fixup the tail code. */
3055 static void asm_tail_fixup(ASMState *as, TraceNo lnk)
3057 /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
3058 MCode *p = as->mctop;
3059 MCode *target, *q;
3060 int32_t spadj = as->T->spadjust;
3061 if (spadj == 0) {
3062 p -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
3063 } else {
3064 MCode *p1;
3065 /* Patch stack adjustment. */
3066 if (checki8(spadj)) {
3067 p -= 3;
3068 p1 = p-6;
3069 *p1 = (MCode)spadj;
3070 } else {
3071 p1 = p-9;
3072 *(int32_t *)p1 = spadj;
3074 if ((as->flags & JIT_F_LEA_AGU)) {
3075 p1[-3] = (MCode)XI_LEA;
3076 p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
3077 p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
3078 } else {
3079 p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
3080 p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
3083 /* Patch exit branch. */
3084 target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp :
3085 as->J->trace[lnk]->mcode;
3086 *(int32_t *)(p-4) = (int32_t)(target - p);
3087 p[-5] = XI_JMP;
3088 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
3089 for (q = as->mctop-1; q >= p; q--)
3090 *q = XI_NOP;
3091 as->mctop = p;
3094 /* -- Instruction dispatch ------------------------------------------------ */
3096 /* Assemble a single instruction. */
3097 static void asm_ir(ASMState *as, IRIns *ir)
3099 switch ((IROp)ir->o) {
3100 /* Miscellaneous ops. */
3101 case IR_LOOP: asm_loop(as); break;
3102 case IR_NOP: break;
3103 case IR_PHI: asm_phi(as, ir); break;
3105 /* Guarded assertions. */
3106 case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
3107 case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break;
3108 case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break;
3109 case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break;
3110 case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
3111 case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
3112 case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
3113 case IR_ABC:
3114 case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
3116 case IR_FRAME:
3117 if (ir->op1 == ir->op2) break; /* No check needed for placeholder. */
3118 /* fallthrough */
3119 case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
3120 case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
3122 /* Bit ops. */
3123 case IR_BNOT: asm_bitnot(as, ir); break;
3124 case IR_BSWAP: asm_bitswap(as, ir); break;
3126 case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
3127 case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
3128 case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;
3130 case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
3131 case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
3132 case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
3133 case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
3134 case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;
3136 /* Arithmetic ops. */
3137 case IR_ADD: asm_add(as, ir); break;
3138 case IR_SUB:
3139 if (irt_isnum(ir->t))
3140 asm_fparith(as, ir, XO_SUBSD);
3141 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
3142 asm_intarith(as, ir, XOg_SUB);
3143 break;
3144 case IR_MUL: asm_fparith(as, ir, XO_MULSD); break;
3145 case IR_DIV: asm_fparith(as, ir, XO_DIVSD); break;
3147 case IR_NEG: asm_fparith(as, ir, XO_XORPS); break;
3148 case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;
3150 case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
3151 case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
3153 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
3154 asm_fpmath(as, ir);
3155 break;
3156 case IR_POWI: asm_powi(as, ir); break;
3158 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
3159 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
3160 case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
3162 /* Memory references. */
3163 case IR_AREF: asm_aref(as, ir); break;
3164 case IR_HREF: asm_href(as, ir); break;
3165 case IR_HREFK: asm_hrefk(as, ir); break;
3166 case IR_NEWREF: asm_newref(as, ir); break;
3167 case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
3168 case IR_FREF: asm_fref(as, ir); break;
3169 case IR_STRREF: asm_strref(as, ir); break;
3171 /* Loads and stores. */
3172 case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: asm_ahuload(as, ir); break;
3173 case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
3174 case IR_SLOAD: asm_sload(as, ir); break;
3176 case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
3177 case IR_FSTORE: asm_fstore(as, ir); break;
3179 /* Allocations. */
3180 case IR_SNEW: asm_snew(as, ir); break;
3181 case IR_TNEW: asm_tnew(as, ir); break;
3182 case IR_TDUP: asm_tdup(as, ir); break;
3184 /* Write barriers. */
3185 case IR_TBAR: asm_tbar(as, ir); break;
3186 case IR_OBAR: asm_obar(as, ir); break;
3188 /* Type conversions. */
3189 case IR_TONUM: asm_tonum(as, ir); break;
3190 case IR_TOINT:
3191 if (irt_isguard(ir->t))
3192 asm_tointg(as, ir, ra_alloc1(as, ir->op1, RSET_FPR));
3193 else
3194 asm_toint(as, ir);
3195 break;
3196 case IR_TOBIT: asm_tobit(as, ir); break;
3197 case IR_TOSTR: asm_tostr(as, ir); break;
3198 case IR_STRTO: asm_strto(as, ir); break;
3200 /* Calls. */
3201 case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
3202 case IR_CARG: break;
3204 default:
3205 setintV(&as->J->errinfo, ir->o);
3206 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
3207 break;
3211 /* Assemble a trace in linear backwards order. */
3212 static void asm_trace(ASMState *as)
3214 for (as->curins--; as->curins > as->stopins; as->curins--) {
3215 IRIns *ir = IR(as->curins);
3216 if (irt_isguard(ir->t))
3217 asm_snap_prep(as);
3218 else if (!ra_used(ir) && !irm_sideeff(lj_ir_mode[ir->o]) &&
3219 (as->flags & JIT_F_OPT_DCE))
3220 continue; /* Dead-code elimination can be soooo easy. */
3221 RA_DBG_REF();
3222 checkmclim(as);
3223 asm_ir(as, ir);
3227 /* -- Trace setup --------------------------------------------------------- */
3229 /* Clear reg/sp for all instructions and add register hints. */
3230 static void asm_setup_regsp(ASMState *as, Trace *T)
3232 IRRef i, nins;
3233 int inloop;
3235 ra_setup(as);
3237 /* Clear reg/sp for constants. */
3238 for (i = T->nk; i < REF_BIAS; i++)
3239 IR(i)->prev = REGSP_INIT;
3241 /* REF_BASE is used for implicit references to the BASE register. */
3242 IR(REF_BASE)->prev = REGSP_HINT(RID_BASE);
3244 nins = T->nins;
3245 if (IR(nins-1)->o == IR_RENAME) {
3246 do { nins--; } while (IR(nins-1)->o == IR_RENAME);
3247 T->nins = nins; /* Remove any renames left over from ASM restart. */
3249 as->snaprename = nins;
3250 as->snapref = nins;
3251 as->snapno = T->nsnap;
3253 as->stopins = REF_BASE;
3254 as->orignins = nins;
3255 as->curins = nins;
3257 inloop = 0;
3258 as->evenspill = SPS_FIRST;
3259 for (i = REF_FIRST; i < nins; i++) {
3260 IRIns *ir = IR(i);
3261 switch (ir->o) {
3262 case IR_LOOP:
3263 inloop = 1;
3264 break;
3265 /* Set hints for slot loads from a parent trace. */
3266 case IR_SLOAD:
3267 if ((ir->op2 & IRSLOAD_PARENT)) {
3268 RegSP rs = as->parentmap[ir->op1];
3269 lua_assert(regsp_used(rs));
3270 as->stopins = i;
3271 if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) {
3272 ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs));
3273 continue;
3276 break;
3277 case IR_FRAME:
3278 if (i == as->stopins+1 && ir->op1 == ir->op2)
3279 as->stopins++;
3280 break;
3281 case IR_CALLN: case IR_CALLL: case IR_CALLS: {
3282 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
3283 /* NYI: not fastcall-aware, but doesn't matter (yet). */
3284 if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */
3285 as->evenspill = (int32_t)CCI_NARGS(ci);
3286 #if LJ_64
3287 ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
3288 #else
3289 ir->prev = REGSP_HINT(RID_RET);
3290 #endif
3291 if (inloop)
3292 as->modset |= (ci->flags & CCI_NOFPRCLOBBER) ?
3293 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
3294 continue;
3296 /* C calls evict all scratch regs and return results in RID_RET. */
3297 case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TOSTR:
3298 case IR_NEWREF:
3299 ir->prev = REGSP_HINT(RID_RET);
3300 if (inloop)
3301 as->modset = RSET_SCRATCH;
3302 continue;
3303 case IR_STRTO: case IR_OBAR:
3304 if (inloop)
3305 as->modset = RSET_SCRATCH;
3306 break;
3307 case IR_POWI:
3308 ir->prev = REGSP_HINT(RID_XMM0);
3309 if (inloop)
3310 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
3311 continue;
3312 case IR_FPMATH:
3313 if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */
3314 ir->prev = REGSP_HINT(RID_XMM0);
3315 #if !LJ_64
3316 if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */
3317 as->evenspill = 4;
3318 #endif
3319 if (inloop)
3320 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
3321 continue;
3322 } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
3323 ir->prev = REGSP_HINT(RID_XMM0);
3324 if (inloop)
3325 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
3326 continue;
3328 break;
3329 /* Non-constant shift counts need to be in RID_ECX. */
3330 case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
3331 if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r))
3332 IR(ir->op2)->r = REGSP_HINT(RID_ECX);
3333 break;
3334 /* Do not propagate hints across type conversions. */
3335 case IR_TONUM: case IR_TOINT: case IR_TOBIT:
3336 break;
3337 default:
3338 /* Propagate hints across likely 'op reg, imm' or 'op reg'. */
3339 if (irref_isk(ir->op2) && !irref_isk(ir->op1)) {
3340 ir->prev = IR(ir->op1)->prev;
3341 continue;
3343 break;
3345 ir->prev = REGSP_INIT;
3347 if ((as->evenspill & 1))
3348 as->oddspill = as->evenspill++;
3349 else
3350 as->oddspill = 0;
3353 /* -- Assembler core ------------------------------------------------------ */
3355 /* Define this if you want to run LuaJIT with Valgrind. */
3356 #ifdef LUAJIT_USE_VALGRIND
3357 #include <valgrind/valgrind.h>
3358 #define VG_INVALIDATE(p, sz) VALGRIND_DISCARD_TRANSLATIONS(p, sz)
3359 #else
3360 #define VG_INVALIDATE(p, sz) ((void)0)
3361 #endif
3363 /* Assemble a trace. */
3364 void lj_asm_trace(jit_State *J, Trace *T)
3366 ASMState as_;
3367 ASMState *as = &as_;
3369 /* Setup initial state. Copy some fields to reduce indirections. */
3370 as->J = J;
3371 as->T = T;
3372 as->ir = T->ir;
3373 as->flags = J->flags;
3374 as->loopref = J->loopref;
3375 as->realign = NULL;
3376 as->loopinv = 0;
3377 if (J->parent) {
3378 as->parent = J->trace[J->parent];
3379 lj_snap_regspmap(as->parentmap, as->parent, J->exitno);
3380 } else {
3381 as->parent = NULL;
3383 as->mctop = lj_mcode_reserve(J, &as->mcbot); /* Reserve MCode memory. */
3384 as->mcp = as->mctop;
3385 as->mclim = as->mcbot + MCLIM_REDZONE;
3386 asm_exitstub_setup(as, T->nsnap);
3388 do {
3389 as->mcp = as->mctop;
3390 as->curins = T->nins;
3391 RA_DBG_START();
3392 RA_DBGX((as, "===== STOP ====="));
3393 /* Realign and leave room for backwards loop branch or exit branch. */
3394 if (as->realign) {
3395 int i = ((int)(intptr_t)as->realign) & 15;
3396 MCode *p = as->mctop;
3397 /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
3398 while (i-- > 0)
3399 *--p = XI_NOP;
3400 as->mctop = p;
3401 as->mcp = p - (as->loopinv ? 5 : 2); /* Space for short/near jmp. */
3402 } else {
3403 as->mcp = as->mctop - 5; /* Space for exit branch (near jmp). */
3405 as->invmcp = as->mcp;
3406 as->mcloop = NULL;
3407 as->testmcp = NULL;
3408 as->topslot = 0;
3409 as->gcsteps = 0;
3410 as->sectref = as->loopref;
3411 as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;
3413 /* Setup register allocation. */
3414 asm_setup_regsp(as, T);
3416 if (!as->loopref) {
3417 /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
3418 as->mcp -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
3419 as->invmcp = NULL;
3420 asm_tail_sync(as);
3422 asm_trace(as);
3423 } while (as->realign); /* Retry in case the MCode needs to be realigned. */
3425 RA_DBG_REF();
3426 checkmclim(as);
3427 if (as->gcsteps)
3428 asm_gc_check(as, &as->T->snap[0]);
3429 if (!J->parent)
3430 asm_head_base(as);
3431 asm_const_remat(as);
3432 if (J->parent)
3433 asm_head_side(as);
3434 else
3435 asm_head_root(as);
3436 asm_phi_fixup(as);
3438 RA_DBGX((as, "===== START ===="));
3439 RA_DBG_FLUSH();
3440 if (as->freeset != RSET_ALL)
3441 lj_trace_err(as->J, LJ_TRERR_BADRA); /* Ouch! Should never happen. */
3443 /* Set trace entry point before fixing up tail to allow link to self. */
3444 T->mcode = as->mcp;
3445 T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0;
3446 if (!as->loopref)
3447 asm_tail_fixup(as, T->link); /* Note: this may change as->mctop! */
3448 T->szmcode = (MSize)(as->mctop - as->mcp);
3449 VG_INVALIDATE(T->mcode, T->szmcode);
3452 /* Patch exit jumps of existing machine code to a new target. */
3453 void lj_asm_patchexit(jit_State *J, Trace *T, ExitNo exitno, MCode *target)
3455 MCode *p = T->mcode;
3456 MCode *mcarea = lj_mcode_patch(J, p, 0);
3457 MSize len = T->szmcode;
3458 MCode *px = exitstub_addr(J, exitno) - 6;
3459 MCode *pe = p+len-6;
3460 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
3461 *(int32_t *)(p+len-4) = (int32_t)(target - (p+len));
3462 for (; p < pe; p++) {
3463 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) {
3464 *(int32_t *)(p+2) = (int32_t)(target - (p+6));
3465 p += 5;
3468 lj_mcode_patch(J, mcarea, 1);
3469 VG_INVALIDATE(T->mcode, T->szmcode);
3472 #undef IR
3474 #endif