Add IR_XBAR, a barrier against XLOAD/XSTORE optimizations.
[luajit-2.0.git] / src/lj_asm.c
blob c349c990189a8872feb59e6e92f8a9dcf0b8b850
1 /*
2 ** IR assembler (SSA IR -> machine code).
3 ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
4 */
6 #define lj_asm_c
7 #define LUA_CORE
9 #include "lj_obj.h"
11 #if LJ_HASJIT
13 #include "lj_gc.h"
14 #include "lj_str.h"
15 #include "lj_tab.h"
16 #include "lj_frame.h"
17 #if LJ_HASFFI
18 #include "lj_ctype.h"
19 #endif
20 #include "lj_ir.h"
21 #include "lj_jit.h"
22 #include "lj_iropt.h"
23 #include "lj_mcode.h"
24 #include "lj_iropt.h"
25 #include "lj_trace.h"
26 #include "lj_snap.h"
27 #include "lj_asm.h"
28 #include "lj_dispatch.h"
29 #include "lj_vm.h"
30 #include "lj_target.h"
32 /* -- Assembler state and common macros ----------------------------------- */
34 /* Assembler state. */
35 typedef struct ASMState {
36 RegCost cost[RID_MAX]; /* Reference and blended allocation cost for regs. */
38 MCode *mcp; /* Current MCode pointer (grows down). */
39 MCode *mclim; /* Lower limit for MCode memory + red zone. */
41 IRIns *ir; /* Copy of pointer to IR instructions/constants. */
42 jit_State *J; /* JIT compiler state. */
44 x86ModRM mrm; /* Fused x86 address operand. */
46 RegSet freeset; /* Set of free registers. */
47 RegSet modset; /* Set of registers modified inside the loop. */
48 RegSet weakset; /* Set of weakly referenced registers. */
49 RegSet phiset; /* Set of PHI registers. */
51 uint32_t flags; /* Copy of JIT compiler flags. */
52 int loopinv; /* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */
54 int32_t evenspill; /* Next even spill slot. */
55 int32_t oddspill; /* Next odd spill slot (or 0). */
57 IRRef curins; /* Reference of current instruction. */
58 IRRef stopins; /* Stop assembly before hitting this instruction. */
59 IRRef orignins; /* Original T->nins. */
61 IRRef snapref; /* Current snapshot is active after this reference. */
62 IRRef snaprename; /* Rename highwater mark for snapshot check. */
63 SnapNo snapno; /* Current snapshot number. */
64 SnapNo loopsnapno; /* Loop snapshot number. */
66 IRRef fuseref; /* Fusion limit (loopref, 0 or FUSE_DISABLED). */
67 IRRef sectref; /* Section base reference (loopref or 0). */
68 IRRef loopref; /* Reference of LOOP instruction (or 0). */
70 BCReg topslot; /* Number of slots for stack check (unless 0). */
71 MSize gcsteps; /* Accumulated number of GC steps (per section). */
73 GCtrace *T; /* Trace to assemble. */
74 GCtrace *parent; /* Parent trace (or NULL). */
76 MCode *mcbot; /* Bottom of reserved MCode. */
77 MCode *mctop; /* Top of generated MCode. */
78 MCode *mcloop; /* Pointer to loop MCode (or NULL). */
79 MCode *invmcp; /* Points to invertible loop branch (or NULL). */
80 MCode *testmcp; /* Pending opportunity to remove test r,r. */
81 MCode *realign; /* Realign loop if not NULL. */
83 IRRef1 phireg[RID_MAX]; /* PHI register references. */
84 uint16_t parentmap[LJ_MAX_JSLOTS]; /* Parent slot to RegSP map. */
85 } ASMState;
87 #define IR(ref) (&as->ir[(ref)])
89 #define ASMREF_TMP1 REF_TRUE /* Temp. register. */
90 #define ASMREF_TMP2 REF_FALSE /* Temp. register. */
91 #define ASMREF_L REF_NIL /* Stores register for L. */
93 /* Check for variant to invariant references. */
94 #define iscrossref(as, ref) ((ref) < as->sectref)
96 /* Inhibit memory op fusion from variant to invariant references. */
97 #define FUSE_DISABLED (~(IRRef)0)
98 #define mayfuse(as, ref) ((ref) > as->fuseref)
99 #define neverfuse(as) (as->fuseref == FUSE_DISABLED)
100 #define opisfusableload(o) \
101 ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \
102 (o) == IR_FLOAD || (o) == IR_XLOAD || (o) == IR_SLOAD || (o) == IR_VLOAD)
104 /* Instruction selection for XMM moves. */
105 #define XMM_MOVRR(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS)
106 #define XMM_MOVRM(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD)
108 /* Sparse limit checks using a red zone before the actual limit. */
109 #define MCLIM_REDZONE 64
110 #define checkmclim(as) \
111 if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as)
113 static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as)
115 lj_mcode_limiterr(as->J, (size_t)(as->mctop - as->mcp + 4*MCLIM_REDZONE));
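/* Note: machine code is generated backwards, so as->mcp only moves down.
** Individual emit_* helpers write a few bytes below mcp without a bounds
** check of their own; instead checkmclim() is placed at coarser points and
** compares against mclim, which sits MCLIM_REDZONE bytes above the real
** bottom of the reserved area (see asm_exitstub_gen below). Dipping into
** the red zone is therefore tolerated briefly before asm_mclimit() aborts.
*/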
118 /* -- Emit x86 instructions ----------------------------------------------- */
120 #define MODRM(mode, r1, r2) ((MCode)((mode)+(((r1)&7)<<3)+((r2)&7)))
122 #if LJ_64
123 #define REXRB(p, rr, rb) \
124 { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
125 if (rex != 0x40) *--(p) = rex; }
126 #define FORCE_REX 0x200
127 #define REX_64 (FORCE_REX|0x080000)
128 #else
129 #define REXRB(p, rr, rb) ((void)0)
130 #define FORCE_REX 0
131 #define REX_64 0
132 #endif
134 #define emit_i8(as, i) (*--as->mcp = (MCode)(i))
135 #define emit_i32(as, i) (*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4)
136 #define emit_u32(as, u) (*(uint32_t *)(as->mcp-4) = (u), as->mcp -= 4)
138 #define emit_x87op(as, xo) \
139 (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2)
141 /* op */
142 static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
143 MCode *p, int delta)
145 int n = (int8_t)xo;
146 #if defined(__GNUC__)
147 if (__builtin_constant_p(xo) && n == -2)
148 p[delta-2] = (MCode)(xo >> 24);
149 else if (__builtin_constant_p(xo) && n == -3)
150 *(uint16_t *)(p+delta-3) = (uint16_t)(xo >> 16);
151 else
152 #endif
153 *(uint32_t *)(p+delta-5) = (uint32_t)xo;
154 p += n + delta;
155 #if LJ_64
157 uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1);
158 if (rex != 0x40) {
159 rex |= (rr >> 16);
160 if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); }
161 else if ((xo & 0xffffff) == 0x6600fd) { *p = (MCode)rex; rex = 0x66; }
162 *--p = (MCode)rex;
165 #else
166 UNUSED(rr); UNUSED(rb); UNUSED(rx);
167 #endif
168 return p;
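/* Note: the low byte of an x86Op holds the negative length of its opcode
** bytes, so n = (int8_t)xo (typically -2..-4) makes "p += n + delta" land
** on the first opcode byte after the unconditional 32 bit store above. Any
** extra bytes that store scribbles below the final instruction start are
** harmless: they are overwritten as further instructions are emitted
** downwards, or end up in already-reserved MCode space.
*/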
171 /* op + modrm */
172 #define emit_opm(xo, mode, rr, rb, p, delta) \
173 (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
174 emit_op((xo), (rr), (rb), 0, (p), (delta)))
176 /* op + modrm + sib */
177 #define emit_opmx(xo, mode, scale, rr, rb, rx, p) \
178 (p[-1] = MODRM((scale), (rx), (rb)), \
179 p[-2] = MODRM((mode), (rr), RID_ESP), \
180 emit_op((xo), (rr), (rb), (rx), (p), -1))
182 /* op r1, r2 */
183 static void emit_rr(ASMState *as, x86Op xo, Reg r1, Reg r2)
185 MCode *p = as->mcp;
186 as->mcp = emit_opm(xo, XM_REG, r1, r2, p, 0);
189 #if LJ_64 && defined(LUA_USE_ASSERT)
190 /* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */
191 static int32_t ptr2addr(const void *p)
193 lua_assert((uintptr_t)p < (uintptr_t)0x80000000);
194 return i32ptr(p);
196 #else
197 #define ptr2addr(p) (i32ptr((p)))
198 #endif
200 /* op r, [addr] */
201 static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr)
203 MCode *p = as->mcp;
204 *(int32_t *)(p-4) = ptr2addr(addr);
205 #if LJ_64
206 p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
207 as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5);
208 #else
209 as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4);
210 #endif
213 /* op r, [base+ofs] */
214 static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs)
216 MCode *p = as->mcp;
217 x86Mode mode;
218 if (ra_hasreg(rb)) {
219 if (ofs == 0 && (rb&7) != RID_EBP) {
220 mode = XM_OFS0;
221 } else if (checki8(ofs)) {
222 *--p = (MCode)ofs;
223 mode = XM_OFS8;
224 } else {
225 p -= 4;
226 *(int32_t *)p = ofs;
227 mode = XM_OFS32;
229 if ((rb&7) == RID_ESP)
230 *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
231 } else {
232 *(int32_t *)(p-4) = ofs;
233 #if LJ_64
234 p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
235 p -= 5;
236 rb = RID_ESP;
237 #else
238 p -= 4;
239 rb = RID_EBP;
240 #endif
241 mode = XM_OFS0;
243 as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
246 /* op r, [base+idx*scale+ofs] */
247 static void emit_rmrxo(ASMState *as, x86Op xo, Reg rr, Reg rb, Reg rx,
248 x86Mode scale, int32_t ofs)
250 MCode *p = as->mcp;
251 x86Mode mode;
252 if (ofs == 0 && (rb&7) != RID_EBP) {
253 mode = XM_OFS0;
254 } else if (checki8(ofs)) {
255 mode = XM_OFS8;
256 *--p = (MCode)ofs;
257 } else {
258 mode = XM_OFS32;
259 p -= 4;
260 *(int32_t *)p = ofs;
262 as->mcp = emit_opmx(xo, mode, scale, rr, rb, rx, p);
265 /* op r, i */
266 static void emit_gri(ASMState *as, x86Group xg, Reg rb, int32_t i)
268 MCode *p = as->mcp;
269 x86Op xo;
270 if (checki8(i)) {
271 *--p = (MCode)i;
272 xo = XG_TOXOi8(xg);
273 } else {
274 p -= 4;
275 *(int32_t *)p = i;
276 xo = XG_TOXOi(xg);
278 as->mcp = emit_opm(xo, XM_REG, (Reg)(xg & 7) | (rb & REX_64), rb, p, 0);
281 /* op [base+ofs], i */
282 static void emit_gmroi(ASMState *as, x86Group xg, Reg rb, int32_t ofs,
283 int32_t i)
285 x86Op xo;
286 if (checki8(i)) {
287 emit_i8(as, i);
288 xo = XG_TOXOi8(xg);
289 } else {
290 emit_i32(as, i);
291 xo = XG_TOXOi(xg);
293 emit_rmro(as, xo, (Reg)(xg & 7), rb, ofs);
296 #define emit_shifti(as, xg, r, i) \
297 (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r)))
299 /* op r, rm/mrm */
300 static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb)
302 MCode *p = as->mcp;
303 x86Mode mode = XM_REG;
304 if (rb == RID_MRM) {
305 rb = as->mrm.base;
306 if (rb == RID_NONE) {
307 rb = RID_EBP;
308 mode = XM_OFS0;
309 p -= 4;
310 *(int32_t *)p = as->mrm.ofs;
311 if (as->mrm.idx != RID_NONE)
312 goto mrmidx;
313 #if LJ_64
314 *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
315 rb = RID_ESP;
316 #endif
317 } else {
318 if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) {
319 mode = XM_OFS0;
320 } else if (checki8(as->mrm.ofs)) {
321 *--p = (MCode)as->mrm.ofs;
322 mode = XM_OFS8;
323 } else {
324 p -= 4;
325 *(int32_t *)p = as->mrm.ofs;
326 mode = XM_OFS32;
328 if (as->mrm.idx != RID_NONE) {
329 mrmidx:
330 as->mcp = emit_opmx(xo, mode, as->mrm.scale, rr, rb, as->mrm.idx, p);
331 return;
333 if ((rb&7) == RID_ESP)
334 *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
337 as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
340 static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
342 if (ofs) {
343 if ((as->flags & JIT_F_LEA_AGU))
344 emit_rmro(as, XO_LEA, r, r, ofs);
345 else
346 emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs);
350 /* op rm/mrm, i */
351 static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
353 x86Op xo;
354 if (checki8(i)) {
355 emit_i8(as, i);
356 xo = XG_TOXOi8(xg);
357 } else {
358 emit_i32(as, i);
359 xo = XG_TOXOi(xg);
361 emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
364 /* -- Emit moves ---------------------------------------------------------- */
366 /* mov [base+ofs], i */
367 static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
369 emit_i32(as, i);
370 emit_rmro(as, XO_MOVmi, 0, base, ofs);
373 /* mov [base+ofs], r */
374 #define emit_movtomro(as, r, base, ofs) \
375 emit_rmro(as, XO_MOVto, (r), (base), (ofs))
377 /* Get/set global_State fields. */
378 #define emit_opgl(as, xo, r, field) \
379 emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field)
380 #define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r), field)
381 #define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r), field)
382 #define emit_setgli(as, field, i) \
383 (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field))
385 /* mov r, i / xor r, r */
386 static void emit_loadi(ASMState *as, Reg r, int32_t i)
388 /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
389 if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
390 (as->curins+1 < as->T->nins &&
391 IR(as->curins+1)->o == IR_HIOP)))) {
392 emit_rr(as, XO_ARITH(XOg_XOR), r, r);
393 } else {
394 MCode *p = as->mcp;
395 *(int32_t *)(p-4) = i;
396 p[-5] = (MCode)(XI_MOVri+(r&7));
397 p -= 5;
398 REXRB(p, 0, r);
399 as->mcp = p;
403 /* mov r, addr */
404 #define emit_loada(as, r, addr) \
405 emit_loadi(as, (r), ptr2addr((addr)))
407 #if LJ_64
408 /* mov r, imm64 or shorter 32 bit extended load. */
409 static void emit_loadu64(ASMState *as, Reg r, uint64_t u64)
411 if (checku32(u64)) { /* 32 bit load clears upper 32 bits. */
412 emit_loadi(as, r, (int32_t)u64);
413 } else if (checki32((int64_t)u64)) { /* Sign-extended 32 bit load. */
414 MCode *p = as->mcp;
415 *(int32_t *)(p-4) = (int32_t)u64;
416 as->mcp = emit_opm(XO_MOVmi, XM_REG, REX_64, r, p, -4);
417 } else { /* Full-size 64 bit load. */
418 MCode *p = as->mcp;
419 *(uint64_t *)(p-8) = u64;
420 p[-9] = (MCode)(XI_MOVri+(r&7));
421 p[-10] = 0x48 + ((r>>3)&1);
422 p -= 10;
423 as->mcp = p;
426 #endif
428 /* movsd r, [&tv->n] / xorps r, r */
429 static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
431 if (tvispzero(tv)) /* Use xor only for +0. */
432 emit_rr(as, XO_XORPS, r, r);
433 else
434 emit_rma(as, XMM_MOVRM(as), r, &tv->n);
437 /* -- Emit branches ------------------------------------------------------- */
439 /* Label for short jumps. */
440 typedef MCode *MCLabel;
442 #if LJ_32 && LJ_HASFFI
443 /* jmp short target */
444 static void emit_sjmp(ASMState *as, MCLabel target)
446 MCode *p = as->mcp;
447 ptrdiff_t delta = target - p;
448 lua_assert(delta == (int8_t)delta);
449 p[-1] = (MCode)(int8_t)delta;
450 p[-2] = XI_JMPs;
451 as->mcp = p - 2;
453 #endif
455 /* jcc short target */
456 static void emit_sjcc(ASMState *as, int cc, MCLabel target)
458 MCode *p = as->mcp;
459 ptrdiff_t delta = target - p;
460 lua_assert(delta == (int8_t)delta);
461 p[-1] = (MCode)(int8_t)delta;
462 p[-2] = (MCode)(XI_JCCs+(cc&15));
463 as->mcp = p - 2;
466 /* jcc short (pending target) */
467 static MCLabel emit_sjcc_label(ASMState *as, int cc)
469 MCode *p = as->mcp;
470 p[-1] = 0;
471 p[-2] = (MCode)(XI_JCCs+(cc&15));
472 as->mcp = p - 2;
473 return p;
476 /* Fixup jcc short target. */
477 static void emit_sfixup(ASMState *as, MCLabel source)
479 source[-1] = (MCode)(as->mcp-source);
482 /* Return label pointing to current PC. */
483 #define emit_label(as) ((as)->mcp)
485 /* Compute relative 32 bit offset for jump and call instructions. */
486 static LJ_AINLINE int32_t jmprel(MCode *p, MCode *target)
488 ptrdiff_t delta = target - p;
489 lua_assert(delta == (int32_t)delta);
490 return (int32_t)delta;
493 /* jcc target */
494 static void emit_jcc(ASMState *as, int cc, MCode *target)
496 MCode *p = as->mcp;
497 *(int32_t *)(p-4) = jmprel(p, target);
498 p[-5] = (MCode)(XI_JCCn+(cc&15));
499 p[-6] = 0x0f;
500 as->mcp = p - 6;
503 /* call target */
504 static void emit_call_(ASMState *as, MCode *target)
506 MCode *p = as->mcp;
507 #if LJ_64
508 if (target-p != (int32_t)(target-p)) {
509 /* Assumes RID_RET is never an argument to calls and always clobbered. */
510 emit_rr(as, XO_GROUP5, XOg_CALL, RID_RET);
511 emit_loadu64(as, RID_RET, (uint64_t)target);
512 return;
514 #endif
515 *(int32_t *)(p-4) = jmprel(p, target);
516 p[-5] = XI_CALL;
517 as->mcp = p - 5;
520 #define emit_call(as, f) emit_call_(as, (MCode *)(void *)(f))
522 /* -- Register allocator debugging ---------------------------------------- */
524 /* #define LUAJIT_DEBUG_RA */
526 #ifdef LUAJIT_DEBUG_RA
528 #include <stdio.h>
529 #include <stdarg.h>
531 #define RIDNAME(name) #name,
532 static const char *const ra_regname[] = {
533 GPRDEF(RIDNAME)
534 FPRDEF(RIDNAME)
535 "mrm",
536 NULL
538 #undef RIDNAME
540 static char ra_dbg_buf[65536];
541 static char *ra_dbg_p;
542 static char *ra_dbg_merge;
543 static MCode *ra_dbg_mcp;
545 static void ra_dstart(void)
547 ra_dbg_p = ra_dbg_buf;
548 ra_dbg_merge = NULL;
549 ra_dbg_mcp = NULL;
552 static void ra_dflush(void)
554 fwrite(ra_dbg_buf, 1, (size_t)(ra_dbg_p-ra_dbg_buf), stdout);
555 ra_dstart();
558 static void ra_dprintf(ASMState *as, const char *fmt, ...)
560 char *p;
561 va_list argp;
562 va_start(argp, fmt);
563 p = ra_dbg_mcp == as->mcp ? ra_dbg_merge : ra_dbg_p;
564 ra_dbg_mcp = NULL;
565 p += sprintf(p, "%08x \e[36m%04d ", (uintptr_t)as->mcp, as->curins-REF_BIAS);
566 for (;;) {
567 const char *e = strchr(fmt, '$');
568 if (e == NULL) break;
569 memcpy(p, fmt, (size_t)(e-fmt));
570 p += e-fmt;
571 if (e[1] == 'r') {
572 Reg r = va_arg(argp, Reg) & RID_MASK;
573 if (r <= RID_MAX) {
574 const char *q;
575 for (q = ra_regname[r]; *q; q++)
576 *p++ = *q >= 'A' && *q <= 'Z' ? *q + 0x20 : *q;
577 } else {
578 *p++ = '?';
579 lua_assert(0);
581 } else if (e[1] == 'f' || e[1] == 'i') {
582 IRRef ref;
583 if (e[1] == 'f')
584 ref = va_arg(argp, IRRef);
585 else
586 ref = va_arg(argp, IRIns *) - as->ir;
587 if (ref >= REF_BIAS)
588 p += sprintf(p, "%04d", ref - REF_BIAS);
589 else
590 p += sprintf(p, "K%03d", REF_BIAS - ref);
591 } else if (e[1] == 's') {
592 uint32_t slot = va_arg(argp, uint32_t);
593 p += sprintf(p, "[esp+0x%x]", sps_scale(slot));
594 } else {
595 lua_assert(0);
597 fmt = e+2;
599 va_end(argp);
600 while (*fmt)
601 *p++ = *fmt++;
602 *p++ = '\e'; *p++ = '['; *p++ = 'm'; *p++ = '\n';
603 if (p > ra_dbg_buf+sizeof(ra_dbg_buf)-256) {
604 fwrite(ra_dbg_buf, 1, (size_t)(p-ra_dbg_buf), stdout);
605 p = ra_dbg_buf;
607 ra_dbg_p = p;
610 #define RA_DBG_START() ra_dstart()
611 #define RA_DBG_FLUSH() ra_dflush()
612 #define RA_DBG_REF() \
613 do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \
614 ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0)
615 #define RA_DBGX(x) ra_dprintf x
617 #else
618 #define RA_DBG_START() ((void)0)
619 #define RA_DBG_FLUSH() ((void)0)
620 #define RA_DBG_REF() ((void)0)
621 #define RA_DBGX(x) ((void)0)
622 #endif
624 /* -- Register allocator -------------------------------------------------- */
626 #define ra_free(as, r) rset_set(as->freeset, (r))
627 #define ra_modified(as, r) rset_set(as->modset, (r))
628 #define ra_weak(as, r) rset_set(as->weakset, (r))
629 #define ra_noweak(as, r) rset_clear(as->weakset, (r))
631 #define ra_used(ir) (ra_hasreg((ir)->r) || ra_hasspill((ir)->s))
633 /* Setup register allocator. */
634 static void ra_setup(ASMState *as)
636 /* Initially all regs (except the stack pointer) are free for use. */
637 as->freeset = RSET_ALL;
638 as->modset = RSET_EMPTY;
639 as->weakset = RSET_EMPTY;
640 as->phiset = RSET_EMPTY;
641 memset(as->phireg, 0, sizeof(as->phireg));
642 memset(as->cost, 0, sizeof(as->cost));
643 as->cost[RID_ESP] = REGCOST(~0u, 0u);
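/* A RegCost packs the owning IR reference together with a blended
** allocation cost (see the field comment in ASMState above); regcost_ref()
** recovers the reference when a register must be evicted or restored.
** Seeding RID_ESP with the maximum cost presumably just keeps the stack
** pointer from ever looking like a cheap eviction candidate.
*/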
646 /* Rematerialize constants. */
647 static Reg ra_rematk(ASMState *as, IRIns *ir)
649 Reg r = ir->r;
650 lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
651 ra_free(as, r);
652 ra_modified(as, r);
653 ir->r = RID_INIT; /* Do not keep any hint. */
654 RA_DBGX((as, "remat $i $r", ir, r));
655 if (ir->o == IR_KNUM) {
656 emit_loadn(as, r, ir_knum(ir));
657 } else if (ir->o == IR_BASE) {
658 ra_sethint(ir->r, RID_BASE); /* Restore BASE register hint. */
659 emit_getgl(as, r, jit_base);
660 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
661 lua_assert(irt_isnil(ir->t));
662 emit_getgl(as, r, jit_L);
663 #if LJ_64
664 } else if (ir->o == IR_KINT64) {
665 emit_loadu64(as, r, ir_kint64(ir)->u64);
666 #endif
667 } else {
668 lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
669 ir->o == IR_KPTR || ir->o == IR_KKPTR || ir->o == IR_KNULL);
670 emit_loadi(as, r, ir->i);
672 return r;
675 /* Force a spill. Allocate a new spill slot if needed. */
676 static int32_t ra_spill(ASMState *as, IRIns *ir)
678 int32_t slot = ir->s;
679 if (!ra_hasspill(slot)) {
680 if (irt_is64(ir->t)) {
681 slot = as->evenspill;
682 as->evenspill += 2;
683 } else if (as->oddspill) {
684 slot = as->oddspill;
685 as->oddspill = 0;
686 } else {
687 slot = as->evenspill;
688 as->oddspill = slot+1;
689 as->evenspill += 2;
691 if (as->evenspill > 256)
692 lj_trace_err(as->J, LJ_TRERR_SPILLOV);
693 ir->s = (uint8_t)slot;
695 return sps_scale(slot);
698 /* Release the temporarily allocated register in ASMREF_TMP1/ASMREF_TMP2. */
699 static Reg ra_releasetmp(ASMState *as, IRRef ref)
701 IRIns *ir = IR(ref);
702 Reg r = ir->r;
703 lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
704 ra_free(as, r);
705 ra_modified(as, r);
706 ir->r = RID_INIT;
707 return r;
710 /* Use 64 bit operations to handle 64 bit IR types. */
711 #if LJ_64
712 #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
713 #else
714 #define REX_64IR(ir, r) (r)
715 #endif
717 /* Generic move between two regs. */
718 static void ra_movrr(ASMState *as, IRIns *ir, Reg r1, Reg r2)
720 UNUSED(ir);
721 if (r1 < RID_MAX_GPR)
722 emit_rr(as, XO_MOV, REX_64IR(ir, r1), r2);
723 else
724 emit_rr(as, XMM_MOVRR(as), r1, r2);
727 /* Restore a register (marked as free). Rematerialize or force a spill. */
728 static Reg ra_restore(ASMState *as, IRRef ref)
730 IRIns *ir = IR(ref);
731 if (irref_isk(ref) || ref == REF_BASE) {
732 return ra_rematk(as, ir);
733 } else {
734 int32_t ofs = ra_spill(as, ir); /* Force a spill slot. */
735 Reg r = ir->r;
736 lua_assert(ra_hasreg(r));
737 ra_sethint(ir->r, r); /* Keep hint. */
738 ra_free(as, r);
739 if (!rset_test(as->weakset, r)) { /* Only restore non-weak references. */
740 ra_modified(as, r);
741 RA_DBGX((as, "restore $i $r", ir, r));
742 if (r < RID_MAX_GPR)
743 emit_rmro(as, XO_MOV, REX_64IR(ir, r), RID_ESP, ofs);
744 else
745 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
746 r, RID_ESP, ofs);
748 return r;
752 /* Save a register to a spill slot. */
753 static void ra_save(ASMState *as, IRIns *ir, Reg r)
755 RA_DBGX((as, "save $i $r", ir, r));
756 if (r < RID_MAX_GPR)
757 emit_rmro(as, XO_MOVto, REX_64IR(ir, r), RID_ESP, sps_scale(ir->s));
758 else
759 emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSDto : XO_MOVSSto,
760 r, RID_ESP, sps_scale(ir->s));
763 #define MINCOST(r) \
764 if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \
765 cost = as->cost[r]
767 /* Evict the register with the lowest cost, forcing a restore. */
768 static Reg ra_evict(ASMState *as, RegSet allow)
770 IRRef ref;
771 RegCost cost = ~(RegCost)0;
772 lua_assert(allow != RSET_EMPTY);
773 if (allow < RID2RSET(RID_MAX_GPR)) {
774 MINCOST(RID_EAX);MINCOST(RID_ECX);MINCOST(RID_EDX);MINCOST(RID_EBX);
775 MINCOST(RID_EBP);MINCOST(RID_ESI);MINCOST(RID_EDI);
776 #if LJ_64
777 MINCOST(RID_R8D);MINCOST(RID_R9D);MINCOST(RID_R10D);MINCOST(RID_R11D);
778 MINCOST(RID_R12D);MINCOST(RID_R13D);MINCOST(RID_R14D);MINCOST(RID_R15D);
779 #endif
780 } else {
781 MINCOST(RID_XMM0);MINCOST(RID_XMM1);MINCOST(RID_XMM2);MINCOST(RID_XMM3);
782 MINCOST(RID_XMM4);MINCOST(RID_XMM5);MINCOST(RID_XMM6);MINCOST(RID_XMM7);
783 #if LJ_64
784 MINCOST(RID_XMM8);MINCOST(RID_XMM9);MINCOST(RID_XMM10);MINCOST(RID_XMM11);
785 MINCOST(RID_XMM12);MINCOST(RID_XMM13);MINCOST(RID_XMM14);MINCOST(RID_XMM15);
786 #endif
788 ref = regcost_ref(cost);
789 lua_assert(ref >= as->T->nk && ref < as->T->nins);
790 /* Preferably pick any weak ref instead of a non-weak, non-const ref. */
791 if (!irref_isk(ref) && (as->weakset & allow)) {
792 IRIns *ir = IR(ref);
793 if (!rset_test(as->weakset, ir->r))
794 ref = regcost_ref(as->cost[rset_pickbot((as->weakset & allow))]);
796 return ra_restore(as, ref);
799 /* Pick any register (marked as free). Evict on-demand. */
800 static Reg ra_pick(ASMState *as, RegSet allow)
802 RegSet pick = as->freeset & allow;
803 if (!pick)
804 return ra_evict(as, allow);
805 else
806 return rset_picktop(pick);
809 /* Get a scratch register (marked as free). */
810 static Reg ra_scratch(ASMState *as, RegSet allow)
812 Reg r = ra_pick(as, allow);
813 ra_modified(as, r);
814 RA_DBGX((as, "scratch $r", r));
815 return r;
818 /* Evict all registers from a set (if not free). */
819 static void ra_evictset(ASMState *as, RegSet drop)
821 as->modset |= drop;
822 drop &= ~as->freeset;
823 while (drop) {
824 Reg r = rset_picktop(drop);
825 ra_restore(as, regcost_ref(as->cost[r]));
826 rset_clear(drop, r);
827 checkmclim(as);
831 /* Evict (rematerialize) all registers allocated to constants. */
832 static void ra_evictk(ASMState *as)
834 RegSet work = ~as->freeset & RSET_ALL;
835 while (work) {
836 Reg r = rset_pickbot(work);
837 IRRef ref = regcost_ref(as->cost[r]);
838 if (irref_isk(ref)) {
839 ra_rematk(as, IR(ref));
840 checkmclim(as);
842 rset_clear(work, r);
846 /* Allocate a register for ref from the allowed set of registers.
847 ** Note: this function assumes the ref does NOT have a register yet!
848 ** Picks an optimal register, sets the cost and marks the register as non-free.
850 static Reg ra_allocref(ASMState *as, IRRef ref, RegSet allow)
852 IRIns *ir = IR(ref);
853 RegSet pick = as->freeset & allow;
854 Reg r;
855 lua_assert(ra_noreg(ir->r));
856 if (pick) {
857 /* First check register hint from propagation or PHI. */
858 if (ra_hashint(ir->r)) {
859 r = ra_gethint(ir->r);
860 if (rset_test(pick, r)) /* Use hint register if possible. */
861 goto found;
862 /* Rematerialization is cheaper than missing a hint. */
863 if (rset_test(allow, r) && irref_isk(regcost_ref(as->cost[r]))) {
864 ra_rematk(as, IR(regcost_ref(as->cost[r])));
865 goto found;
867 RA_DBGX((as, "hintmiss $f $r", ref, r));
869 /* Invariants should preferably get unmodified registers. */
870 if (ref < as->loopref && !irt_isphi(ir->t)) {
871 if ((pick & ~as->modset))
872 pick &= ~as->modset;
873 r = rset_pickbot(pick); /* Reduce conflicts with inverse allocation. */
874 } else {
875 #if LJ_64
876 /* We've got plenty of regs, so get callee-save regs if possible. */
877 if ((pick & ~RSET_SCRATCH))
878 pick &= ~RSET_SCRATCH;
879 #endif
880 r = rset_picktop(pick);
882 } else {
883 r = ra_evict(as, allow);
885 found:
886 RA_DBGX((as, "alloc $f $r", ref, r));
887 ir->r = (uint8_t)r;
888 rset_clear(as->freeset, r);
889 ra_noweak(as, r);
890 as->cost[r] = REGCOST_REF_T(ref, irt_t(ir->t));
891 return r;
894 /* Allocate a register on-demand. */
895 static Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow)
897 Reg r = IR(ref)->r;
898 /* Note: allow is ignored if the register is already allocated. */
899 if (ra_noreg(r)) r = ra_allocref(as, ref, allow);
900 ra_noweak(as, r);
901 return r;
904 /* Rename register allocation and emit move. */
905 static void ra_rename(ASMState *as, Reg down, Reg up)
907 IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]);
908 IRIns *ir = IR(ref);
909 ir->r = (uint8_t)up;
910 as->cost[down] = 0;
911 lua_assert((down < RID_MAX_GPR) == (up < RID_MAX_GPR));
912 lua_assert(!rset_test(as->freeset, down) && rset_test(as->freeset, up));
913 ra_free(as, down); /* 'down' is free ... */
914 ra_modified(as, down);
915 rset_clear(as->freeset, up); /* ... and 'up' is now allocated. */
916 ra_noweak(as, up);
917 RA_DBGX((as, "rename $f $r $r", regcost_ref(as->cost[up]), down, up));
918 ra_movrr(as, ir, down, up); /* Backwards codegen needs inverse move. */
919 if (!ra_hasspill(IR(ref)->s)) { /* Add the rename to the IR. */
920 lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno);
921 ren = tref_ref(lj_ir_emit(as->J));
922 as->ir = as->T->ir; /* The IR may have been reallocated. */
923 IR(ren)->r = (uint8_t)down;
924 IR(ren)->s = SPS_NONE;
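/* The IR_RENAME emitted here records (op1 = ref, op2 = snapshot number) so
** the snapshot/exit machinery (see asm_snap_checkrename below) knows that
** earlier parts of the trace still hold the value of 'ref' in 'down'
** rather than in its final register. Refs that already have a spill slot
** skip this, since the slot stays the authoritative location for exits.
*/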
928 /* Pick a destination register (marked as free).
929 ** Caveat: allow is ignored if there's already a destination register.
930 ** Use ra_destreg() to get a specific register.
932 static Reg ra_dest(ASMState *as, IRIns *ir, RegSet allow)
934 Reg dest = ir->r;
935 if (ra_hasreg(dest)) {
936 ra_free(as, dest);
937 ra_modified(as, dest);
938 } else {
939 dest = ra_scratch(as, allow);
941 if (LJ_UNLIKELY(ra_hasspill(ir->s))) ra_save(as, ir, dest);
942 return dest;
945 /* Force a specific destination register (marked as free). */
946 static void ra_destreg(ASMState *as, IRIns *ir, Reg r)
948 Reg dest = ra_dest(as, ir, RID2RSET(r));
949 if (dest != r) {
950 ra_scratch(as, RID2RSET(r));
951 ra_movrr(as, ir, dest, r);
955 /* Propagate dest register to left reference. Emit moves as needed.
956 ** This is a required fixup step for all 2-operand machine instructions.
958 static void ra_left(ASMState *as, Reg dest, IRRef lref)
960 IRIns *ir = IR(lref);
961 Reg left = ir->r;
962 if (ra_noreg(left)) {
963 if (irref_isk(lref)) {
964 if (ir->o == IR_KNUM) {
965 cTValue *tv = ir_knum(ir);
966 /* FP remat needs a load except for +0. Still better than eviction. */
967 if (tvispzero(tv) || !(as->freeset & RSET_FPR)) {
968 emit_loadn(as, dest, tv);
969 return;
971 #if LJ_64
972 } else if (ir->o == IR_KINT64) {
973 emit_loadu64(as, dest, ir_kint64(ir)->u64);
974 return;
975 #endif
976 } else {
977 lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
978 ir->o == IR_KPTR || ir->o == IR_KKPTR || ir->o == IR_KNULL);
979 emit_loadi(as, dest, ir->i);
980 return;
983 if (!ra_hashint(left) && !iscrossref(as, lref))
984 ra_sethint(ir->r, dest); /* Propagate register hint. */
985 left = ra_allocref(as, lref, dest < RID_MAX_GPR ? RSET_GPR : RSET_FPR);
987 ra_noweak(as, left);
988 /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */
989 if (dest != left) {
990 /* Use register renaming if dest is the PHI reg. */
991 if (irt_isphi(ir->t) && as->phireg[dest] == lref) {
992 ra_modified(as, left);
993 ra_rename(as, left, dest);
994 } else {
995 ra_movrr(as, ir, dest, left);
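/* E.g. for y = a + b the "add y, b" has already been emitted (code grows
** downwards), so the "mov y, a" generated here ends up executing right
** before it, completing the 2-operand fixup described above.
*/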
1000 /* -- Exit stubs ---------------------------------------------------------- */
1002 /* Generate an exit stub group at the bottom of the reserved MCode memory. */
1003 static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
1005 ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
1006 MCode *mxp = as->mcbot;
1007 MCode *mxpstart = mxp;
1008 if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
1009 asm_mclimit(as);
1010 /* Push low byte of exitno for each exit stub. */
1011 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
1012 for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
1013 *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
1014 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
1016 /* Push the high byte of the exitno for each exit stub group. */
1017 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
1018 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
1019 *mxp++ = XI_MOVmi;
1020 *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
1021 *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
1022 *mxp++ = 2*sizeof(void *);
1023 *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
1024 /* Jump to exit handler which fills in the ExitState. */
1025 *mxp++ = XI_JMP; mxp += 4;
1026 *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler);
1027 /* Commit the code for this group (even if assembly fails later on). */
1028 lj_mcode_commitbot(as->J, mxp);
1029 as->mcbot = mxp;
1030 as->mclim = as->mcbot + MCLIM_REDZONE;
1031 return mxpstart;
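/* Resulting layout: each exit stub is a 2 byte "push lo(exitno)" paired
** with a 2 byte short jmp, giving a fixed 4 byte pitch per stub. After the
** low byte push, control reaches the common tail, which pushes the high
** byte of the exit number, stores DISPATCH into the original stack slot 0
** (now 2*sizeof(void *) below ESP because of the two pushes) and jumps to
** lj_vm_exit_handler.
*/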
1034 /* Setup all needed exit stubs. */
1035 static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
1037 ExitNo i;
1038 if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
1039 lj_trace_err(as->J, LJ_TRERR_SNAPOV);
1040 for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
1041 if (as->J->exitstubgroup[i] == NULL)
1042 as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
1045 /* -- Snapshot and guard handling ----------------------------------------- */
1047 /* Can we rematerialize a KNUM instead of forcing a spill? */
1048 static int asm_snap_canremat(ASMState *as)
1050 Reg r;
1051 for (r = RID_MIN_FPR; r < RID_MAX_FPR; r++)
1052 if (irref_isk(regcost_ref(as->cost[r])))
1053 return 1;
1054 return 0;
1057 /* Allocate registers or spill slots for refs escaping to a snapshot. */
1058 static void asm_snap_alloc(ASMState *as)
1060 SnapShot *snap = &as->T->snap[as->snapno];
1061 SnapEntry *map = &as->T->snapmap[snap->mapofs];
1062 MSize n, nent = snap->nent;
1063 for (n = 0; n < nent; n++) {
1064 IRRef ref = snap_ref(map[n]);
1065 if (!irref_isk(ref)) {
1066 IRIns *ir = IR(ref);
1067 if (!ra_used(ir)) {
1068 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
1069 /* Get a weak register if we have a free one or can rematerialize. */
1070 if ((as->freeset & allow) ||
1071 (allow == RSET_FPR && asm_snap_canremat(as))) {
1072 Reg r = ra_allocref(as, ref, allow); /* Allocate a register. */
1073 if (!irt_isphi(ir->t))
1074 ra_weak(as, r); /* But mark it as weakly referenced. */
1075 checkmclim(as);
1076 RA_DBGX((as, "snapreg $f $r", ref, ir->r));
1077 } else {
1078 ra_spill(as, ir); /* Otherwise force a spill slot. */
1079 RA_DBGX((as, "snapspill $f $s", ref, ir->s));
1086 /* All guards for a snapshot use the same exitno. This is currently the
1087 ** same as the snapshot number. Since the exact origin of the exit cannot
1088 ** be determined, all guards for the same snapshot must exit with the same
1089 ** RegSP mapping.
1090 ** A renamed ref which has been used in a prior guard for the same snapshot
1091 ** would cause an inconsistency. The easy way out is to force a spill slot.
1093 static int asm_snap_checkrename(ASMState *as, IRRef ren)
1095 SnapShot *snap = &as->T->snap[as->snapno];
1096 SnapEntry *map = &as->T->snapmap[snap->mapofs];
1097 MSize n, nent = snap->nent;
1098 for (n = 0; n < nent; n++) {
1099 IRRef ref = snap_ref(map[n]);
1100 if (ref == ren) {
1101 IRIns *ir = IR(ref);
1102 ra_spill(as, ir); /* Register renamed, so force a spill slot. */
1103 RA_DBGX((as, "snaprensp $f $s", ref, ir->s));
1104 return 1; /* Found. */
1107 return 0; /* Not found. */
1110 /* Prepare snapshot for next guard instruction. */
1111 static void asm_snap_prep(ASMState *as)
1113 if (as->curins < as->snapref) {
1114 do {
1115 lua_assert(as->snapno != 0);
1116 as->snapno--;
1117 as->snapref = as->T->snap[as->snapno].ref;
1118 } while (as->curins < as->snapref);
1119 asm_snap_alloc(as);
1120 as->snaprename = as->T->nins;
1121 } else {
1122 /* Process any renames above the highwater mark. */
1123 for (; as->snaprename < as->T->nins; as->snaprename++) {
1124 IRIns *ir = IR(as->snaprename);
1125 if (asm_snap_checkrename(as, ir->op1))
1126 ir->op2 = REF_BIAS-1; /* Kill rename. */
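/* Because instructions are assembled in reverse order, as->snapno counts
** down: once curins drops below the active snapshot's ref, the previous
** snapshot becomes current and its escaping refs get registers or spill
** slots up front via asm_snap_alloc(). Renames above the highwater mark
** are then checked against the now-current snapshot and killed if they
** would make the guards' RegSP mapping inconsistent (see comment above).
*/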
1131 /* Emit conditional branch to exit for guard.
1132 ** It's important to emit this *after* all registers have been allocated,
1133 ** because rematerializations may invalidate the flags.
1135 static void asm_guardcc(ASMState *as, int cc)
1137 MCode *target = exitstub_addr(as->J, as->snapno);
1138 MCode *p = as->mcp;
1139 if (LJ_UNLIKELY(p == as->invmcp)) {
1140 as->loopinv = 1;
1141 *(int32_t *)(p+1) = jmprel(p+5, target);
1142 target = p;
1143 cc ^= 1;
1144 if (as->realign) {
1145 emit_sjcc(as, cc, target);
1146 return;
1149 emit_jcc(as, cc, target);
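/* p == as->invmcp means this guard directly precedes the loop's backedge
** jmp. The branch is then inverted: the backedge jmp at p is re-pointed at
** the exit stub, and the inverted jcc emitted here is later patched to the
** loop start by the loop fixup. The hot path thus takes a single
** conditional branch back into the loop instead of an untaken jcc to the
** exit plus an unconditional jmp. as->loopinv records this for the fixup.
*/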
1152 /* -- Memory operand fusion ----------------------------------------------- */
1154 /* Arch-specific field offsets. */
1155 static const uint8_t field_ofs[IRFL__MAX+1] = {
1156 #define FLOFS(name, ofs) (uint8_t)(ofs),
1157 IRFLDEF(FLOFS)
1158 #undef FLOFS
1162 /* Limit linear search to this distance. Avoids O(n^2) behavior. */
1163 #define CONFLICT_SEARCH_LIM 31
1165 /* Check if a reference is a signed 32 bit constant. */
1166 static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
1168 if (irref_isk(ref)) {
1169 IRIns *ir = IR(ref);
1170 if (ir->o != IR_KINT64) {
1171 *k = ir->i;
1172 return 1;
1173 } else if (checki32((int64_t)ir_kint64(ir)->u64)) {
1174 *k = (int32_t)ir_kint64(ir)->u64;
1175 return 1;
1178 return 0;
1181 /* Check if there's no conflicting instruction between curins and ref.
1182 ** Also avoid fusing loads if there are multiple references.
1184 static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
1186 IRIns *ir = as->ir;
1187 IRRef i = as->curins;
1188 if (i > ref + CONFLICT_SEARCH_LIM)
1189 return 0; /* Give up, ref is too far away. */
1190 while (--i > ref) {
1191 if (ir[i].o == conflict)
1192 return 0; /* Conflict found. */
1193 else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
1194 return 0;
1196 return 1; /* Ok, no conflict. */
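/* E.g. asm_fuseload() below passes the matching store opcode as 'conflict'
** (IR_ALOAD + IRDELTA_L2S == IR_ASTORE): fusing a load into its user is
** only safe if no such store occurs between load and use, since the fused
** operand re-reads memory at the use site and could observe the newer
** value. With noload == 0, any other use of the load also blocks fusion,
** because the load result is then needed in a register anyway.
*/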
1199 /* Fuse array base into memory operand. */
1200 static IRRef asm_fuseabase(ASMState *as, IRRef ref)
1202 IRIns *irb = IR(ref);
1203 as->mrm.ofs = 0;
1204 if (irb->o == IR_FLOAD) {
1205 IRIns *ira = IR(irb->op1);
1206 lua_assert(irb->op2 == IRFL_TAB_ARRAY);
1207 /* We can avoid the FLOAD of t->array for colocated arrays. */
1208 if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
1209 noconflict(as, irb->op1, IR_NEWREF, 1)) {
1210 as->mrm.ofs = (int32_t)sizeof(GCtab); /* Ofs to colocated array. */
1211 return irb->op1; /* Table obj. */
1213 } else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
1214 /* Fuse base offset (vararg load). */
1215 as->mrm.ofs = IR(irb->op2)->i;
1216 return irb->op1;
1218 return ref; /* Otherwise use the given array base. */
1221 /* Fuse array reference into memory operand. */
1222 static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
1224 IRIns *irx;
1225 lua_assert(ir->o == IR_AREF);
1226 as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow);
1227 irx = IR(ir->op2);
1228 if (irref_isk(ir->op2)) {
1229 as->mrm.ofs += 8*irx->i;
1230 as->mrm.idx = RID_NONE;
1231 } else {
1232 rset_clear(allow, as->mrm.base);
1233 as->mrm.scale = XM_SCALE8;
1234 /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
1235 ** Doesn't help much without ABCelim, but reduces register pressure.
1237 if (!LJ_64 && /* Has bad effects with negative index on x64. */
1238 mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
1239 irx->o == IR_ADD && irref_isk(irx->op2)) {
1240 as->mrm.ofs += 8*IR(irx->op2)->i;
1241 as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
1242 } else {
1243 as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
1248 /* Fuse array/hash/upvalue reference into memory operand.
1249 ** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
1250 ** pass the final allow mask, excluding any GPRs used for other inputs.
1251 ** In particular: 2-operand GPR instructions need to call ra_dest() first!
1253 static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
1255 IRIns *ir = IR(ref);
1256 if (ra_noreg(ir->r)) {
1257 switch ((IROp)ir->o) {
1258 case IR_AREF:
1259 if (mayfuse(as, ref)) {
1260 asm_fusearef(as, ir, allow);
1261 return;
1263 break;
1264 case IR_HREFK:
1265 if (mayfuse(as, ref)) {
1266 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
1267 as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
1268 as->mrm.idx = RID_NONE;
1269 return;
1271 break;
1272 case IR_UREFC:
1273 if (irref_isk(ir->op1)) {
1274 GCfunc *fn = ir_kfunc(IR(ir->op1));
1275 GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
1276 as->mrm.ofs = ptr2addr(&uv->tv);
1277 as->mrm.base = as->mrm.idx = RID_NONE;
1278 return;
1280 break;
1281 default:
1282 lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO);
1283 break;
1286 as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
1287 as->mrm.ofs = 0;
1288 as->mrm.idx = RID_NONE;
1291 /* Fuse FLOAD/FREF reference into memory operand. */
1292 static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
1294 lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
1295 as->mrm.ofs = field_ofs[ir->op2];
1296 as->mrm.idx = RID_NONE;
1297 if (irref_isk(ir->op1)) {
1298 as->mrm.ofs += IR(ir->op1)->i;
1299 as->mrm.base = RID_NONE;
1300 } else {
1301 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
1305 /* Fuse string reference into memory operand. */
1306 static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
1308 IRIns *irr;
1309 lua_assert(ir->o == IR_STRREF);
1310 as->mrm.base = as->mrm.idx = RID_NONE;
1311 as->mrm.scale = XM_SCALE1;
1312 as->mrm.ofs = sizeof(GCstr);
1313 if (irref_isk(ir->op1)) {
1314 as->mrm.ofs += IR(ir->op1)->i;
1315 } else {
1316 Reg r = ra_alloc1(as, ir->op1, allow);
1317 rset_clear(allow, r);
1318 as->mrm.base = (uint8_t)r;
1320 irr = IR(ir->op2);
1321 if (irref_isk(ir->op2)) {
1322 as->mrm.ofs += irr->i;
1323 } else {
1324 Reg r;
1325 /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
1326 if (!LJ_64 && /* Has bad effects with negative index on x64. */
1327 mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
1328 as->mrm.ofs += IR(irr->op2)->i;
1329 r = ra_alloc1(as, irr->op1, allow);
1330 } else {
1331 r = ra_alloc1(as, ir->op2, allow);
1333 if (as->mrm.base == RID_NONE)
1334 as->mrm.base = (uint8_t)r;
1335 else
1336 as->mrm.idx = (uint8_t)r;
1340 static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
1342 IRIns *ir = IR(ref);
1343 as->mrm.idx = RID_NONE;
1344 if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
1345 as->mrm.ofs = ir->i;
1346 as->mrm.base = RID_NONE;
1347 } else if (ir->o == IR_STRREF) {
1348 asm_fusestrref(as, ir, allow);
1349 } else {
1350 as->mrm.ofs = 0;
1351 if (mayfuse(as, ref) && ir->o == IR_ADD && ra_noreg(ir->r)) {
1352 /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
1353 IRIns *irx;
1354 IRRef idx;
1355 Reg r;
1356 if (asm_isk32(as, ir->op2, &as->mrm.ofs)) { /* Recognize x+ofs. */
1357 ref = ir->op1;
1358 ir = IR(ref);
1359 if (!(ir->o == IR_ADD && mayfuse(as, ref) && ra_noreg(ir->r)))
1360 goto noadd;
1362 as->mrm.scale = XM_SCALE1;
1363 idx = ir->op1;
1364 ref = ir->op2;
1365 irx = IR(idx);
1366 if (!(irx->o == IR_BSHL || irx->o == IR_ADD)) { /* Try other operand. */
1367 idx = ir->op2;
1368 ref = ir->op1;
1369 irx = IR(idx);
1371 if (mayfuse(as, idx) && ra_noreg(irx->r)) {
1372 if (irx->o == IR_BSHL && irref_isk(irx->op2) && IR(irx->op2)->i <= 3) {
1373 /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
1374 idx = irx->op1;
1375 as->mrm.scale = (uint8_t)(IR(irx->op2)->i << 6);
1376 } else if (irx->o == IR_ADD && irx->op1 == irx->op2) {
1377 /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
1378 idx = irx->op1;
1379 as->mrm.scale = XM_SCALE2;
1382 r = ra_alloc1(as, idx, allow);
1383 rset_clear(allow, r);
1384 as->mrm.idx = (uint8_t)r;
1386 noadd:
1387 as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
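/* E.g. an XLOAD address of the form ((ptr + (idx << 3)) + 16) ends up as
** mrm.base = reg(ptr), mrm.idx = reg(idx), mrm.scale = XM_SCALE8 and
** mrm.ofs = 16, i.e. a single [base+idx*8+16] operand instead of separate
** shift and add instructions.
*/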
1391 /* Fuse load into memory operand. */
1392 static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
1394 IRIns *ir = IR(ref);
1395 if (ra_hasreg(ir->r)) {
1396 if (allow != RSET_EMPTY) { /* Fast path. */
1397 ra_noweak(as, ir->r);
1398 return ir->r;
1400 fusespill:
1401 /* Force a spill if only memory operands are allowed (asm_x87load). */
1402 as->mrm.base = RID_ESP;
1403 as->mrm.ofs = ra_spill(as, ir);
1404 as->mrm.idx = RID_NONE;
1405 return RID_MRM;
1407 if (ir->o == IR_KNUM) {
1408 RegSet avail = as->freeset & ~as->modset & RSET_FPR;
1409 lua_assert(allow != RSET_EMPTY);
1410 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */
1411 as->mrm.ofs = ptr2addr(ir_knum(ir));
1412 as->mrm.base = as->mrm.idx = RID_NONE;
1413 return RID_MRM;
1415 } else if (mayfuse(as, ref)) {
1416 RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
1417 if (ir->o == IR_SLOAD) {
1418 if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
1419 noconflict(as, ref, IR_RETF, 0)) {
1420 as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
1421 as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0);
1422 as->mrm.idx = RID_NONE;
1423 return RID_MRM;
1425 } else if (ir->o == IR_FLOAD) {
1426 /* Generic fusion is only ok for 32 bit operands (but see asm_comp). */
1427 if ((irt_isint(ir->t) || irt_isaddr(ir->t)) &&
1428 noconflict(as, ref, IR_FSTORE, 0)) {
1429 asm_fusefref(as, ir, xallow);
1430 return RID_MRM;
1432 } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
1433 if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) {
1434 asm_fuseahuref(as, ir->op1, xallow);
1435 return RID_MRM;
1437 } else if (ir->o == IR_XLOAD) {
1438 /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp).
1439 ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
1441 if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
1442 noconflict(as, ref, IR_XSTORE, 0)) {
1443 asm_fusexref(as, ir->op1, xallow);
1444 return RID_MRM;
1446 } else if (ir->o == IR_VLOAD) {
1447 asm_fuseahuref(as, ir->op1, xallow);
1448 return RID_MRM;
1451 if (!(as->freeset & allow) &&
1452 (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref)))
1453 goto fusespill;
1454 return ra_allocref(as, ref, allow);
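/* Callers must treat a return value of RID_MRM specially: the operand is
** not in a register at all but described by as->mrm, and only emit_mrm()
** knows how to encode it. Code that needs the value in an actual register
** has to use ra_alloc1() instead of asm_fuseload().
*/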
1457 /* -- Calls --------------------------------------------------------------- */
1459 /* Generate a call to a C function. */
1460 static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
1462 uint32_t n, nargs = CCI_NARGS(ci);
1463 int32_t ofs = STACKARG_OFS;
1464 uint32_t gprs = REGARG_GPRS;
1465 #if LJ_64
1466 Reg fpr = REGARG_FIRSTFPR;
1467 #endif
1468 lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL))); /* Avoid stack adj. */
1469 if ((void *)ci->func)
1470 emit_call(as, ci->func);
1471 for (n = 0; n < nargs; n++) { /* Setup args. */
1472 IRRef ref = args[n];
1473 IRIns *ir = IR(ref);
1474 Reg r;
1475 #if LJ_64 && LJ_ABI_WIN
1476 /* Windows/x64 argument registers are strictly positional. */
1477 r = irt_isfp(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
1478 fpr++; gprs >>= 5;
1479 #elif LJ_64
1480 /* POSIX/x64 argument registers are used in order of appearance. */
1481 if (irt_isfp(ir->t)) {
1482 r = fpr <= REGARG_LASTFPR ? fpr : 0; fpr++;
1483 } else {
1484 r = gprs & 31; gprs >>= 5;
1486 #else
1487 if (irt_isfp(ir->t) || !(ci->flags & CCI_FASTCALL)) {
1488 r = 0;
1489 } else {
1490 r = gprs & 31; gprs >>= 5;
1492 #endif
1493 if (r) { /* Argument is in a register. */
1494 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
1495 #if LJ_64
1496 if (ir->o == IR_KINT64)
1497 emit_loadu64(as, r, ir_kint64(ir)->u64);
1498 else
1499 #endif
1500 emit_loadi(as, r, ir->i);
1501 } else {
1502 lua_assert(rset_test(as->freeset, r)); /* Must have been evicted. */
1503 if (ra_hasreg(ir->r)) {
1504 ra_noweak(as, ir->r);
1505 ra_movrr(as, ir, r, ir->r);
1506 } else {
1507 ra_allocref(as, ref, RID2RSET(r));
1510 } else if (irt_isfp(ir->t)) { /* FP argument is on stack. */
1511 lua_assert(!(irt_isfloat(ir->t) && irref_isk(ref))); /* No float k. */
1512 if (LJ_32 && (ofs & 4) && irref_isk(ref)) {
1513 /* Split stores for unaligned FP consts. */
1514 emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
1515 emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
1516 } else {
1517 r = ra_alloc1(as, ref, RSET_FPR);
1518 emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSDto : XO_MOVSSto,
1519 r, RID_ESP, ofs);
1521 ofs += (LJ_32 && irt_isfloat(ir->t)) ? 4 : 8;
1522 } else { /* Non-FP argument is on stack. */
1523 if (LJ_32 && ref < ASMREF_TMP1) {
1524 emit_movmroi(as, RID_ESP, ofs, ir->i);
1525 } else {
1526 r = ra_alloc1(as, ref, RSET_GPR);
1527 emit_movtomro(as, REX_64IR(ir, r), RID_ESP, ofs);
1529 ofs += sizeof(intptr_t);
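/* Note the ordering: emit_call() appears first in this function, but since
** machine code is generated backwards it ends up after all the argument
** moves and stores in the final code, as required by the calling
** convention.
*/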
1534 /* Setup result reg/sp for call. Evict scratch regs. */
1535 static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
1537 RegSet drop = RSET_SCRATCH;
1538 if ((ci->flags & CCI_NOFPRCLOBBER))
1539 drop &= ~RSET_FPR;
1540 if (ra_hasreg(ir->r))
1541 rset_clear(drop, ir->r); /* Dest reg handled below. */
1542 ra_evictset(as, drop); /* Evictions must be performed first. */
1543 if (ra_used(ir)) {
1544 if (irt_isfp(ir->t)) {
1545 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1546 #if LJ_64
1547 if ((ci->flags & CCI_CASTU64)) {
1548 Reg dest = ir->r;
1549 if (ra_hasreg(dest)) {
1550 ra_free(as, dest);
1551 ra_modified(as, dest);
1552 emit_rr(as, XO_MOVD, dest|REX_64, RID_RET); /* Really MOVQ. */
1553 } else {
1554 emit_movtomro(as, RID_RET|REX_64, RID_ESP, ofs);
1556 } else {
1557 ra_destreg(as, ir, RID_FPRET);
1559 #else
1560 /* Number result is in x87 st0 for x86 calling convention. */
1561 Reg dest = ir->r;
1562 if (ra_hasreg(dest)) {
1563 ra_free(as, dest);
1564 ra_modified(as, dest);
1565 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
1566 dest, RID_ESP, ofs);
1568 if ((ci->flags & CCI_CASTU64)) {
1569 emit_movtomro(as, RID_RET, RID_ESP, ofs);
1570 emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
1571 } else {
1572 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
1573 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
1575 #endif
1576 } else {
1577 lua_assert(!irt_ispri(ir->t));
1578 ra_destreg(as, ir, RID_RET);
1583 /* Collect arguments from CALL* and CARG instructions. */
1584 static void asm_collectargs(ASMState *as, IRIns *ir,
1585 const CCallInfo *ci, IRRef *args)
1587 uint32_t n = CCI_NARGS(ci);
1588 lua_assert(n <= CCI_NARGS_MAX);
1589 if ((ci->flags & CCI_L)) { *args++ = ASMREF_L; n--; }
1590 while (n-- > 1) {
1591 ir = IR(ir->op1);
1592 lua_assert(ir->o == IR_CARG);
1593 args[n] = ir->op2;
1595 args[0] = ir->op1;
1596 lua_assert(IR(ir->op1)->o != IR_CARG);
1599 static void asm_call(ASMState *as, IRIns *ir)
1601 IRRef args[CCI_NARGS_MAX];
1602 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
1603 asm_collectargs(as, ir, ci, args);
1604 asm_setupresult(as, ir, ci);
1605 asm_gencall(as, ci, args);
1608 /* Reconstruct CCallInfo flags for CALLX*. */
1609 static uint32_t asm_callx_flags(ASMState *as, IRIns *ir)
1611 uint32_t nargs = 0;
1612 if (ir->op1 != REF_NIL) { /* Count number of arguments first. */
1613 IRIns *ira = IR(ir->op1);
1614 nargs++;
1615 while (ira->o == IR_CARG) { nargs++; ira = IR(ira->op1); }
1617 /* NYI: fastcall etc. */
1618 return (nargs | (ir->t.irt << CCI_OTSHIFT));
1621 static void asm_callx(ASMState *as, IRIns *ir)
1623 IRRef args[CCI_NARGS_MAX];
1624 CCallInfo ci;
1625 IRIns *irf;
1626 ci.flags = asm_callx_flags(as, ir);
1627 asm_collectargs(as, ir, &ci, args);
1628 asm_setupresult(as, ir, &ci);
1629 irf = IR(ir->op2);
1630 if (LJ_32 && irref_isk(ir->op2)) { /* Call to constant address on x86. */
1631 ci.func = (ASMFunction)(void *)(uintptr_t)(uint32_t)irf->i;
1632 } else {
1633 /* Prefer a non-argument register or RID_RET for indirect calls. */
1634 RegSet allow = (RSET_GPR & ~RSET_SCRATCH)|RID2RSET(RID_RET);
1635 Reg r = ra_alloc1(as, ir->op2, allow);
1636 emit_rr(as, XO_GROUP5, XOg_CALL, r);
1637 ci.func = (ASMFunction)(void *)0;
1639 asm_gencall(as, &ci, args);
1642 /* -- Returns ------------------------------------------------------------- */
1644 /* Return to lower frame. Guard that it goes to the right spot. */
1645 static void asm_retf(ASMState *as, IRIns *ir)
1647 Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
1648 void *pc = ir_kptr(IR(ir->op2));
1649 int32_t delta = 1+bc_a(*((const BCIns *)pc - 1));
1650 as->topslot -= (BCReg)delta;
1651 if ((int32_t)as->topslot < 0) as->topslot = 0;
1652 emit_setgl(as, base, jit_base);
1653 emit_addptr(as, base, -8*delta);
1654 asm_guardcc(as, CC_NE);
1655 emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
1658 /* -- Type conversions ---------------------------------------------------- */
1660 static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
1662 Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
1663 Reg dest = ra_dest(as, ir, RSET_GPR);
1664 asm_guardcc(as, CC_P);
1665 asm_guardcc(as, CC_NE);
1666 emit_rr(as, XO_UCOMISD, left, tmp);
1667 emit_rr(as, XO_CVTSI2SD, tmp, dest);
1668 if (!(as->flags & JIT_F_SPLIT_XMM))
1669 emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
1670 emit_rr(as, XO_CVTTSD2SI, dest, left);
1671 /* Can't fuse since left is needed twice. */
1674 static void asm_tobit(ASMState *as, IRIns *ir)
1676 Reg dest = ra_dest(as, ir, RSET_GPR);
1677 Reg tmp = ra_noreg(IR(ir->op1)->r) ?
1678 ra_alloc1(as, ir->op1, RSET_FPR) :
1679 ra_scratch(as, RSET_FPR);
1680 Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
1681 emit_rr(as, XO_MOVDto, tmp, dest);
1682 emit_mrm(as, XO_ADDSD, tmp, right);
1683 ra_left(as, tmp, ir->op1);
1686 static void asm_conv(ASMState *as, IRIns *ir)
1688 IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
1689 int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64));
1690 int stfp = (st == IRT_NUM || st == IRT_FLOAT);
1691 IRRef lref = ir->op1;
1692 lua_assert(irt_type(ir->t) != st);
1693 lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */
1694 if (irt_isfp(ir->t)) {
1695 Reg dest = ra_dest(as, ir, RSET_FPR);
1696 if (stfp) { /* FP to FP conversion. */
1697 Reg left = asm_fuseload(as, lref, RSET_FPR);
1698 emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
1699 if (left == dest) return; /* Avoid the XO_XORPS. */
1700 } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */
1701 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
1702 cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
1703 Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
1704 if (irt_isfloat(ir->t))
1705 emit_rr(as, XO_CVTSD2SS, dest, dest);
1706 emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
1707 emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
1708 emit_loadn(as, bias, k);
1709 emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
1710 return;
1711 } else { /* Integer to FP conversion. */
1712 Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
1713 ra_alloc1(as, lref, RSET_GPR) :
1714 asm_fuseload(as, lref, RSET_GPR);
1715 if (LJ_64 && st == IRT_U64) {
1716 MCLabel l_end = emit_label(as);
1717 const void *k = lj_ir_k64_find(as->J, U64x(43f00000,00000000));
1718 emit_rma(as, XO_ADDSD, dest, k); /* Add 2^64 to compensate. */
1719 emit_sjcc(as, CC_NS, l_end);
1720 emit_rr(as, XO_TEST, left|REX_64, left); /* Check if u64 >= 2^63. */
1722 emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
1723 dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
1725 if (!(as->flags & JIT_F_SPLIT_XMM))
1726 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
1727 } else if (stfp) { /* FP to integer conversion. */
1728 if (irt_isguard(ir->t)) {
1729 /* Checked conversions are only supported from number to int. */
1730 lua_assert(irt_isint(ir->t) && st == IRT_NUM);
1731 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
1732 } else {
1733 Reg dest = ra_dest(as, ir, RSET_GPR);
1734 x86Op op = st == IRT_NUM ?
1735 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
1736 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
1737 if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */
1738 /* u32 = (int32_t)(number - 2^31) + 2^31 */
1739 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1740 ra_scratch(as, RSET_FPR);
1741 emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
1742 emit_rr(as, op, dest, tmp);
1743 if (st == IRT_NUM)
1744 emit_rma(as, XO_ADDSD, tmp,
1745 lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
1746 else
1747 emit_rma(as, XO_ADDSS, tmp,
1748 lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
1749 ra_left(as, tmp, lref);
1750 } else if (LJ_64 && irt_isu64(ir->t)) {
1751 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1752 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1753 ra_scratch(as, RSET_FPR);
1754 MCLabel l_end = emit_label(as);
1755 emit_rr(as, op, dest|REX_64, tmp);
1756 if (st == IRT_NUM)
1757 emit_rma(as, XO_ADDSD, tmp,
1758 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1759 else
1760 emit_rma(as, XO_ADDSS, tmp,
1761 lj_ir_k64_find(as->J, U64x(00000000,df800000)));
1762 emit_sjcc(as, CC_NS, l_end);
1763 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */
1764 emit_rr(as, op, dest|REX_64, tmp);
1765 ra_left(as, tmp, lref);
1766 } else {
1767 Reg left = asm_fuseload(as, lref, RSET_FPR);
1768 if (LJ_64 && irt_isu32(ir->t))
1769 emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */
1770 emit_mrm(as, op,
1771 dest|((LJ_64 &&
1772 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
1773 left);
1776 } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
1777 Reg left, dest = ra_dest(as, ir, RSET_GPR);
1778 RegSet allow = RSET_GPR;
1779 x86Op op;
1780 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
1781 if (st == IRT_I8) {
1782 op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX;
1783 } else if (st == IRT_U8) {
1784 op = XO_MOVZXb; allow = RSET_GPR8; dest |= FORCE_REX;
1785 } else if (st == IRT_I16) {
1786 op = XO_MOVSXw;
1787 } else {
1788 op = XO_MOVZXw;
1790 left = asm_fuseload(as, lref, allow);
1791 /* Add extra MOV if source is already in wrong register. */
1792 if (!LJ_64 && left != RID_MRM && !rset_test(allow, left)) {
1793 Reg tmp = ra_scratch(as, allow);
1794 emit_rr(as, op, dest, tmp);
1795 emit_rr(as, XO_MOV, tmp, left);
1796 } else {
1797 emit_mrm(as, op, dest, left);
1799 } else { /* 32/64 bit integer conversions. */
1800 if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
1801 Reg dest = ra_dest(as, ir, RSET_GPR);
1802 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1803 } else if (irt_is64(ir->t)) {
1804 Reg dest = ra_dest(as, ir, RSET_GPR);
1805 if (st64 || !(ir->op2 & IRCONV_SEXT)) {
1806 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
1807 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1808 } else { /* 32 to 64 bit sign extension. */
1809 Reg left = asm_fuseload(as, lref, RSET_GPR);
1810 emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
1812 } else {
1813 Reg dest = ra_dest(as, ir, RSET_GPR);
1814 if (st64) {
1815 Reg left = asm_fuseload(as, lref, RSET_GPR);
1816 /* This is either a 32 bit reg/reg mov which zeroes the hiword
1817 ** or a load of the loword from a 64 bit address.
1819 emit_mrm(as, XO_MOV, dest, left);
1820 } else { /* 32/32 bit no-op (cast). */
1821 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
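/* About the magic constants used above: 0x4338000000000000 is the double
** 2^52+2^51, so merging a u32 into its low mantissa bits (MOVD + XORPS)
** and subtracting the bias yields the exact integer value, working around
** the lack of an unsigned 32 bit CVTSI2SD on x86. Likewise the 2^64 and
** -2^64 constants (0x43f0...,0xc3f0...) compensate for pushing an unsigned
** 64 bit value through a signed conversion around the 2^63 boundary.
*/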
1827 #if LJ_32 && LJ_HASFFI
1828 /* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
1830 /* 64 bit integer to FP conversion in 32 bit mode. */
1831 static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
1833 Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
1834 Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
1835 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1836 Reg dest = ir->r;
1837 if (ra_hasreg(dest)) {
1838 ra_free(as, dest);
1839 ra_modified(as, dest);
1840 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
1841 dest, RID_ESP, ofs);
1843 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
1844 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
1845 if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
1846 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
1847 MCLabel l_end = emit_label(as);
1848 emit_rma(as, XO_FADDq, XOg_FADDq,
1849 lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
1850 emit_sjcc(as, CC_NS, l_end);
1851 emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */
1852 } else {
1853 lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
1855 emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
1856 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
1857 emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
1858 emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
1861 /* FP to 64 bit integer conversion in 32 bit mode. */
1862 static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
1864 IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
1865 IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
1866 Reg lo, hi;
1867 lua_assert(st == IRT_NUM || st == IRT_FLOAT);
1868 lua_assert(dt == IRT_I64 || dt == IRT_U64);
1869 lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
1870 hi = ra_dest(as, ir, RSET_GPR);
1871 lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
1872 if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
1873 /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
1874 if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */
1875 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
1876 emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
1877 emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
1879 if (dt == IRT_U64) {
1880 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1881 MCLabel l_pop, l_end = emit_label(as);
1882 emit_x87op(as, XI_FPOP);
1883 l_pop = emit_label(as);
1884 emit_sjmp(as, l_end);
1885 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1886 if ((as->flags & JIT_F_SSE3))
1887 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1888 else
1889 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1890 emit_rma(as, XO_FADDq, XOg_FADDq,
1891 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1892 emit_sjcc(as, CC_NS, l_pop);
1893 emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */
1895 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1896 if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */
1897 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1898 } else { /* Otherwise set FPU rounding mode to truncate before the store. */
1899 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1900 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
1901 emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
1902 emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
1903 emit_loadi(as, lo, 0xc00);
1904 emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
1906 if (dt == IRT_U64)
1907 emit_x87op(as, XI_FDUP);
1908 emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
1909 st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
1910 asm_fuseload(as, ir->op1, RSET_EMPTY));
1912 #endif
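/* Rough C equivalent of the IRT_U64 handling in the two helpers above
** (illustration only, not part of the assembler):
**   double u64_to_num(uint64_t u)
**   {
**     double d = (double)(int64_t)u;
**     if ((int64_t)u < 0) d += 18446744073709551616.0;
**     return d;
**   }
** FILD always interprets the 64 bit pattern as signed, hence the test of the
** hiword sign and the conditional FADD of 2^64. The FP -> U64 direction works
** the other way round: if the truncating store overflows (hiword sign set),
** -2^64 is added and the value is converted again.
*/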
1914 static void asm_strto(ASMState *as, IRIns *ir)
1916 /* Force a spill slot for the destination register (if any). */
1917 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_tonum];
1918 IRRef args[2];
1919 RegSet drop = RSET_SCRATCH;
1920 if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
1921 rset_set(drop, ir->r); /* WIN64 doesn't spill all FPRs. */
1922 ra_evictset(as, drop);
1923 asm_guardcc(as, CC_E);
1924 emit_rr(as, XO_TEST, RID_RET, RID_RET); /* Test return status. */
1925 args[0] = ir->op1; /* GCstr *str */
1926 args[1] = ASMREF_TMP1; /* TValue *n */
1927 asm_gencall(as, ci, args);
1928 /* Store the result to the spill slot or temp slots. */
1929 emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
1930 RID_ESP, sps_scale(ir->s));
1933 static void asm_tostr(ASMState *as, IRIns *ir)
1935 IRIns *irl = IR(ir->op1);
1936 IRRef args[2];
1937 args[0] = ASMREF_L;
1938 as->gcsteps++;
1939 if (irt_isnum(irl->t)) {
1940 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
1941 args[1] = ASMREF_TMP1; /* const lua_Number * */
1942 asm_setupresult(as, ir, ci); /* GCstr * */
1943 asm_gencall(as, ci, args);
1944 emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
1945 RID_ESP, ra_spill(as, irl));
1946 } else {
1947 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
1948 args[1] = ir->op1; /* int32_t k */
1949 asm_setupresult(as, ir, ci); /* GCstr * */
1950 asm_gencall(as, ci, args);
1954 /* -- Memory references --------------------------------------------------- */
1956 static void asm_aref(ASMState *as, IRIns *ir)
1958 Reg dest = ra_dest(as, ir, RSET_GPR);
1959 asm_fusearef(as, ir, RSET_GPR);
1960 if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
1961 emit_mrm(as, XO_LEA, dest, RID_MRM);
1962 else if (as->mrm.base != dest)
1963 emit_rr(as, XO_MOV, dest, as->mrm.base);
1966 /* Must match with hash*() in lj_tab.c. */
1967 static uint32_t ir_khash(IRIns *ir)
1969 uint32_t lo, hi;
1970 if (irt_isstr(ir->t)) {
1971 return ir_kstr(ir)->hash;
1972 } else if (irt_isnum(ir->t)) {
1973 lo = ir_knum(ir)->u32.lo;
1974 hi = ir_knum(ir)->u32.hi << 1;
1975 } else if (irt_ispri(ir->t)) {
1976 lua_assert(!irt_isnil(ir->t));
1977 return irt_type(ir->t)-IRT_FALSE;
1978 } else {
1979 lua_assert(irt_isgcv(ir->t));
1980 lo = u32ptr(ir_kgc(ir));
1981 hi = lo + HASH_BIAS;
1983 return hashrot(lo, hi);
1986 /* Merge NE(HREF, niltv) check. */
1987 static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
1989 /* Assumes nothing else generates NE of HREF. */
1990 if ((ir[1].o == IR_NE || ir[1].o == IR_EQ) && ir[1].op1 == as->curins &&
1991 ra_hasreg(ir->r)) {
1992 MCode *p = as->mcp;
1993 p += (LJ_64 && *p != XI_ARITHi) ? 7+6 : 6+6;
1994 /* Ensure no loop branch inversion happened. */
1995 if (p[-6] == 0x0f && p[-5] == XI_JCCn+(CC_NE^(ir[1].o & 1))) {
1996 as->mcp = p; /* Kill cmp reg, imm32 + jz exit. */
1997 return p + *(int32_t *)(p-4); /* Return exit address. */
2000 return NULL;
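/* Sizes assumed by the pointer arithmetic above (standard x86 encodings):
**   cmp r32, imm32  ->  81 /7 id         = 6 bytes (7 with a REX prefix)
**   jz/jnz rel32    ->  0F 84/85 + rel32 = 6 bytes
** Hence the 6+6 vs. 7+6 adjustment, the 0x0f check at p[-6] and the 32 bit
** branch offset read from p-4 to recover the exit address.
*/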
2003 /* Inlined hash lookup. Specialized for key type and for const keys.
2004 ** The equivalent C code is:
2005 ** Node *n = hashkey(t, key);
2006 ** do {
2007 ** if (lj_obj_equal(&n->key, key)) return &n->val;
2008 ** } while ((n = nextnode(n)));
2009 ** return niltv(L);
2011 static void asm_href(ASMState *as, IRIns *ir)
2013 MCode *nilexit = merge_href_niltv(as, ir); /* Do this before any restores. */
2014 RegSet allow = RSET_GPR;
2015 Reg dest = ra_dest(as, ir, allow);
2016 Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
2017 Reg key = RID_NONE, tmp = RID_NONE;
2018 IRIns *irkey = IR(ir->op2);
2019 int isk = irref_isk(ir->op2);
2020 IRType1 kt = irkey->t;
2021 uint32_t khash;
2022 MCLabel l_end, l_loop, l_next;
2024 if (!isk) {
2025 rset_clear(allow, tab);
2026 key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
2027 if (!irt_isstr(kt))
2028 tmp = ra_scratch(as, rset_exclude(allow, key));
2031 /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
2032 l_end = emit_label(as);
2033 if (nilexit && ir[1].o == IR_NE) {
2034 emit_jcc(as, CC_E, nilexit); /* XI_JMP is not found by lj_asm_patchexit. */
2035 nilexit = NULL;
2036 } else {
2037 emit_loada(as, dest, niltvg(J2G(as->J)));
2040 /* Follow hash chain until the end. */
2041 l_loop = emit_sjcc_label(as, CC_NZ);
2042 emit_rr(as, XO_TEST, dest, dest);
2043 emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
2044 l_next = emit_label(as);
2046 /* Type and value comparison. */
2047 if (nilexit)
2048 emit_jcc(as, CC_E, nilexit);
2049 else
2050 emit_sjcc(as, CC_E, l_end);
2051 if (irt_isnum(kt)) {
2052 if (isk) {
2053 /* Assumes -0.0 is already canonicalized to +0.0. */
2054 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
2055 (int32_t)ir_knum(irkey)->u32.lo);
2056 emit_sjcc(as, CC_NE, l_next);
2057 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
2058 (int32_t)ir_knum(irkey)->u32.hi);
2059 } else {
2060 emit_sjcc(as, CC_P, l_next);
2061 emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
2062 emit_sjcc(as, CC_A, l_next);
2063 /* The type check avoids NaN penalties and complaints from Valgrind. */
2064 #if LJ_64
2065 emit_u32(as, LJ_TISNUM);
2066 emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
2067 #else
2068 emit_i8(as, LJ_TISNUM);
2069 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
2070 #endif
2072 #if LJ_64
2073 } else if (irt_islightud(kt)) {
2074 emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
2075 #endif
2076 } else {
2077 if (!irt_ispri(kt)) {
2078 lua_assert(irt_isaddr(kt));
2079 if (isk)
2080 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
2081 ptr2addr(ir_kgc(irkey)));
2082 else
2083 emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
2084 emit_sjcc(as, CC_NE, l_next);
2086 lua_assert(!irt_isnil(kt));
2087 emit_i8(as, irt_toitype(kt));
2088 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
2090 emit_sfixup(as, l_loop);
2091 checkmclim(as);
2093 /* Load main position relative to tab->node into dest. */
2094 khash = isk ? ir_khash(irkey) : 1;
2095 if (khash == 0) {
2096 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
2097 } else {
2098 emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
2099 if ((as->flags & JIT_F_PREFER_IMUL)) {
2100 emit_i8(as, sizeof(Node));
2101 emit_rr(as, XO_IMULi8, dest, dest);
2102 } else {
2103 emit_shifti(as, XOg_SHL, dest, 3);
2104 emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
2106 if (isk) {
2107 emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
2108 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
2109 } else if (irt_isstr(kt)) {
2110 emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
2111 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
2112 } else { /* Must match with hashrot() in lj_tab.c. */
2113 emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
2114 emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
2115 emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
2116 emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
2117 emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
2118 emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
2119 emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
2120 emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
2121 if (irt_isnum(kt)) {
2122 emit_rr(as, XO_ARITH(XOg_ADD), dest, dest);
2123 #if LJ_64
2124 emit_shifti(as, XOg_SHR|REX_64, dest, 32);
2125 emit_rr(as, XO_MOV, tmp, dest);
2126 emit_rr(as, XO_MOVDto, key|REX_64, dest);
2127 #else
2128 emit_rmro(as, XO_MOV, dest, RID_ESP, ra_spill(as, irkey)+4);
2129 emit_rr(as, XO_MOVDto, key, tmp);
2130 #endif
2131 } else {
2132 emit_rr(as, XO_MOV, tmp, key);
2133 emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
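/* Read in execution order (code is emitted backwards), the sequence above
** corresponds roughly to hashrot() in lj_tab.c:
**   lo ^= hi;  hi = rol(hi, HASH_ROT1);
**   lo -= hi;  hi = rol(hi, HASH_ROT2);
**   hi ^= lo;  hi -= rol(lo, HASH_ROT3);
**   n = (Node *)((char *)t->node + sizeof(Node)*(hi & t->hmask));
** with lo/hi derived from the key as in ir_khash above (both 32 bit halves
** for numbers, the address and the address + HASH_BIAS for GC objects).
*/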
2139 static void asm_hrefk(ASMState *as, IRIns *ir)
2141 IRIns *kslot = IR(ir->op2);
2142 IRIns *irkey = IR(kslot->op1);
2143 int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
2144 Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
2145 Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
2146 #if !LJ_64
2147 MCLabel l_exit;
2148 #endif
2149 lua_assert(ofs % sizeof(Node) == 0);
2150 if (ra_hasreg(dest)) {
2151 if (ofs != 0) {
2152 if (dest == node && !(as->flags & JIT_F_LEA_AGU))
2153 emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
2154 else
2155 emit_rmro(as, XO_LEA, dest, node, ofs);
2156 } else if (dest != node) {
2157 emit_rr(as, XO_MOV, dest, node);
2160 asm_guardcc(as, CC_NE);
2161 #if LJ_64
2162 if (!irt_ispri(irkey->t)) {
2163 Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node));
2164 emit_rmro(as, XO_CMP, key|REX_64, node,
2165 ofs + (int32_t)offsetof(Node, key.u64));
2166 lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t));
2167 /* Assumes -0.0 is already canonicalized to +0.0. */
2168 emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
2169 ((uint64_t)irt_toitype(irkey->t) << 32) |
2170 (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
2171 } else {
2172 lua_assert(!irt_isnil(irkey->t));
2173 emit_i8(as, irt_toitype(irkey->t));
2174 emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
2175 ofs + (int32_t)offsetof(Node, key.it));
2177 #else
2178 l_exit = emit_label(as);
2179 if (irt_isnum(irkey->t)) {
2180 /* Assumes -0.0 is already canonicalized to +0.0. */
2181 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
2182 ofs + (int32_t)offsetof(Node, key.u32.lo),
2183 (int32_t)ir_knum(irkey)->u32.lo);
2184 emit_sjcc(as, CC_NE, l_exit);
2185 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
2186 ofs + (int32_t)offsetof(Node, key.u32.hi),
2187 (int32_t)ir_knum(irkey)->u32.hi);
2188 } else {
2189 if (!irt_ispri(irkey->t)) {
2190 lua_assert(irt_isgcv(irkey->t));
2191 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
2192 ofs + (int32_t)offsetof(Node, key.gcr),
2193 ptr2addr(ir_kgc(irkey)));
2194 emit_sjcc(as, CC_NE, l_exit);
2196 lua_assert(!irt_isnil(irkey->t));
2197 emit_i8(as, irt_toitype(irkey->t));
2198 emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
2199 ofs + (int32_t)offsetof(Node, key.it));
2201 #endif
2204 static void asm_newref(ASMState *as, IRIns *ir)
2206 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
2207 IRRef args[3];
2208 IRIns *irkey;
2209 Reg tmp;
2210 args[0] = ASMREF_L; /* lua_State *L */
2211 args[1] = ir->op1; /* GCtab *t */
2212 args[2] = ASMREF_TMP1; /* cTValue *key */
2213 asm_setupresult(as, ir, ci); /* TValue * */
2214 asm_gencall(as, ci, args);
2215 tmp = ra_releasetmp(as, ASMREF_TMP1);
2216 irkey = IR(ir->op2);
2217 if (irt_isnum(irkey->t)) {
2218 /* For numbers use the constant itself or a spill slot as a TValue. */
2219 if (irref_isk(ir->op2))
2220 emit_loada(as, tmp, ir_knum(irkey));
2221 else
2222 emit_rmro(as, XO_LEA, tmp|REX_64, RID_ESP, ra_spill(as, irkey));
2223 } else {
2224 /* Otherwise use g->tmptv to hold the TValue. */
2225 if (!irref_isk(ir->op2)) {
2226 Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
2227 emit_movtomro(as, REX_64IR(irkey, src), tmp, 0);
2228 } else if (!irt_ispri(irkey->t)) {
2229 emit_movmroi(as, tmp, 0, irkey->i);
2231 if (!(LJ_64 && irt_islightud(irkey->t)))
2232 emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
2233 emit_loada(as, tmp, &J2G(as->J)->tmptv);
2237 static void asm_uref(ASMState *as, IRIns *ir)
2239 /* NYI: Check that UREFO is still open and not aliasing a slot. */
2240 Reg dest = ra_dest(as, ir, RSET_GPR);
2241 if (irref_isk(ir->op1)) {
2242 GCfunc *fn = ir_kfunc(IR(ir->op1));
2243 MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
2244 emit_rma(as, XO_MOV, dest, v);
2245 } else {
2246 Reg uv = ra_scratch(as, RSET_GPR);
2247 Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
2248 if (ir->o == IR_UREFC) {
2249 emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
2250 asm_guardcc(as, CC_NE);
2251 emit_i8(as, 1);
2252 emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
2253 } else {
2254 emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
2256 emit_rmro(as, XO_MOV, uv, func,
2257 (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
2261 static void asm_fref(ASMState *as, IRIns *ir)
2263 Reg dest = ra_dest(as, ir, RSET_GPR);
2264 asm_fusefref(as, ir, RSET_GPR);
2265 emit_mrm(as, XO_LEA, dest, RID_MRM);
2268 static void asm_strref(ASMState *as, IRIns *ir)
2270 Reg dest = ra_dest(as, ir, RSET_GPR);
2271 asm_fusestrref(as, ir, RSET_GPR);
2272 if (as->mrm.base == RID_NONE)
2273 emit_loadi(as, dest, as->mrm.ofs);
2274 else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
2275 emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
2276 else
2277 emit_mrm(as, XO_LEA, dest, RID_MRM);
2280 /* -- Loads and stores ---------------------------------------------------- */
2282 static void asm_fxload(ASMState *as, IRIns *ir)
2284 Reg dest = ra_dest(as, ir, irt_isnum(ir->t) ? RSET_FPR : RSET_GPR);
2285 x86Op xo;
2286 if (ir->o == IR_FLOAD)
2287 asm_fusefref(as, ir, RSET_GPR);
2288 else
2289 asm_fusexref(as, ir->op1, RSET_GPR);
2290 /* ir->op2 is ignored -- unaligned loads are ok on x86. */
2291 switch (irt_type(ir->t)) {
2292 case IRT_I8: xo = XO_MOVSXb; break;
2293 case IRT_U8: xo = XO_MOVZXb; break;
2294 case IRT_I16: xo = XO_MOVSXw; break;
2295 case IRT_U16: xo = XO_MOVZXw; break;
2296 case IRT_NUM: xo = XMM_MOVRM(as); break;
2297 case IRT_FLOAT: xo = XO_MOVSS; break;
2298 default:
2299 if (LJ_64 && irt_is64(ir->t))
2300 dest |= REX_64;
2301 else
2302 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
2303 xo = XO_MOV;
2304 break;
2306 emit_mrm(as, xo, dest, RID_MRM);
2309 static void asm_fxstore(ASMState *as, IRIns *ir)
2311 RegSet allow = RSET_GPR;
2312 Reg src = RID_NONE, osrc = RID_NONE;
2313 int32_t k = 0;
2314 /* The IRT_I16/IRT_U16 stores should never be simplified for constant
2315 ** values since mov word [mem], imm16 has a length-changing prefix.
2317 if (irt_isi16(ir->t) || irt_isu16(ir->t) || irt_isfp(ir->t) ||
2318 !asm_isk32(as, ir->op2, &k)) {
2319 RegSet allow8 = irt_isfp(ir->t) ? RSET_FPR :
2320 (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
2321 src = osrc = ra_alloc1(as, ir->op2, allow8);
2322 if (!LJ_64 && !rset_test(allow8, src)) { /* Already in wrong register. */
2323 rset_clear(allow, osrc);
2324 src = ra_scratch(as, allow8);
2326 rset_clear(allow, src);
2328 if (ir->o == IR_FSTORE)
2329 asm_fusefref(as, IR(ir->op1), allow);
2330 else
2331 asm_fusexref(as, ir->op1, allow);
2332 /* ir->op2 is ignored -- unaligned stores are ok on x86. */
2333 if (ra_hasreg(src)) {
2334 x86Op xo;
2335 switch (irt_type(ir->t)) {
2336 case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
2337 case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
2338 case IRT_NUM: xo = XO_MOVSDto; break;
2339 case IRT_FLOAT: xo = XO_MOVSSto; break;
2340 #if LJ_64
2341 case IRT_LIGHTUD: lua_assert(0); /* NYI: mask 64 bit lightuserdata. */
2342 #endif
2343 default:
2344 if (LJ_64 && irt_is64(ir->t))
2345 src |= REX_64;
2346 else
2347 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
2348 xo = XO_MOVto;
2349 break;
2351 emit_mrm(as, xo, src, RID_MRM);
2352 if (!LJ_64 && src != osrc) {
2353 ra_noweak(as, osrc);
2354 emit_rr(as, XO_MOV, src, osrc);
2356 } else {
2357 if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
2358 emit_i8(as, k);
2359 emit_mrm(as, XO_MOVmib, 0, RID_MRM);
2360 } else {
2361 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) ||
2362 irt_isaddr(ir->t));
2363 emit_i32(as, k);
2364 emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM);
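/* Background for the IRT_I16/IRT_U16 restriction above: the constant form
**   mov word [mem], imm16   encodes as   66 C7 /0 iw
** i.e. an operand-size prefix followed by an immediate whose size depends on
** that prefix. Such length-changing prefixes are slow to decode on common
** x86 cores, so the value is materialized in a register and stored with a
** plain mov word [mem], r16 instead.
*/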
2369 #if LJ_64
2370 static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
2372 if (ra_used(ir) || typecheck) {
2373 Reg dest = ra_dest(as, ir, RSET_GPR);
2374 if (typecheck) {
2375 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, dest));
2376 asm_guardcc(as, CC_NE);
2377 emit_i8(as, -2);
2378 emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
2379 emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
2380 emit_rr(as, XO_MOV, tmp|REX_64, dest);
2382 return dest;
2383 } else {
2384 return RID_NONE;
2387 #endif
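/* In other words: a 64 bit lightuserdata value must have its 17 topmost bits
** equal to the lightud tag pattern, i.e. (int64_t)v >> 47 == -2. The SAR by
** 47 plus the compare with -2 above checks exactly that; any other bit
** pattern takes the guard exit.
*/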
2389 static void asm_ahuvload(ASMState *as, IRIns *ir)
2391 lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t));
2392 #if LJ_64
2393 if (irt_islightud(ir->t)) {
2394 Reg dest = asm_load_lightud64(as, ir, 1);
2395 if (ra_hasreg(dest)) {
2396 asm_fuseahuref(as, ir->op1, RSET_GPR);
2397 emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
2399 return;
2400 } else
2401 #endif
2402 if (ra_used(ir)) {
2403 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
2404 Reg dest = ra_dest(as, ir, allow);
2405 asm_fuseahuref(as, ir->op1, RSET_GPR);
2406 emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
2407 } else {
2408 asm_fuseahuref(as, ir->op1, RSET_GPR);
2410 /* Always do the type check, even if the load result is unused. */
2411 as->mrm.ofs += 4;
2412 asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE);
2413 if (LJ_64 && irt_isnum(ir->t)) {
2414 emit_u32(as, LJ_TISNUM);
2415 emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
2416 } else {
2417 emit_i8(as, irt_toitype(ir->t));
2418 emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
2422 static void asm_ahustore(ASMState *as, IRIns *ir)
2424 if (irt_isnum(ir->t)) {
2425 Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
2426 asm_fuseahuref(as, ir->op1, RSET_GPR);
2427 emit_mrm(as, XO_MOVSDto, src, RID_MRM);
2428 #if LJ_64
2429 } else if (irt_islightud(ir->t)) {
2430 Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
2431 asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
2432 emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
2433 #endif
2434 } else {
2435 IRIns *irr = IR(ir->op2);
2436 RegSet allow = RSET_GPR;
2437 Reg src = RID_NONE;
2438 if (!irref_isk(ir->op2)) {
2439 src = ra_alloc1(as, ir->op2, allow);
2440 rset_clear(allow, src);
2442 asm_fuseahuref(as, ir->op1, allow);
2443 if (ra_hasreg(src)) {
2444 emit_mrm(as, XO_MOVto, src, RID_MRM);
2445 } else if (!irt_ispri(irr->t)) {
2446 lua_assert(irt_isaddr(ir->t));
2447 emit_i32(as, irr->i);
2448 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
2450 as->mrm.ofs += 4;
2451 emit_i32(as, (int32_t)irt_toitype(ir->t));
2452 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
2456 static void asm_sload(ASMState *as, IRIns *ir)
2458 int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
2459 IRType1 t = ir->t;
2460 Reg base;
2461 lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */
2462 lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
2463 lua_assert(!irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)));
2464 if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t)) {
2465 Reg left = ra_scratch(as, RSET_FPR);
2466 asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */
2467 base = ra_alloc1(as, REF_BASE, RSET_GPR);
2468 emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
2469 t.irt = IRT_NUM; /* Continue with a regular number type check. */
2470 #if LJ_64
2471 } else if (irt_islightud(t)) {
2472 Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
2473 if (ra_hasreg(dest)) {
2474 base = ra_alloc1(as, REF_BASE, RSET_GPR);
2475 emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
2477 return;
2478 #endif
2479 } else if (ra_used(ir)) {
2480 RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
2481 Reg dest = ra_dest(as, ir, allow);
2482 base = ra_alloc1(as, REF_BASE, RSET_GPR);
2483 lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
2484 if ((ir->op2 & IRSLOAD_CONVERT))
2485 emit_rmro(as, XO_CVTSD2SI, dest, base, ofs);
2486 else if (irt_isnum(t))
2487 emit_rmro(as, XMM_MOVRM(as), dest, base, ofs);
2488 else
2489 emit_rmro(as, XO_MOV, dest, base, ofs);
2490 } else {
2491 if (!(ir->op2 & IRSLOAD_TYPECHECK))
2492 return; /* No type check: avoid base alloc. */
2493 base = ra_alloc1(as, REF_BASE, RSET_GPR);
2495 if ((ir->op2 & IRSLOAD_TYPECHECK)) {
2496 /* Need type check, even if the load result is unused. */
2497 asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE);
2498 if (LJ_64 && irt_isnum(t)) {
2499 emit_u32(as, LJ_TISNUM);
2500 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
2501 } else {
2502 emit_i8(as, irt_toitype(t));
2503 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
2508 /* -- Allocations --------------------------------------------------------- */
2510 static void asm_snew(ASMState *as, IRIns *ir)
2512 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_new];
2513 IRRef args[3];
2514 args[0] = ASMREF_L; /* lua_State *L */
2515 args[1] = ir->op1; /* const char *str */
2516 args[2] = ir->op2; /* size_t len */
2517 as->gcsteps++;
2518 asm_setupresult(as, ir, ci); /* GCstr * */
2519 asm_gencall(as, ci, args);
2522 static void asm_tnew(ASMState *as, IRIns *ir)
2524 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_new1];
2525 IRRef args[2];
2526 args[0] = ASMREF_L; /* lua_State *L */
2527 args[1] = ASMREF_TMP1; /* uint32_t ahsize */
2528 as->gcsteps++;
2529 asm_setupresult(as, ir, ci); /* GCtab * */
2530 asm_gencall(as, ci, args);
2531 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1 | (ir->op2 << 24));
2534 static void asm_tdup(ASMState *as, IRIns *ir)
2536 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_dup];
2537 IRRef args[2];
2538 args[0] = ASMREF_L; /* lua_State *L */
2539 args[1] = ir->op1; /* const GCtab *kt */
2540 as->gcsteps++;
2541 asm_setupresult(as, ir, ci); /* GCtab * */
2542 asm_gencall(as, ci, args);
2545 #if LJ_HASFFI
2546 static void asm_cnew(ASMState *as, IRIns *ir)
2548 CTState *cts = ctype_ctsG(J2G(as->J));
2549 CTypeID typeid = (CTypeID)IR(ir->op1)->i;
2550 CTSize sz = (ir->o == IR_CNEWI || ir->op2 == REF_NIL) ?
2551 lj_ctype_size(cts, typeid) : (CTSize)IR(ir->op2)->i;
2552 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
2553 IRRef args[2];
2554 lua_assert(sz != CTSIZE_INVALID);
2556 args[0] = ASMREF_L; /* lua_State *L */
2557 args[1] = ASMREF_TMP1; /* MSize size */
2558 as->gcsteps++;
2559 asm_setupresult(as, ir, ci); /* GCcdata * */
2561 /* Initialize immutable cdata object. */
2562 if (ir->o == IR_CNEWI) {
2563 RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
2564 #if LJ_64
2565 Reg r64 = sz == 8 ? REX_64 : 0;
2566 if (irref_isk(ir->op2)) {
2567 IRIns *irk = IR(ir->op2);
2568 uint64_t k = irk->o == IR_KINT64 ? ir_k64(irk)->u64 :
2569 (uint64_t)(uint32_t)irk->i;
2570 if (sz == 4 || checki32((int64_t)k)) {
2571 emit_i32(as, (int32_t)k);
2572 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
2573 } else {
2574 emit_movtomro(as, RID_ECX + r64, RID_RET, sizeof(GCcdata));
2575 emit_loadu64(as, RID_ECX, k);
2577 } else {
2578 Reg r = ra_alloc1(as, ir->op2, allow);
2579 emit_movtomro(as, r + r64, RID_RET, sizeof(GCcdata));
2581 #else
2582 int32_t ofs = sizeof(GCcdata);
2583 if (LJ_HASFFI && sz == 8) {
2584 ofs += 4; ir++;
2585 lua_assert(ir->o == IR_HIOP);
2587 do {
2588 if (irref_isk(ir->op2)) {
2589 emit_movmroi(as, RID_RET, ofs, IR(ir->op2)->i);
2590 } else {
2591 Reg r = ra_alloc1(as, ir->op2, allow);
2592 emit_movtomro(as, r, RID_RET, ofs);
2593 rset_clear(allow, r);
2595 if (!LJ_HASFFI || ofs == sizeof(GCcdata)) break;
2596 ofs -= 4; ir--;
2597 } while (1);
2598 #endif
2599 lua_assert(sz == 4 || (sz == 8 && (LJ_64 || LJ_HASFFI)));
2602 /* Combine initialization of marked, gct and typeid. */
2603 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
2604 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
2605 (int32_t)((~LJ_TCDATA<<8)+(typeid<<16)));
2606 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
2607 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);
2609 asm_gencall(as, ci, args);
2610 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
2612 #else
2613 #define asm_cnew(as, ir) ((void)0)
2614 #endif
2616 /* -- Write barriers ------------------------------------------------------ */
2618 static void asm_tbar(ASMState *as, IRIns *ir)
2620 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
2621 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
2622 MCLabel l_end = emit_label(as);
2623 emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist));
2624 emit_setgl(as, tab, gc.grayagain);
2625 emit_getgl(as, tmp, gc.grayagain);
2626 emit_i8(as, ~LJ_GC_BLACK);
2627 emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
2628 emit_sjcc(as, CC_Z, l_end);
2629 emit_i8(as, LJ_GC_BLACK);
2630 emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
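/* Rough C equivalent of the barrier emitted above (a backward barrier):
** a black table that is being modified is turned gray again and linked onto
** the grayagain list, to be rescanned by the GC:
**   if (tab->marked & LJ_GC_BLACK) {
**     tab->marked &= (uint8_t)~LJ_GC_BLACK;
**     tab->gclist = g->gc.grayagain;
**     g->gc.grayagain = tab;
**   }
*/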
2633 static void asm_obar(ASMState *as, IRIns *ir)
2635 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
2636 IRRef args[2];
2637 MCLabel l_end;
2638 Reg obj;
2639 /* No need for other object barriers (yet). */
2640 lua_assert(IR(ir->op1)->o == IR_UREFC);
2641 ra_evictset(as, RSET_SCRATCH);
2642 l_end = emit_label(as);
2643 args[0] = ASMREF_TMP1; /* global_State *g */
2644 args[1] = ir->op1; /* TValue *tv */
2645 asm_gencall(as, ci, args);
2646 emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
2647 obj = IR(ir->op1)->r;
2648 emit_sjcc(as, CC_Z, l_end);
2649 emit_i8(as, LJ_GC_WHITES);
2650 if (irref_isk(ir->op2)) {
2651 GCobj *vp = ir_kgc(IR(ir->op2));
2652 emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
2653 } else {
2654 Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
2655 emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
2657 emit_sjcc(as, CC_Z, l_end);
2658 emit_i8(as, LJ_GC_BLACK);
2659 emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
2660 (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
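/* The two tests above gate the call: the barrier only fires when the closed
** upvalue is black and the stored value is still white, roughly
**   if ((uv->marked & LJ_GC_BLACK) && (v->gch.marked & LJ_GC_WHITES))
**     lj_gc_barrieruv(g, &uv->tv);
** otherwise both branches skip to l_end and no call is made.
*/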
2663 /* -- FP/int arithmetic and logic operations ------------------------------ */
2665 /* Load reference onto x87 stack. Force a spill to memory if needed. */
2666 static void asm_x87load(ASMState *as, IRRef ref)
2668 IRIns *ir = IR(ref);
2669 if (ir->o == IR_KNUM) {
2670 cTValue *tv = ir_knum(ir);
2671 if (tvispzero(tv)) /* Use fldz only for +0. */
2672 emit_x87op(as, XI_FLDZ);
2673 else if (tvispone(tv))
2674 emit_x87op(as, XI_FLD1);
2675 else
2676 emit_rma(as, XO_FLDq, XOg_FLDq, tv);
2677 } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && !ra_used(ir) &&
2678 !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
2679 IRIns *iri = IR(ir->op1);
2680 emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
2681 } else {
2682 emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
2686 /* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
2687 static int fpmjoin_pow(ASMState *as, IRIns *ir)
2689 IRIns *irp = IR(ir->op1);
2690 if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
2691 IRIns *irpp = IR(irp->op1);
2692 if (irpp == ir-2 && irpp->o == IR_FPMATH &&
2693 irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
2694 /* The modified regs must match with the *.dasc implementation. */
2695 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
2696 IRIns *irx;
2697 if (ra_hasreg(ir->r))
2698 rset_clear(drop, ir->r); /* Dest reg handled below. */
2699 ra_evictset(as, drop);
2700 ra_destreg(as, ir, RID_XMM0);
2701 emit_call(as, lj_vm_pow_sse);
2702 irx = IR(irpp->op1);
2703 if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
2704 irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
2705 ra_left(as, RID_XMM0, irpp->op1);
2706 ra_left(as, RID_XMM1, irp->op2);
2707 return 1;
2710 return 0;
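/* Identity used by the rejoin above: x^y == 2^(y*log2(x)), so the split
** LOG2 -> MUL -> EXP2 chain can be collapsed back into a single call to
** lj_vm_pow_sse with x in xmm0 and y in xmm1, provided the intermediate
** results are unused elsewhere (the !ra_used checks above).
*/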
2713 static void asm_fpmath(ASMState *as, IRIns *ir)
2715 IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER;
2716 if (fpm == IRFPM_SQRT) {
2717 Reg dest = ra_dest(as, ir, RSET_FPR);
2718 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
2719 emit_mrm(as, XO_SQRTSD, dest, left);
2720 } else if (fpm <= IRFPM_TRUNC) {
2721 if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */
2722 Reg dest = ra_dest(as, ir, RSET_FPR);
2723 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
2724 /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
2725 ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
2726 ** This is atrocious, but the alternatives are much worse.
2728 /* Round down/up/trunc == 1001/1010/1011. */
2729 emit_i8(as, 0x09 + fpm);
2730 emit_mrm(as, XO_ROUNDSD, dest, left);
2731 if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
2732 as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
2734 *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
2735 } else { /* Call helper functions for SSE2 variant. */
2736 /* The modified regs must match with the *.dasc implementation. */
2737 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
2738 if (ra_hasreg(ir->r))
2739 rset_clear(drop, ir->r); /* Dest reg handled below. */
2740 ra_evictset(as, drop);
2741 ra_destreg(as, ir, RID_XMM0);
2742 emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
2743 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
2744 ra_left(as, RID_XMM0, ir->op1);
2746 } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
2747 /* Rejoined to pow(). */
2748 } else { /* Handle x87 ops. */
2749 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
2750 Reg dest = ir->r;
2751 if (ra_hasreg(dest)) {
2752 ra_free(as, dest);
2753 ra_modified(as, dest);
2754 emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
2756 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
2757 switch (fpm) { /* st0 = lj_vm_*(st0) */
2758 case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
2759 case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
2760 case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
2761 case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
2762 case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
2763 case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
2764 /* Note: the use of fyl2xp1 would be pointless here. When computing
2765 ** log(1.0+eps) the precision is already lost after 1.0 is added.
2766 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
2768 emit_x87op(as, XI_FYL2X); break;
2769 case IRFPM_OTHER:
2770 switch (ir->o) {
2771 case IR_ATAN2:
2772 emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
2773 case IR_LDEXP:
2774 emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
2775 default: lua_assert(0); break;
2777 break;
2778 default: lua_assert(0); break;
2780 asm_x87load(as, ir->op1);
2781 switch (fpm) {
2782 case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
2783 case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
2784 case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
2785 case IRFPM_OTHER:
2786 if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
2787 break;
2788 default: break;
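/* The x87 log variants above all go through FYL2X (st1*log2(st0)), with the
** preloaded constant selecting the base:
**   log(x)   = ln(2)    * log2(x)   (FLDLN2)
**   log2(x)  = 1        * log2(x)   (FLD1)
**   log10(x) = log10(2) * log2(x)   (FLDLG2)
*/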
2793 static void asm_fppowi(ASMState *as, IRIns *ir)
2795 /* The modified regs must match with the *.dasc implementation. */
2796 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
2797 if (ra_hasreg(ir->r))
2798 rset_clear(drop, ir->r); /* Dest reg handled below. */
2799 ra_evictset(as, drop);
2800 ra_destreg(as, ir, RID_XMM0);
2801 emit_call(as, lj_vm_powi_sse);
2802 ra_left(as, RID_XMM0, ir->op1);
2803 ra_left(as, RID_EAX, ir->op2);
2806 #if LJ_64 && LJ_HASFFI
2807 static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
2809 const CCallInfo *ci = &lj_ir_callinfo[id];
2810 IRRef args[2];
2811 args[0] = ir->op1;
2812 args[1] = ir->op2;
2813 asm_setupresult(as, ir, ci);
2814 asm_gencall(as, ci, args);
2816 #endif
2818 /* Find out whether swapping operands might be beneficial. */
2819 static int swapops(ASMState *as, IRIns *ir)
2821 IRIns *irl = IR(ir->op1);
2822 IRIns *irr = IR(ir->op2);
2823 lua_assert(ra_noreg(irr->r));
2824 if (!irm_iscomm(lj_ir_mode[ir->o]))
2825 return 0; /* Can't swap non-commutative operations. */
2826 if (irref_isk(ir->op2))
2827 return 0; /* Don't swap constants to the left. */
2828 if (ra_hasreg(irl->r))
2829 return 1; /* Swap if left already has a register. */
2830 if (ra_samehint(ir->r, irr->r))
2831 return 1; /* Swap if dest and right have matching hints. */
2832 if (as->curins > as->loopref) { /* In variant part? */
2833 if (ir->op2 < as->loopref && !irt_isphi(irr->t))
2834 return 0; /* Keep invariants on the right. */
2835 if (ir->op1 < as->loopref && !irt_isphi(irl->t))
2836 return 1; /* Swap invariants to the right. */
2838 if (opisfusableload(irl->o))
2839 return 1; /* Swap fusable loads to the right. */
2840 return 0; /* Otherwise don't swap. */
2843 static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
2845 IRRef lref = ir->op1;
2846 IRRef rref = ir->op2;
2847 RegSet allow = RSET_FPR;
2848 Reg dest;
2849 Reg right = IR(rref)->r;
2850 if (ra_hasreg(right)) {
2851 rset_clear(allow, right);
2852 ra_noweak(as, right);
2854 dest = ra_dest(as, ir, allow);
2855 if (lref == rref) {
2856 right = dest;
2857 } else if (ra_noreg(right)) {
2858 if (swapops(as, ir)) {
2859 IRRef tmp = lref; lref = rref; rref = tmp;
2861 right = asm_fuseload(as, rref, rset_clear(allow, dest));
2863 emit_mrm(as, xo, dest, right);
2864 ra_left(as, dest, lref);
2867 static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
2869 IRRef lref = ir->op1;
2870 IRRef rref = ir->op2;
2871 RegSet allow = RSET_GPR;
2872 Reg dest, right;
2873 int32_t k = 0;
2874 if (as->testmcp == as->mcp) { /* Drop test r,r instruction. */
2875 as->testmcp = NULL;
2876 as->mcp += (LJ_64 && *as->mcp != XI_TEST) ? 3 : 2;
2878 right = IR(rref)->r;
2879 if (ra_hasreg(right)) {
2880 rset_clear(allow, right);
2881 ra_noweak(as, right);
2883 dest = ra_dest(as, ir, allow);
2884 if (lref == rref) {
2885 right = dest;
2886 } else if (ra_noreg(right) && !asm_isk32(as, rref, &k)) {
2887 if (swapops(as, ir)) {
2888 IRRef tmp = lref; lref = rref; rref = tmp;
2890 right = asm_fuseload(as, rref, rset_clear(allow, dest));
2892 if (irt_isguard(ir->t)) /* For IR_ADDOV etc. */
2893 asm_guardcc(as, CC_O);
2894 if (xa != XOg_X_IMUL) {
2895 if (ra_hasreg(right))
2896 emit_mrm(as, XO_ARITH(xa), REX_64IR(ir, dest), right);
2897 else
2898 emit_gri(as, XG_ARITHi(xa), REX_64IR(ir, dest), k);
2899 } else if (ra_hasreg(right)) { /* IMUL r, mrm. */
2900 emit_mrm(as, XO_IMUL, REX_64IR(ir, dest), right);
2901 } else { /* IMUL r, r, k. */
2902 /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */
2903 Reg left = asm_fuseload(as, lref, RSET_GPR);
2904 x86Op xo;
2905 if (checki8(k)) { emit_i8(as, k); xo = XO_IMULi8;
2906 } else { emit_i32(as, k); xo = XO_IMULi; }
2907 emit_rr(as, xo, REX_64IR(ir, dest), left);
2908 return;
2910 ra_left(as, dest, lref);
2913 /* LEA is really a 4-operand ADD with an independent destination register,
2914 ** up to two source registers and an immediate. One register can be scaled
2915 ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
2916 ** instructions.
2918 ** Currently only a few common cases are supported:
2919 ** - 3-operand ADD: y = a+b; y = a+k with a and b already allocated
2920 ** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b
2921 ** - Right ADD fusion: y = a+(b+k)
2922 ** The omitted variants have already been reduced by FOLD.
2924 ** There are more fusion opportunities, like gathering shifts or joining
2925 ** common references. But these are probably not worth the trouble, since
2926 ** array indexing is not decomposed and already makes use of all fields
2927 ** of the ModRM operand.
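/* Concretely, the supported cases map onto a single LEA:
**   y = a+b          ->  lea y, [a+b]
**   y = a+k          ->  lea y, [a+k]
**   y = (a+b)+k      ->  lea y, [a+b+k]     (left ADD fused)
**   y = (a+k)+b      ->  lea y, [a+b+k]
**   y = a+(b+k)      ->  lea y, [a+b+k]     (right ADD fused)
** leaving the destination register free to differ from both sources.
*/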
2929 static int asm_lea(ASMState *as, IRIns *ir)
2931 IRIns *irl = IR(ir->op1);
2932 IRIns *irr = IR(ir->op2);
2933 RegSet allow = RSET_GPR;
2934 Reg dest;
2935 as->mrm.base = as->mrm.idx = RID_NONE;
2936 as->mrm.scale = XM_SCALE1;
2937 as->mrm.ofs = 0;
2938 if (ra_hasreg(irl->r)) {
2939 rset_clear(allow, irl->r);
2940 ra_noweak(as, irl->r);
2941 as->mrm.base = irl->r;
2942 if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
2943 /* The PHI renaming logic does a better job in some cases. */
2944 if (ra_hasreg(ir->r) &&
2945 ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
2946 (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
2947 return 0;
2948 if (irref_isk(ir->op2)) {
2949 as->mrm.ofs = irr->i;
2950 } else {
2951 rset_clear(allow, irr->r);
2952 ra_noweak(as, irr->r);
2953 as->mrm.idx = irr->r;
2955 } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
2956 irref_isk(irr->op2)) {
2957 Reg idx = ra_alloc1(as, irr->op1, allow);
2958 rset_clear(allow, idx);
2959 as->mrm.idx = (uint8_t)idx;
2960 as->mrm.ofs = IR(irr->op2)->i;
2961 } else {
2962 return 0;
2964 } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
2965 (irref_isk(ir->op2) || irref_isk(irl->op2))) {
2966 Reg idx, base = ra_alloc1(as, irl->op1, allow);
2967 rset_clear(allow, base);
2968 as->mrm.base = (uint8_t)base;
2969 if (irref_isk(ir->op2)) {
2970 as->mrm.ofs = irr->i;
2971 idx = ra_alloc1(as, irl->op2, allow);
2972 } else {
2973 as->mrm.ofs = IR(irl->op2)->i;
2974 idx = ra_alloc1(as, ir->op2, allow);
2976 rset_clear(allow, idx);
2977 as->mrm.idx = (uint8_t)idx;
2978 } else {
2979 return 0;
2981 dest = ra_dest(as, ir, allow);
2982 emit_mrm(as, XO_LEA, dest, RID_MRM);
2983 return 1; /* Success. */
2986 static void asm_add(ASMState *as, IRIns *ir)
2988 if (irt_isnum(ir->t))
2989 asm_fparith(as, ir, XO_ADDSD);
2990 else if ((as->flags & JIT_F_LEA_AGU) || as->testmcp == as->mcp ||
2991 irt_is64(ir->t) || !asm_lea(as, ir))
2992 asm_intarith(as, ir, XOg_ADD);
2995 static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
2997 Reg dest = ra_dest(as, ir, RSET_GPR);
2998 emit_rr(as, XO_GROUP3, REX_64IR(ir, xg), dest);
2999 ra_left(as, dest, ir->op1);
3002 static void asm_bitswap(ASMState *as, IRIns *ir)
3004 Reg dest = ra_dest(as, ir, RSET_GPR);
3005 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
3006 REX_64IR(ir, dest), 0, 0, as->mcp, 1);
3007 ra_left(as, dest, ir->op1);
3010 static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
3012 IRRef rref = ir->op2;
3013 IRIns *irr = IR(rref);
3014 Reg dest;
3015 if (irref_isk(rref)) { /* Constant shifts. */
3016 int shift;
3017 dest = ra_dest(as, ir, RSET_GPR);
3018 shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
3019 switch (shift) {
3020 case 0: break;
3021 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
3022 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
3024 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */
3025 RegSet allow = rset_exclude(RSET_GPR, RID_ECX);
3026 Reg right = irr->r;
3027 if (ra_noreg(right)) {
3028 right = ra_allocref(as, rref, RID2RSET(RID_ECX));
3029 } else if (right != RID_ECX) {
3030 rset_clear(allow, right);
3031 ra_scratch(as, RID2RSET(RID_ECX));
3033 dest = ra_dest(as, ir, allow);
3034 emit_rr(as, XO_SHIFTcl, REX_64IR(ir, xs), dest);
3035 if (right != RID_ECX) {
3036 ra_noweak(as, right);
3037 emit_rr(as, XO_MOV, RID_ECX, right);
3040 ra_left(as, dest, ir->op1);
3042 ** Note: avoid using the flags resulting from a shift or rotate!
3043 ** All of them cause a partial flag stall, except for r,1 shifts
3044 ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
3048 /* -- Comparisons --------------------------------------------------------- */
3050 /* Virtual flags for unordered FP comparisons. */
3051 #define VCC_U 0x1000 /* Unordered. */
3052 #define VCC_P 0x2000 /* Needs extra CC_P branch. */
3053 #define VCC_S 0x4000 /* Swap avoids CC_P branch. */
3054 #define VCC_PS (VCC_P|VCC_S)
3056 /* Map of comparisons to flags. ORDER IR. */
3057 #define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf))
3058 static const uint16_t asm_compmap[IR_ABC+1] = {
3059 /* signed non-eq unsigned flags */
3060 /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS),
3061 /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0),
3062 /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS),
3063 /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0),
3064 /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U),
3065 /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS),
3066 /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U),
3067 /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS),
3068 /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
3069 /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P),
3070 /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */
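/* Example: asm_compmap[IR_LT] = COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS) packs
** CC_GE into bits 0-3 (signed integer compare), CC_AE into bits 4-7
** (unsigned integer and FP compare), CC_G into bits 8-11 (hiword compare of
** split 64 bit integers, equality handled separately) plus the VCC_* flags.
** The stored conditions are the negations of LT because the guard branches
** to the exit when the comparison fails.
*/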
3073 /* FP and integer comparisons. */
3074 static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
3076 if (irt_isnum(ir->t)) {
3077 IRRef lref = ir->op1;
3078 IRRef rref = ir->op2;
3079 Reg left, right;
3080 MCLabel l_around;
3082 ** An extra CC_P branch is required to preserve ordered/unordered
3083 ** semantics for FP comparisons. This can be avoided by swapping
3084 ** the operands and inverting the condition (except for EQ and UNE).
3085 ** So always try to swap if possible.
3087 ** Another option would be to swap operands to achieve better memory
3088 ** operand fusion. But it's unlikely that this outweighs the cost
3089 ** of the extra branches.
3091 if (cc & VCC_S) { /* Swap? */
3092 IRRef tmp = lref; lref = rref; rref = tmp;
3093 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */
3095 left = ra_alloc1(as, lref, RSET_FPR);
3096 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
3097 l_around = emit_label(as);
3098 asm_guardcc(as, cc >> 4);
3099 if (cc & VCC_P) { /* Extra CC_P branch required? */
3100 if (!(cc & VCC_U)) {
3101 asm_guardcc(as, CC_P); /* Branch to exit for ordered comparisons. */
3102 } else if (l_around != as->invmcp) {
3103 emit_sjcc(as, CC_P, l_around); /* Branch around for unordered. */
3104 } else {
3105 /* Patched to mcloop by asm_loop_fixup. */
3106 as->loopinv = 2;
3107 if (as->realign)
3108 emit_sjcc(as, CC_P, as->mcp);
3109 else
3110 emit_jcc(as, CC_P, as->mcp);
3113 emit_mrm(as, XO_UCOMISD, left, right);
3114 } else {
3115 IRRef lref = ir->op1, rref = ir->op2;
3116 IROp leftop = (IROp)(IR(lref)->o);
3117 Reg r64 = REX_64IR(ir, 0);
3118 int32_t imm = 0;
3119 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || irt_isaddr(ir->t));
3120 /* Swap constants (only for ABC) and fusable loads to the right. */
3121 if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
3122 if ((cc & 0xc) == 0xc) cc ^= 3; /* L <-> G, LE <-> GE */
3123 else if ((cc & 0xa) == 0x2) cc ^= 5; /* A <-> B, AE <-> BE */
3124 lref = ir->op2; rref = ir->op1;
3126 if (asm_isk32(as, rref, &imm)) {
3127 IRIns *irl = IR(lref);
3128 /* Check whether we can use test ins. Not for unsigned, since CF=0. */
3129 int usetest = (imm == 0 && (cc & 0xa) != 0x2);
3130 if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
3131 /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
3132 Reg right, left = RID_NONE;
3133 RegSet allow = RSET_GPR;
3134 if (!asm_isk32(as, irl->op2, &imm)) {
3135 left = ra_alloc1(as, irl->op2, allow);
3136 rset_clear(allow, left);
3137 } else { /* Try to Fuse IRT_I8/IRT_U8 loads, too. See below. */
3138 IRIns *irll = IR(irl->op1);
3139 if (opisfusableload((IROp)irll->o) &&
3140 (irt_isi8(irll->t) || irt_isu8(irll->t))) {
3141 IRType1 origt = irll->t; /* Temporarily flip types. */
3142 irll->t.irt = (irll->t.irt & ~IRT_TYPE) | IRT_INT;
3143 as->curins--; /* Skip to BAND to avoid failing in noconflict(). */
3144 right = asm_fuseload(as, irl->op1, RSET_GPR);
3145 as->curins++;
3146 irll->t = origt;
3147 if (right != RID_MRM) goto test_nofuse;
3148 /* Fusion succeeded, emit test byte mrm, imm8. */
3149 asm_guardcc(as, cc);
3150 emit_i8(as, (imm & 0xff));
3151 emit_mrm(as, XO_GROUP3b, XOg_TEST, RID_MRM);
3152 return;
3155 as->curins--; /* Skip to BAND to avoid failing in noconflict(). */
3156 right = asm_fuseload(as, irl->op1, allow);
3157 as->curins++; /* Undo the above. */
3158 test_nofuse:
3159 asm_guardcc(as, cc);
3160 if (ra_noreg(left)) {
3161 emit_i32(as, imm);
3162 emit_mrm(as, XO_GROUP3, r64 + XOg_TEST, right);
3163 } else {
3164 emit_mrm(as, XO_TEST, r64 + left, right);
3166 } else {
3167 Reg left;
3168 if (opisfusableload((IROp)irl->o) &&
3169 ((irt_isu8(irl->t) && checku8(imm)) ||
3170 ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
3171 (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
3172 /* Only the IRT_INT case is fused by asm_fuseload.
3173 ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
3174 ** are handled here.
3175 ** Note that cmp word [mem], imm16 should not be generated,
3176 ** since it has a length-changing prefix. Compares of a word
3177 ** against a sign-extended imm8 are ok, however.
3179 IRType1 origt = irl->t; /* Temporarily flip types. */
3180 irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
3181 left = asm_fuseload(as, lref, RSET_GPR);
3182 irl->t = origt;
3183 if (left == RID_MRM) { /* Fusion succeeded? */
3184 asm_guardcc(as, cc);
3185 emit_i8(as, imm);
3186 emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
3187 XO_ARITHib : XO_ARITHiw8, r64 + XOg_CMP, RID_MRM);
3188 return;
3189 } /* Otherwise handle register case as usual. */
3190 } else {
3191 left = asm_fuseload(as, lref, RSET_GPR);
3193 asm_guardcc(as, cc);
3194 if (usetest && left != RID_MRM) {
3195 /* Use test r,r instead of cmp r,0. */
3196 emit_rr(as, XO_TEST, r64 + left, left);
3197 if (irl+1 == ir) /* Referencing previous ins? */
3198 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */
3199 } else {
3200 emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
3203 } else {
3204 Reg left = ra_alloc1(as, lref, RSET_GPR);
3205 Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
3206 asm_guardcc(as, cc);
3207 emit_mrm(as, XO_CMP, r64 + left, right);
3212 #if LJ_32 && LJ_HASFFI
3213 /* 64 bit integer comparisons in 32 bit mode. */
3214 static void asm_comp_int64(ASMState *as, IRIns *ir)
3216 uint32_t cc = asm_compmap[(ir-1)->o];
3217 RegSet allow = RSET_GPR;
3218 Reg lefthi = RID_NONE, leftlo = RID_NONE;
3219 Reg righthi = RID_NONE, rightlo = RID_NONE;
3220 MCLabel l_around;
3221 x86ModRM mrm;
3223 as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */
3225 /* Allocate/fuse hiword operands. */
3226 if (irref_isk(ir->op2)) {
3227 lefthi = asm_fuseload(as, ir->op1, allow);
3228 } else {
3229 lefthi = ra_alloc1(as, ir->op1, allow);
3230 righthi = asm_fuseload(as, ir->op2, allow);
3231 if (righthi == RID_MRM) {
3232 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3233 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3234 } else {
3235 rset_clear(allow, righthi);
3238 mrm = as->mrm; /* Save state for hiword instruction. */
3240 /* Allocate/fuse loword operands. */
3241 if (irref_isk((ir-1)->op2)) {
3242 leftlo = asm_fuseload(as, (ir-1)->op1, allow);
3243 } else {
3244 leftlo = ra_alloc1(as, (ir-1)->op1, allow);
3245 rightlo = asm_fuseload(as, (ir-1)->op2, allow);
3246 if (rightlo == RID_MRM) {
3247 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3248 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3249 } else {
3250 rset_clear(allow, rightlo);
3254 /* All register allocations must be performed _before_ this point. */
3255 l_around = emit_label(as);
3256 as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */
3258 /* Loword comparison and branch. */
3259 asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */
3260 if (ra_noreg(rightlo)) {
3261 int32_t imm = IR((ir-1)->op2)->i;
3262 if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
3263 emit_rr(as, XO_TEST, leftlo, leftlo);
3264 else
3265 emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
3266 } else {
3267 emit_mrm(as, XO_CMP, leftlo, rightlo);
3270 /* Hiword comparison and branches. */
3271 if ((cc & 15) != CC_NE)
3272 emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */
3273 if ((cc & 15) != CC_E)
3274 asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */
3275 as->mrm = mrm; /* Restore state. */
3276 if (ra_noreg(righthi)) {
3277 int32_t imm = IR(ir->op2)->i;
3278 if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
3279 emit_rr(as, XO_TEST, lefthi, lefthi);
3280 else
3281 emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
3282 } else {
3283 emit_mrm(as, XO_CMP, lefthi, righthi);
3286 #endif
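/* The split compare above follows the usual double-word rule, e.g. for LT:
**   (ahi:alo) < (bhi:blo)  <=>  ahi < bhi || (ahi == bhi && alo <u blo)
** The hiword compare keeps the signedness of the original comparison but
** drops the equality part (cc >> 8), the loword compare is always unsigned
** (cc >> 4), and equal hiwords fall through to the loword check.
*/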
3288 /* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
3290 /* Hiword op of a split 64 bit op. Previous op must be the loword op. */
3291 static void asm_hiop(ASMState *as, IRIns *ir)
3293 #if LJ_32 && LJ_HASFFI
3294 /* HIOP is marked as a store because it needs its own DCE logic. */
3295 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
3296 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
3297 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
3298 if (usehi || uselo) {
3299 if (irt_isfp(ir->t))
3300 asm_conv_fp_int64(as, ir);
3301 else
3302 asm_conv_int64_fp(as, ir);
3304 as->curins--; /* Always skip the CONV. */
3305 return;
3306 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
3307 asm_comp_int64(as, ir);
3308 return;
3310 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
3311 switch ((ir-1)->o) {
3312 case IR_ADD:
3313 asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
3314 break;
3315 case IR_SUB:
3316 asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
3317 break;
3318 case IR_NEG: {
3319 Reg dest = ra_dest(as, ir, RSET_GPR);
3320 emit_rr(as, XO_GROUP3, XOg_NEG, dest);
3321 if (uselo) {
3322 emit_i8(as, 0);
3323 emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
3325 ra_left(as, dest, ir->op1);
3326 break;
3328 case IR_CALLN:
3329 ra_destreg(as, ir, RID_RETHI);
3330 if (!uselo)
3331 ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */
3332 break;
3333 case IR_CNEWI:
3334 /* Nothing to do here. Handled by CNEWI itself. */
3335 break;
3336 default: lua_assert(0); break;
3338 #else
3339 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */
3340 #endif
3343 /* -- Stack handling ------------------------------------------------------ */
3345 /* Get extent of the stack for a snapshot. */
3346 static BCReg asm_stack_extent(ASMState *as, SnapShot *snap, BCReg *ptopslot)
3348 SnapEntry *map = &as->T->snapmap[snap->mapofs];
3349 MSize n, nent = snap->nent;
3350 BCReg baseslot = 0, topslot = 0;
3351 /* Must check all frames to find topslot (outer can be larger than inner). */
3352 for (n = 0; n < nent; n++) {
3353 SnapEntry sn = map[n];
3354 if ((sn & SNAP_FRAME)) {
3355 IRIns *ir = IR(snap_ref(sn));
3356 GCfunc *fn = ir_kfunc(ir);
3357 if (isluafunc(fn)) {
3358 BCReg s = snap_slot(sn);
3359 BCReg fs = s + funcproto(fn)->framesize;
3360 if (fs > topslot) topslot = fs;
3361 baseslot = s;
3365 *ptopslot = topslot;
3366 return baseslot;
3369 /* Check Lua stack size for overflow. Use exit handler as fallback. */
3370 static void asm_stack_check(ASMState *as, BCReg topslot,
3371 Reg pbase, RegSet allow, ExitNo exitno)
3373 /* Try to get an unused temp. register, otherwise spill/restore eax. */
3374 Reg r = allow ? rset_pickbot(allow) : RID_EAX;
3375 emit_jcc(as, CC_B, exitstub_addr(as->J, exitno));
3376 if (allow == RSET_EMPTY) /* Restore temp. register. */
3377 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0);
3378 else
3379 ra_modified(as, r);
3380 emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*topslot));
3381 if (ra_hasreg(pbase) && pbase != r)
3382 emit_rr(as, XO_ARITH(XOg_SUB), r, pbase);
3383 else
3384 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
3385 ptr2addr(&J2G(as->J)->jit_base));
3386 emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack));
3387 emit_getgl(as, r, jit_L);
3388 if (allow == RSET_EMPTY) /* Spill temp. register. */
3389 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0);
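/* Read in execution order, the check above roughly amounts to:
**   if ((char *)L->maxstack - (char *)base < 8*topslot) goto exit_stub;
** i.e. take the exit if fewer than topslot free stack slots (8 bytes each)
** remain. base is either the trace's BASE register or G(L)->jit_base, and L
** itself is reloaded from the global state (jit_L).
*/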
3392 /* Restore Lua stack from on-trace state. */
3393 static void asm_stack_restore(ASMState *as, SnapShot *snap)
3395 SnapEntry *map = &as->T->snapmap[snap->mapofs];
3396 MSize n, nent = snap->nent;
3397 SnapEntry *flinks = map + nent + snap->depth;
3398 /* Store the value of all modified slots to the Lua stack. */
3399 for (n = 0; n < nent; n++) {
3400 SnapEntry sn = map[n];
3401 BCReg s = snap_slot(sn);
3402 int32_t ofs = 8*((int32_t)s-1);
3403 IRRef ref = snap_ref(sn);
3404 IRIns *ir = IR(ref);
3405 if ((sn & SNAP_NORESTORE))
3406 continue;
3407 if (irt_isnum(ir->t)) {
3408 Reg src = ra_alloc1(as, ref, RSET_FPR);
3409 emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
3410 } else {
3411 lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
3412 if (!irref_isk(ref)) {
3413 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
3414 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
3415 } else if (!irt_ispri(ir->t)) {
3416 emit_movmroi(as, RID_BASE, ofs, ir->i);
3418 if ((sn & (SNAP_CONT|SNAP_FRAME))) {
3419 if (s != 0) /* Do not overwrite link to previous frame. */
3420 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
3421 } else {
3422 if (!(LJ_64 && irt_islightud(ir->t)))
3423 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
3426 checkmclim(as);
3428 lua_assert(map + nent == flinks);
3431 /* -- GC handling --------------------------------------------------------- */
3433 /* Check GC threshold and do one or more GC steps. */
3434 static void asm_gc_check(ASMState *as)
3436 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
3437 IRRef args[2];
3438 MCLabel l_end;
3439 Reg tmp;
3440 ra_evictset(as, RSET_SCRATCH);
3441 l_end = emit_label(as);
3442 /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
3443 asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */
3444 emit_rr(as, XO_TEST, RID_RET, RID_RET);
3445 args[0] = ASMREF_TMP1; /* global_State *g */
3446 args[1] = ASMREF_TMP2; /* MSize steps */
3447 asm_gencall(as, ci, args);
3448 tmp = ra_releasetmp(as, ASMREF_TMP1);
3449 emit_loada(as, tmp, J2G(as->J));
3450 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), (int32_t)as->gcsteps);
3451 /* Jump around GC step if GC total < GC threshold. */
3452 emit_sjcc(as, CC_B, l_end);
3453 emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold);
3454 emit_getgl(as, tmp, gc.total);
3455 as->gcsteps = 0;
3456 checkmclim(as);
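/* Read in execution order, the code above roughly does:
**   if (g->gc.total >= g->gc.threshold) {
**     if (lj_gc_step_jit(g, steps))
**       goto exit;
**   }
** i.e. run the incremental GC for the accumulated number of steps and take
** the exit if the collector has reached the atomic or finalize phase.
*/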
3459 /* -- PHI and loop handling ----------------------------------------------- */
3461 /* Break a PHI cycle by renaming to a free register (evict if needed). */
3462 static void asm_phi_break(ASMState *as, RegSet blocked, RegSet blockedby,
3463 RegSet allow)
3465 RegSet candidates = blocked & allow;
3466 if (candidates) { /* If this register file has candidates. */
3467 /* Note: the set for ra_pick cannot be empty, since each register file
3468 ** has some registers never allocated to PHIs.
3470 Reg down, up = ra_pick(as, ~blocked & allow); /* Get a free register. */
3471 if (candidates & ~blockedby) /* Optimize shifts, else it's a cycle. */
3472 candidates = candidates & ~blockedby;
3473 down = rset_picktop(candidates); /* Pick candidate PHI register. */
3474 ra_rename(as, down, up); /* And rename it to the free register. */
3478 /* PHI register shuffling.
3480 ** The allocator tries hard to preserve PHI register assignments across
3481 ** the loop body. Most of the time this loop does nothing, since there
3482 ** are no register mismatches.
3484 ** If a register mismatch is detected and ...
3485 ** - the register is currently free: rename it.
3486 ** - the register is blocked by an invariant: restore/remat and rename it.
3487 ** - Otherwise the register is used by another PHI, so mark it as blocked.
3489 ** The renames are order-sensitive, so just retry the loop if a register
3490 ** is marked as blocked, but has been freed in the meantime. A cycle is
3491 ** detected if all of the blocked registers are allocated. To break the
3492 ** cycle rename one of them to a free register and retry.
3494 ** Note that PHI spill slots are kept in sync and don't need to be shuffled.
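** A worked example (sketch): if two PHIs ended up swapped, i.e. the left
** value for PHI register eax currently lives in ecx and the left value for
** ecx lives in eax, then both registers are blocked and block each other.
** Neither is free, so asm_phi_break renames one of them to some free
** register; the retry then finds the vacated register free and resolves the
** remaining mismatch with a plain rename.
*/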
3496 static void asm_phi_shuffle(ASMState *as)
3498 RegSet work;
3500 /* Find and resolve PHI register mismatches. */
3501 for (;;) {
3502 RegSet blocked = RSET_EMPTY;
3503 RegSet blockedby = RSET_EMPTY;
3504 RegSet phiset = as->phiset;
3505 while (phiset) { /* Check all left PHI operand registers. */
3506 Reg r = rset_picktop(phiset);
3507 IRIns *irl = IR(as->phireg[r]);
3508 Reg left = irl->r;
3509 if (r != left) { /* Mismatch? */
3510 if (!rset_test(as->freeset, r)) { /* PHI register blocked? */
3511 IRRef ref = regcost_ref(as->cost[r]);
3512 if (irt_ismarked(IR(ref)->t)) { /* Blocked by other PHI (w/reg)? */
3513 rset_set(blocked, r);
3514 if (ra_hasreg(left))
3515 rset_set(blockedby, left);
3516 left = RID_NONE;
3517 } else { /* Otherwise grab register from invariant. */
3518 ra_restore(as, ref);
3519 checkmclim(as);
3522 if (ra_hasreg(left)) {
3523 ra_rename(as, left, r);
3524 checkmclim(as);
3527 rset_clear(phiset, r);
3529 if (!blocked) break; /* Finished. */
3530 if (!(as->freeset & blocked)) { /* Break cycles if none are free. */
3531 asm_phi_break(as, blocked, blockedby, RSET_GPR);
3532 asm_phi_break(as, blocked, blockedby, RSET_FPR);
3533 checkmclim(as);
3534 } /* Else retry some more renames. */
3537 /* Restore/remat invariants whose registers are modified inside the loop. */
3538 work = as->modset & ~(as->freeset | as->phiset);
3539 while (work) {
3540 Reg r = rset_picktop(work);
3541 ra_restore(as, regcost_ref(as->cost[r]));
3542 rset_clear(work, r);
3543 checkmclim(as);
3546 /* Allocate and save all unsaved PHI regs and clear marks. */
3547 work = as->phiset;
3548 while (work) {
3549 Reg r = rset_picktop(work);
3550 IRRef lref = as->phireg[r];
3551 IRIns *ir = IR(lref);
3552 if (ra_hasspill(ir->s)) { /* Left PHI gained a spill slot? */
3553 irt_clearmark(ir->t); /* Handled here, so clear marker now. */
3554 ra_alloc1(as, lref, RID2RSET(r));
3555 ra_save(as, ir, r); /* Save to spill slot inside the loop. */
3556 checkmclim(as);
3558 rset_clear(work, r);
3562 /* Emit renames for left PHIs which are only spilled outside the loop. */
3563 static void asm_phi_fixup(ASMState *as)
3565 RegSet work = as->phiset;
3566 while (work) {
3567 Reg r = rset_picktop(work);
3568 IRRef lref = as->phireg[r];
3569 IRIns *ir = IR(lref);
3570 /* Left PHI gained a spill slot before the loop? */
3571 if (irt_ismarked(ir->t) && ra_hasspill(ir->s)) {
3572 IRRef ren;
3573 lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno);
3574 ren = tref_ref(lj_ir_emit(as->J));
3575 as->ir = as->T->ir; /* The IR may have been reallocated. */
3576 IR(ren)->r = (uint8_t)r;
3577 IR(ren)->s = SPS_NONE;
3579 irt_clearmark(ir->t); /* Always clear marker. */
3580 rset_clear(work, r);
3584 /* Setup right PHI reference. */
3585 static void asm_phi(ASMState *as, IRIns *ir)
3587 RegSet allow = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & ~as->phiset;
3588 RegSet afree = (as->freeset & allow);
3589 IRIns *irl = IR(ir->op1);
3590 IRIns *irr = IR(ir->op2);
3591 /* Spill slot shuffling is not implemented yet (but rarely needed). */
3592 if (ra_hasspill(irl->s) || ra_hasspill(irr->s))
3593 lj_trace_err(as->J, LJ_TRERR_NYIPHI);
3594 /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */
3595 if ((afree & (afree-1))) { /* Two or more free registers? */
3596 Reg r;
3597 if (ra_noreg(irr->r)) { /* Get a register for the right PHI. */
3598 r = ra_allocref(as, ir->op2, allow);
3599 } else { /* Duplicate right PHI, need a copy (rare). */
3600 r = ra_scratch(as, allow);
3601 ra_movrr(as, irr, r, irr->r);
3603 ir->r = (uint8_t)r;
3604 rset_set(as->phiset, r);
3605 as->phireg[r] = (IRRef1)ir->op1;
3606 irt_setmark(irl->t); /* Marks left PHIs _with_ register. */
3607 if (ra_noreg(irl->r))
3608 ra_sethint(irl->r, r); /* Set register hint for left PHI. */
3609 } else { /* Otherwise allocate a spill slot. */
3610 /* This is overly restrictive, but it triggers only on synthetic code. */
3611 if (ra_hasreg(irl->r) || ra_hasreg(irr->r))
3612 lj_trace_err(as->J, LJ_TRERR_NYIPHI);
3613 ra_spill(as, ir);
3614 irl->s = irr->s = ir->s; /* Sync left/right PHI spill slots. */
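/* The (afree & (afree-1)) test above uses the fact that x & (x-1) clears
** the lowest set bit: the result is non-zero iff at least two registers
** are free. E.g. afree = 0x14 -> 0x14 & 0x13 = 0x10 (two or more free),
** but afree = 0x10 -> 0x10 & 0x0f = 0 (only one free, so spill instead).
*/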
3618 /* Fixup the loop branch. */
3619 static void asm_loop_fixup(ASMState *as)
3621 MCode *p = as->mctop;
3622 MCode *target = as->mcp;
3623 if (as->realign) { /* Realigned loops use short jumps. */
3624 as->realign = NULL; /* Stop another retry. */
3625 lua_assert(((intptr_t)target & 15) == 0);
3626 if (as->loopinv) { /* Inverted loop branch? */
3627 p -= 5;
3628 p[0] = XI_JMP;
3629 lua_assert(target - p >= -128);
3630 p[-1] = (MCode)(target - p); /* Patch sjcc. */
3631 if (as->loopinv == 2)
3632 p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */
3633 } else {
3634 lua_assert(target - p >= -128);
3635 p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */
3636 p[-2] = XI_JMPs;
3638 } else {
3639 MCode *newloop;
3640 p[-5] = XI_JMP;
3641 if (as->loopinv) { /* Inverted loop branch? */
3642 /* asm_guardcc already inverted the jcc and patched the jmp. */
3643 p -= 5;
3644 newloop = target+4;
3645 *(int32_t *)(p-4) = (int32_t)(target - p); /* Patch jcc. */
3646 if (as->loopinv == 2) {
3647 *(int32_t *)(p-10) = (int32_t)(target - p + 6); /* Patch opt. jp. */
3648 newloop = target+8;
3650 } else { /* Otherwise just patch jmp. */
3651 *(int32_t *)(p-4) = (int32_t)(target - p);
3652 newloop = target+3;
3654 /* Realign small loops and shorten the loop branch. */
3655 if (newloop >= p - 128) {
3656 as->realign = newloop; /* Force a retry and remember alignment. */
3657 as->curins = as->stopins; /* Abort asm_trace now. */
3658 as->T->nins = as->orignins; /* Remove any added renames. */
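/* Summary of the fallback path above: the backwards loop branch is emitted
** as a 5-byte near jmp (a 6-byte jcc when inverted). If the loop start then
** turns out to be within rel8 range, as->realign is set and assembly is
** retried, so a 2-byte short jmp and a 16-byte aligned loop entry can be
** used instead.
*/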
3663 /* Middle part of a loop. */
3664 static void asm_loop(ASMState *as)
3666 /* LOOP is a guard, so the snapno is up to date. */
3667 as->loopsnapno = as->snapno;
3668 if (as->gcsteps)
3669 asm_gc_check(as);
3670 /* LOOP marks the transition from the variant to the invariant part. */
3671 as->testmcp = as->invmcp = NULL;
3672 as->sectref = 0;
3673 if (!neverfuse(as)) as->fuseref = 0;
3674 asm_phi_shuffle(as);
3675 asm_loop_fixup(as);
3676 as->mcloop = as->mcp;
3677 RA_DBGX((as, "===== LOOP ====="));
3678 if (!as->realign) RA_DBG_FLUSH();
3681 /* -- Head of trace ------------------------------------------------------- */
3683 /* Calculate stack adjustment. */
3684 static int32_t asm_stack_adjust(ASMState *as)
3686 if (as->evenspill <= SPS_FIXED)
3687 return 0;
3688 return sps_scale((as->evenspill - SPS_FIXED + 3) & ~3);
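/* Example: with four spill slots needed beyond SPS_FIXED this returns
** sps_scale((4+3) & ~3) = sps_scale(4); with five slots sps_scale(8).
** The (n+3) & ~3 rounds up to a multiple of four 4-byte spill slots, which
** keeps the stack adjustment a multiple of 16 bytes.
*/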
3691 /* Coalesce BASE register for a root trace. */
3692 static void asm_head_root_base(ASMState *as)
3694 IRIns *ir = IR(REF_BASE);
3695 Reg r = ir->r;
3696 if (ra_hasreg(r)) {
3697 ra_free(as, r);
3698 if (rset_test(as->modset, r))
3699 ir->r = RID_INIT; /* No inheritance for modified BASE register. */
3700 if (r != RID_BASE)
3701 emit_rr(as, XO_MOV, r, RID_BASE);
3705 /* Head of a root trace. */
3706 static void asm_head_root(ASMState *as)
3708 int32_t spadj;
3709 asm_head_root_base(as);
3710 emit_setgli(as, vmstate, (int32_t)as->T->traceno);
3711 spadj = asm_stack_adjust(as);
3712 as->T->spadjust = (uint16_t)spadj;
3713 emit_addptr(as, RID_ESP|REX_64, -spadj);
3714 /* Root traces assume a checked stack for the starting proto. */
3715 as->T->topslot = gcref(as->T->startpt)->pt.framesize;
3718 /* Coalesce or reload BASE register for a side trace. */
3719 static RegSet asm_head_side_base(ASMState *as, Reg pbase, RegSet allow)
3721 IRIns *ir = IR(REF_BASE);
3722 Reg r = ir->r;
3723 if (ra_hasreg(r)) {
3724 ra_free(as, r);
3725 if (rset_test(as->modset, r))
3726 ir->r = RID_INIT; /* No inheritance for modified BASE register. */
3727 if (pbase == r) {
3728 rset_clear(allow, r); /* Mark same BASE register as coalesced. */
3729 } else if (ra_hasreg(pbase) && rset_test(as->freeset, pbase)) {
3730 rset_clear(allow, pbase);
3731 emit_rr(as, XO_MOV, r, pbase); /* Move from coalesced parent register. */
3732 } else {
3733 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */
3736 return allow;
3739 /* Head of a side trace.
3741 ** The current simplistic algorithm requires that all slots inherited
3742 ** from the parent are live in a register between pass 2 and pass 3. This
3743 ** avoids the complexity of stack slot shuffling. But of course this may
3744 ** overflow the register set in some cases and cause the dreaded error:
3745 ** "NYI: register coalescing too complex". A refined algorithm is needed.
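** Rough structure of the code below: pass 1 scans the parent SLOADs,
** coalescing matching registers and collecting dependencies; pass 2 handles
** targets that so far only got a spill slot; pass 3 reloads values kept in
** parent spill slots; a final loop shuffles the remaining live registers
** and breaks cycles by renaming one target to a temp. register.
*/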
3747 static void asm_head_side(ASMState *as)
3749 IRRef1 sloadins[RID_MAX];
3750 RegSet allow = RSET_ALL; /* Inverse of all coalesced registers. */
3751 RegSet live = RSET_EMPTY; /* Live parent registers. */
3752 Reg pbase = as->parent->ir[REF_BASE].r; /* Parent base register (if any). */
3753 int32_t spadj, spdelta;
3754 int pass2 = 0;
3755 int pass3 = 0;
3756 IRRef i;
3758 allow = asm_head_side_base(as, pbase, allow);
3760 /* Scan all parent SLOADs and collect register dependencies. */
3761 for (i = as->stopins; i > REF_BASE; i--) {
3762 IRIns *ir = IR(i);
3763 RegSP rs;
3764 lua_assert(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT));
3765 rs = as->parentmap[ir->op1];
3766 if (ra_hasreg(ir->r)) {
3767 rset_clear(allow, ir->r);
3768 if (ra_hasspill(ir->s))
3769 ra_save(as, ir, ir->r);
3770 } else if (ra_hasspill(ir->s)) {
3771 irt_setmark(ir->t);
3772 pass2 = 1;
3774 if (ir->r == rs) { /* Coalesce matching registers right now. */
3775 ra_free(as, ir->r);
3776 } else if (ra_hasspill(regsp_spill(rs))) {
3777 if (ra_hasreg(ir->r))
3778 pass3 = 1;
3779 } else if (ra_used(ir)) {
3780 sloadins[rs] = (IRRef1)i;
3781 rset_set(live, rs); /* Block live parent register. */
3785 /* Calculate stack frame adjustment. */
3786 spadj = asm_stack_adjust(as);
3787 spdelta = spadj - (int32_t)as->parent->spadjust;
3788 if (spdelta < 0) { /* Don't shrink the stack frame. */
3789 spadj = (int32_t)as->parent->spadjust;
3790 spdelta = 0;
3792 as->T->spadjust = (uint16_t)spadj;
3794 /* Reload spilled target registers. */
3795 if (pass2) {
3796 for (i = as->stopins; i > REF_BASE; i--) {
3797 IRIns *ir = IR(i);
3798 if (irt_ismarked(ir->t)) {
3799 RegSet mask;
3800 Reg r;
3801 RegSP rs;
3802 irt_clearmark(ir->t);
3803 rs = as->parentmap[ir->op1];
3804 if (!ra_hasspill(regsp_spill(rs)))
3805 ra_sethint(ir->r, rs); /* Hint may be gone, set it again. */
3806 else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s))
3807 continue; /* Same spill slot, do nothing. */
3808 mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow;
3809 if (mask == RSET_EMPTY)
3810 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
3811 r = ra_allocref(as, i, mask);
3812 ra_save(as, ir, r);
3813 rset_clear(allow, r);
3814 if (r == rs) { /* Coalesce matching registers right now. */
3815 ra_free(as, r);
3816 rset_clear(live, r);
3817 } else if (ra_hasspill(regsp_spill(rs))) {
3818 pass3 = 1;
3820 checkmclim(as);
3825 /* Store trace number and adjust stack frame relative to the parent. */
3826 emit_setgli(as, vmstate, (int32_t)as->T->traceno);
3827 emit_addptr(as, RID_ESP|REX_64, -spdelta);
3829 /* Restore target registers from parent spill slots. */
3830 if (pass3) {
3831 RegSet work = ~as->freeset & RSET_ALL;
3832 while (work) {
3833 Reg r = rset_pickbot(work);
3834 IRIns *ir = IR(regcost_ref(as->cost[r]));
3835 RegSP rs = as->parentmap[ir->op1];
3836 rset_clear(work, r);
3837 if (ra_hasspill(regsp_spill(rs))) {
3838 int32_t ofs = sps_scale(regsp_spill(rs));
3839 ra_free(as, r);
3840 if (r < RID_MAX_GPR)
3841 emit_rmro(as, XO_MOV, REX_64IR(ir, r), RID_ESP, ofs);
3842 else
3843 emit_rmro(as, XMM_MOVRM(as), r, RID_ESP, ofs);
3844 checkmclim(as);
3849 /* Shuffle registers to match up target regs with parent regs. */
3850 for (;;) {
3851 RegSet work;
3853 /* Repeatedly coalesce free live registers by moving to their target. */
3854 while ((work = as->freeset & live) != RSET_EMPTY) {
3855 Reg rp = rset_pickbot(work);
3856 IRIns *ir = IR(sloadins[rp]);
3857 rset_clear(live, rp);
3858 rset_clear(allow, rp);
3859 ra_free(as, ir->r);
3860 ra_movrr(as, ir, ir->r, rp);
3861 checkmclim(as);
3864 /* We're done if no live registers remain. */
3865 if (live == RSET_EMPTY)
3866 break;
3868 /* Break cycles by renaming one target to a temp. register. */
3869 if (live & RSET_GPR) {
3870 RegSet tmpset = as->freeset & ~live & allow & RSET_GPR;
3871 if (tmpset == RSET_EMPTY)
3872 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
3873 ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset));
3875 if (live & RSET_FPR) {
3876 RegSet tmpset = as->freeset & ~live & allow & RSET_FPR;
3877 if (tmpset == RSET_EMPTY)
3878 lj_trace_err(as->J, LJ_TRERR_NYICOAL);
3879 ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset));
3881 checkmclim(as);
3882 /* Continue with coalescing to fix up the broken cycle(s). */
3885 /* Inherit top stack slot already checked by parent trace. */
3886 as->T->topslot = as->parent->topslot;
3887 if (as->topslot > as->T->topslot) { /* Need to check for higher slot? */
3888 as->T->topslot = (uint8_t)as->topslot; /* Remember for child traces. */
3889 /* Reuse the parent exit in the context of the parent trace. */
3890 asm_stack_check(as, as->topslot, pbase, allow & RSET_GPR, as->J->exitno);
3894 /* -- Tail of trace ------------------------------------------------------- */
3896 /* Link to another trace. */
3897 static void asm_tail_link(ASMState *as)
3899 SnapNo snapno = as->T->nsnap-1; /* Last snapshot. */
3900 SnapShot *snap = &as->T->snap[snapno];
3901 BCReg baseslot = asm_stack_extent(as, snap, &as->topslot);
3903 checkmclim(as);
3904 ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));
3906 if (as->T->link == TRACE_INTERP) {
3907 /* Setup fixed registers for exit to interpreter. */
3908 const BCIns *pc = snap_pc(as->T->snapmap[snap->mapofs + snap->nent]);
3909 int32_t mres;
3910 if (bc_op(*pc) == BC_JLOOP) { /* NYI: find a better way to do this. */
3911 BCIns *retpc = &traceref(as->J, bc_d(*pc))->startins;
3912 if (bc_isret(bc_op(*retpc)))
3913 pc = retpc;
3915 emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
3916 emit_loada(as, RID_PC, pc);
3917 mres = (int32_t)(snap->nslots - baseslot);
3918 switch (bc_op(*pc)) {
3919 case BC_CALLM: case BC_CALLMT:
3920 mres -= (int32_t)(1 + bc_a(*pc) + bc_c(*pc)); break;
3921 case BC_RETM: mres -= (int32_t)(bc_a(*pc) + bc_d(*pc)); break;
3922 case BC_TSETM: mres -= (int32_t)bc_a(*pc); break;
3923 default: if (bc_op(*pc) < BC_FUNCF) mres = 0; break;
3925 emit_loadi(as, RID_RET, mres); /* Return MULTRES or 0. */
3926 } else if (baseslot) {
3927 /* Save modified BASE for linking to trace with higher start frame. */
3928 emit_setgl(as, RID_BASE, jit_base);
3930 emit_addptr(as, RID_BASE, 8*(int32_t)baseslot);
3932 /* Sync the interpreter state with the on-trace state. */
3933 asm_stack_restore(as, snap);
3935 /* Root traces that grow the stack need to check the stack at the end. */
3936 if (!as->parent && as->topslot)
3937 asm_stack_check(as, as->topslot, RID_BASE, as->freeset & RSET_GPR, snapno);
3940 /* Fixup the tail code. */
3941 static void asm_tail_fixup(ASMState *as, TraceNo lnk)
3943 /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
3944 MCode *p = as->mctop;
3945 MCode *target, *q;
3946 int32_t spadj = as->T->spadjust;
3947 if (spadj == 0) {
3948 p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0);
3949 } else {
3950 MCode *p1;
3951 /* Patch stack adjustment. */
3952 if (checki8(spadj)) {
3953 p -= 3;
3954 p1 = p-6;
3955 *p1 = (MCode)spadj;
3956 } else {
3957 p1 = p-9;
3958 *(int32_t *)p1 = spadj;
3960 if ((as->flags & JIT_F_LEA_AGU)) {
3961 #if LJ_64
3962 p1[-4] = 0x48;
3963 #endif
3964 p1[-3] = (MCode)XI_LEA;
3965 p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
3966 p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
3967 } else {
3968 #if LJ_64
3969 p1[-3] = 0x48;
3970 #endif
3971 p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
3972 p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
3975 /* Patch exit branch. */
3976 target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp :
3977 traceref(as->J, lnk)->mcode;
3978 *(int32_t *)(p-4) = jmprel(p, target);
3979 p[-5] = XI_JMP;
3980 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
3981 for (q = as->mctop-1; q >= p; q--)
3982 *q = XI_NOP;
3983 as->mctop = p;
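/* Byte layout of the patched tail (sketch; the 48h REX.W prefix only on x64):
**
**   [48] 83 c4 ib   /  [48] 81 c4 id         add esp, spadj
**   [48] 8d 64 24 ib / [48] 8d a4 24 id      lea esp, [esp+spadj] (JIT_F_LEA_AGU)
**   e9 rel32                                 jmp exit_interp or linked trace
**
** Only one of the two ESP forms is present; p1 points at its immediate.
** With spadj == 0 the adjustment is dropped and only the 5-byte jmp remains.
*/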
3986 /* -- Instruction dispatch ------------------------------------------------ */
3988 /* Assemble a single instruction. */
3989 static void asm_ir(ASMState *as, IRIns *ir)
3991 switch ((IROp)ir->o) {
3992 /* Miscellaneous ops. */
3993 case IR_LOOP: asm_loop(as); break;
3994 case IR_NOP: case IR_XBAR: lua_assert(!ra_used(ir)); break;
3995 case IR_PHI: asm_phi(as, ir); break;
3996 case IR_HIOP: asm_hiop(as, ir); break;
3998 /* Guarded assertions. */
3999 case IR_LT: case IR_GE: case IR_LE: case IR_GT:
4000 case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
4001 case IR_EQ: case IR_NE: case IR_ABC:
4002 asm_comp(as, ir, asm_compmap[ir->o]);
4003 break;
4005 case IR_RETF: asm_retf(as, ir); break;
4007 /* Bit ops. */
4008 case IR_BNOT: asm_neg_not(as, ir, XOg_NOT); break;
4009 case IR_BSWAP: asm_bitswap(as, ir); break;
4011 case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
4012 case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
4013 case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;
4015 case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
4016 case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
4017 case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
4018 case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
4019 case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;
4021 /* Arithmetic ops. */
4022 case IR_ADD: asm_add(as, ir); break;
4023 case IR_SUB:
4024 if (irt_isnum(ir->t))
4025 asm_fparith(as, ir, XO_SUBSD);
4026 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
4027 asm_intarith(as, ir, XOg_SUB);
4028 break;
4029 case IR_MUL:
4030 if (irt_isnum(ir->t))
4031 asm_fparith(as, ir, XO_MULSD);
4032 else
4033 asm_intarith(as, ir, XOg_X_IMUL);
4034 break;
4035 case IR_DIV:
4036 #if LJ_64 && LJ_HASFFI
4037 if (!irt_isnum(ir->t))
4038 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
4039 IRCALL_lj_carith_divu64);
4040 else
4041 #endif
4042 asm_fparith(as, ir, XO_DIVSD);
4043 break;
4044 case IR_MOD:
4045 #if LJ_64 && LJ_HASFFI
4046 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
4047 IRCALL_lj_carith_modu64);
4048 #else
4049 lua_assert(0);
4050 #endif
4051 break;
4053 case IR_NEG:
4054 if (irt_isnum(ir->t))
4055 asm_fparith(as, ir, XO_XORPS);
4056 else
4057 asm_neg_not(as, ir, XOg_NEG);
4058 break;
4059 case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;
4061 case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
4062 case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
4064 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
4065 asm_fpmath(as, ir);
4066 break;
4067 case IR_POW:
4068 #if LJ_64 && LJ_HASFFI
4069 if (!irt_isnum(ir->t))
4070 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
4071 IRCALL_lj_carith_powu64);
4072 else
4073 #endif
4074 asm_fppowi(as, ir);
4075 break;
4077 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
4078 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
4079 case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
4081 /* Memory references. */
4082 case IR_AREF: asm_aref(as, ir); break;
4083 case IR_HREF: asm_href(as, ir); break;
4084 case IR_HREFK: asm_hrefk(as, ir); break;
4085 case IR_NEWREF: asm_newref(as, ir); break;
4086 case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
4087 case IR_FREF: asm_fref(as, ir); break;
4088 case IR_STRREF: asm_strref(as, ir); break;
4090 /* Loads and stores. */
4091 case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
4092 asm_ahuvload(as, ir);
4093 break;
4094 case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
4095 case IR_SLOAD: asm_sload(as, ir); break;
4097 case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
4098 case IR_FSTORE: case IR_XSTORE: asm_fxstore(as, ir); break;
4100 /* Allocations. */
4101 case IR_SNEW: asm_snew(as, ir); break;
4102 case IR_TNEW: asm_tnew(as, ir); break;
4103 case IR_TDUP: asm_tdup(as, ir); break;
4104 case IR_CNEW: case IR_CNEWI: asm_cnew(as, ir); break;
4106 /* Write barriers. */
4107 case IR_TBAR: asm_tbar(as, ir); break;
4108 case IR_OBAR: asm_obar(as, ir); break;
4110 /* Type conversions. */
4111 case IR_TOBIT: asm_tobit(as, ir); break;
4112 case IR_CONV: asm_conv(as, ir); break;
4113 case IR_TOSTR: asm_tostr(as, ir); break;
4114 case IR_STRTO: asm_strto(as, ir); break;
4116 /* Calls. */
4117 case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
4118 case IR_CALLXS: asm_callx(as, ir); break;
4119 case IR_CARG: break;
4121 default:
4122 setintV(&as->J->errinfo, ir->o);
4123 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
4124 break;
4128 /* Assemble a trace in linear backwards order. */
4129 static void asm_trace(ASMState *as)
4131 for (as->curins--; as->curins > as->stopins; as->curins--) {
4132 IRIns *ir = IR(as->curins);
4133 lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */
4134 if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
4135 continue; /* Dead-code elimination can be soooo easy. */
4136 if (irt_isguard(ir->t))
4137 asm_snap_prep(as);
4138 RA_DBG_REF();
4139 checkmclim(as);
4140 asm_ir(as, ir);
4144 /* -- Trace setup --------------------------------------------------------- */
4146 /* Ensure there are enough stack slots for call arguments. */
4147 static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
4149 IRRef args[CCI_NARGS_MAX];
4150 uint32_t nargs = (int)CCI_NARGS(ci);
4151 int nslots = 0;
4152 asm_collectargs(as, ir, ci, args);
4153 #if LJ_64
4154 if (LJ_ABI_WIN) {
4155 nslots = (int)(nargs*2); /* Only matters for more than four args. */
4156 } else {
4157 uint32_t i;
4158 int ngpr = 6, nfpr = 8;
4159 for (i = 0; i < nargs; i++)
4160 if (irt_isfp(IR(args[i])->t)) {
4161 if (nfpr > 0) nfpr--; else nslots += 2;
4162 } else {
4163 if (ngpr > 0) ngpr--; else nslots += 2;
4166 if (nslots > as->evenspill) /* Leave room for args in stack slots. */
4167 as->evenspill = nslots;
4168 return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
4169 #else
4170 if ((ci->flags & CCI_FASTCALL)) {
4171 lua_assert(nargs <= 2);
4172 } else {
4173 uint32_t i;
4174 for (i = 0; i < nargs; i++)
4175 nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
4176 if (nslots > as->evenspill) /* Leave room for args. */
4177 as->evenspill = nslots;
4179 return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
4180 #endif
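/* Example for the x64 SysV path above: a call with 8 integer arguments
** passes 6 in GPRs and 2 on the stack, so nslots = 4 (two 4-byte spill
** slots per 8-byte stack argument) and as->evenspill grows to at least 4.
** On x64/Windows nargs*2 slots are reserved, which only matters for calls
** with more than four arguments.
*/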
4183 /* Clear reg/sp for all instructions and add register hints. */
4184 static void asm_setup_regsp(ASMState *as, GCtrace *T)
4186 IRRef i, nins;
4187 int inloop;
4189 ra_setup(as);
4191 /* Clear reg/sp for constants. */
4192 for (i = T->nk; i < REF_BIAS; i++)
4193 IR(i)->prev = REGSP_INIT;
4195 /* REF_BASE is used for implicit references to the BASE register. */
4196 IR(REF_BASE)->prev = REGSP_HINT(RID_BASE);
4198 nins = T->nins;
4199 if (IR(nins-1)->o == IR_RENAME) {
4200 do { nins--; } while (IR(nins-1)->o == IR_RENAME);
4201 T->nins = nins; /* Remove any renames left over from ASM restart. */
4203 as->snaprename = nins;
4204 as->snapref = nins;
4205 as->snapno = T->nsnap;
4207 as->stopins = REF_BASE;
4208 as->orignins = nins;
4209 as->curins = nins;
4211 inloop = 0;
4212 as->evenspill = SPS_FIRST;
4213 for (i = REF_FIRST; i < nins; i++) {
4214 IRIns *ir = IR(i);
4215 switch (ir->o) {
4216 case IR_LOOP:
4217 inloop = 1;
4218 break;
4219 /* Set hints for slot loads from a parent trace. */
4220 case IR_SLOAD:
4221 if ((ir->op2 & IRSLOAD_PARENT)) {
4222 RegSP rs = as->parentmap[ir->op1];
4223 lua_assert(regsp_used(rs));
4224 as->stopins = i;
4225 if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) {
4226 ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs));
4227 continue;
4230 break;
4231 case IR_CALLXS: {
4232 CCallInfo ci;
4233 ci.flags = asm_callx_flags(as, ir);
4234 ir->prev = asm_setup_call_slots(as, ir, &ci);
4235 if (inloop)
4236 as->modset |= RSET_SCRATCH;
4237 continue;
4239 case IR_CALLN: case IR_CALLL: case IR_CALLS: {
4240 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
4241 ir->prev = asm_setup_call_slots(as, ir, ci);
4242 if (inloop)
4243 as->modset |= (ci->flags & CCI_NOFPRCLOBBER) ?
4244 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
4245 continue;
4247 #if LJ_32 && LJ_HASFFI
4248 case IR_HIOP:
4249 if ((ir-1)->o == IR_CALLN) {
4250 ir->prev = REGSP_HINT(RID_RETHI);
4251 continue;
4253 break;
4254 #endif
4255 /* C calls evict all scratch regs and return results in RID_RET. */
4256 case IR_SNEW: case IR_NEWREF:
4257 #if !LJ_64
4258 if (as->evenspill < 3) /* lj_str_new and lj_tab_newkey need 3 args. */
4259 as->evenspill = 3;
4260 #endif
4261 case IR_TNEW: case IR_TDUP: case IR_CNEW: case IR_CNEWI: case IR_TOSTR:
4262 ir->prev = REGSP_HINT(RID_RET);
4263 if (inloop)
4264 as->modset = RSET_SCRATCH;
4265 continue;
4266 case IR_STRTO: case IR_OBAR:
4267 if (inloop)
4268 as->modset = RSET_SCRATCH;
4269 break;
4270 case IR_POW:
4271 if (irt_isnum(ir->t)) {
4272 ir->prev = REGSP_HINT(RID_XMM0);
4273 if (inloop)
4274 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
4275 continue;
4277 /* fallthrough */
4278 case IR_DIV: case IR_MOD:
4279 #if LJ_64 && LJ_HASFFI
4280 if (!irt_isnum(ir->t)) {
4281 ir->prev = REGSP_HINT(RID_RET);
4282 if (inloop)
4283 as->modset |= (RSET_SCRATCH & RSET_GPR);
4284 continue;
4286 #endif
4287 break;
4288 case IR_FPMATH:
4289 if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */
4290 ir->prev = REGSP_HINT(RID_XMM0);
4291 #if !LJ_64
4292 if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */
4293 as->evenspill = 4;
4294 #endif
4295 if (inloop)
4296 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
4297 continue;
4298 } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
4299 ir->prev = REGSP_HINT(RID_XMM0);
4300 if (inloop)
4301 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
4302 continue;
4304 break;
4305 /* Non-constant shift counts need to be in RID_ECX. */
4306 case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
4307 if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) {
4308 IR(ir->op2)->r = REGSP_HINT(RID_ECX);
4309 if (inloop)
4310 rset_set(as->modset, RID_ECX);
4312 break;
4313 /* Do not propagate hints across type conversions. */
4314 case IR_CONV: case IR_TOBIT:
4315 break;
4316 default:
4317 /* Propagate hints across likely 'op reg, imm' or 'op reg'. */
4318 if (irref_isk(ir->op2) && !irref_isk(ir->op1)) {
4319 ir->prev = IR(ir->op1)->prev;
4320 continue;
4322 break;
4324 ir->prev = REGSP_INIT;
4326 if ((as->evenspill & 1))
4327 as->oddspill = as->evenspill++;
4328 else
4329 as->oddspill = 0;
4332 /* -- Assembler core ------------------------------------------------------ */
4334 /* Define this if you want to run LuaJIT with Valgrind. */
4335 #ifdef LUAJIT_USE_VALGRIND
4336 #include <valgrind/valgrind.h>
4337 #define VG_INVALIDATE(p, sz) VALGRIND_DISCARD_TRANSLATIONS(p, sz)
4338 #else
4339 #define VG_INVALIDATE(p, sz) ((void)0)
4340 #endif
4342 /* Assemble a trace. */
4343 void lj_asm_trace(jit_State *J, GCtrace *T)
4345 ASMState as_;
4346 ASMState *as = &as_;
4348 /* Setup initial state. Copy some fields to reduce indirections. */
4349 as->J = J;
4350 as->T = T;
4351 as->ir = T->ir;
4352 as->flags = J->flags;
4353 as->loopref = J->loopref;
4354 as->realign = NULL;
4355 as->loopinv = 0;
4356 if (J->parent) {
4357 as->parent = traceref(J, J->parent);
4358 lj_snap_regspmap(as->parentmap, as->parent, J->exitno);
4359 } else {
4360 as->parent = NULL;
4362 as->mctop = lj_mcode_reserve(J, &as->mcbot); /* Reserve MCode memory. */
4363 as->mcp = as->mctop;
4364 as->mclim = as->mcbot + MCLIM_REDZONE;
4365 asm_exitstub_setup(as, T->nsnap);
4367 do {
4368 as->mcp = as->mctop;
4369 as->curins = T->nins;
4370 RA_DBG_START();
4371 RA_DBGX((as, "===== STOP ====="));
4372 /* Realign and leave room for backwards loop branch or exit branch. */
4373 if (as->realign) {
4374 int i = ((int)(intptr_t)as->realign) & 15;
4375 MCode *p = as->mctop;
4376 /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
4377 while (i-- > 0)
4378 *--p = XI_NOP;
4379 as->mctop = p;
4380 as->mcp = p - (as->loopinv ? 5 : 2); /* Space for short/near jmp. */
4381 } else {
4382 as->mcp = as->mctop - 5; /* Space for exit branch (near jmp). */
4384 as->invmcp = as->mcp;
4385 as->mcloop = NULL;
4386 as->testmcp = NULL;
4387 as->topslot = 0;
4388 as->gcsteps = 0;
4389 as->sectref = as->loopref;
4390 as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;
4392 /* Setup register allocation. */
4393 asm_setup_regsp(as, T);
4395 if (!as->loopref) {
4396 /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
4397 as->mcp -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0);
4398 as->invmcp = NULL;
4399 asm_tail_link(as);
4401 asm_trace(as);
4402 } while (as->realign); /* Retry in case the MCode needs to be realigned. */
4404 RA_DBG_REF();
4405 checkmclim(as);
4406 if (as->gcsteps) {
4407 as->curins = as->T->snap[0].ref;
4408 asm_snap_prep(as); /* The GC check is a guard. */
4409 asm_gc_check(as);
4411 ra_evictk(as);
4412 if (as->parent)
4413 asm_head_side(as);
4414 else
4415 asm_head_root(as);
4416 asm_phi_fixup(as);
4418 RA_DBGX((as, "===== START ===="));
4419 RA_DBG_FLUSH();
4420 if (as->freeset != RSET_ALL)
4421 lj_trace_err(as->J, LJ_TRERR_BADRA); /* Ouch! Should never happen. */
4423 /* Set trace entry point before fixing up tail to allow link to self. */
4424 T->mcode = as->mcp;
4425 T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0;
4426 if (!as->loopref)
4427 asm_tail_fixup(as, T->link); /* Note: this may change as->mctop! */
4428 T->szmcode = (MSize)(as->mctop - as->mcp);
4429 VG_INVALIDATE(T->mcode, T->szmcode);
4432 /* Patch exit jumps of existing machine code to a new target. */
4433 void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
4435 MCode *p = T->mcode;
4436 MCode *mcarea = lj_mcode_patch(J, p, 0);
4437 MSize len = T->szmcode;
4438 MCode *px = exitstub_addr(J, exitno) - 6;
4439 MCode *pe = p+len-6;
4440 uint32_t stateaddr = u32ptr(&J2G(J)->vmstate);
4441 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
4442 *(int32_t *)(p+len-4) = jmprel(p+len, target);
4443 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
4444 for (; p < pe; p++)
4445 if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) {
4446 p += LJ_64 ? 11 : 10;
4447 break;
4449 lua_assert(p < pe);
4450 for (; p < pe; p++) {
4451 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) {
4452 *(int32_t *)(p+2) = jmprel(p+6, target);
4453 p += 5;
4456 lj_mcode_patch(J, mcarea, 1);
4457 VG_INVALIDATE(T->mcode, T->szmcode);
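/* The 0xf0ff mask above matches any two-byte jcc rel32 opcode (0f 80..0f 8f)
** read as a little-endian uint16_t: e.g. je rel32 = 0f 84 reads as 0x840f
** and 0x840f & 0xf0ff == 0x800f. A match is only patched if its current
** target is the exit stub for exitno; the new rel32 is relative to the end
** of the 6-byte instruction (p+6).
*/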
4460 #undef IR
4462 #endif