/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"
/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** On first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
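
/* Illustrative sketch (not produced by this file, shapes only): for a
** simple Lua loop like 'local s = 0; for i = 1, n do s = s + t[i] end',
** the optimized trace IR is roughly laid out as:
**
**   ... invariant + variant instructions ...   <- pre-roll (one iteration)
**   LOOP                                       <- separator
**   ... variant instructions only ...          <- loop body (re-emitted copy)
**   PHI s s'                                   <- loop-carried dependencies,
**   PHI i i'                                      emitted below the body
**
** where s'/i' denote the values recomputed inside the loop body. Invariant
** instructions in the body (e.g. the guards and loads needed to resolve
** 't') CSE against the pre-roll and are thus effectively hoisted.
*/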
/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)  (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
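/* Descriptive summary of the passes below (see the Pass #N comments):
**   #1  PHIs whose left and right ref coincide are invariant and dropped;
**       the rest are kept, and those that fail the quick recurrence check
**       are marked as potentially redundant.
**   #2  the variant part and the copied snapshots are traversed and the
**       mark is cleared for every ref actually used there.
**   #3  PHIs are added for values still live in stack slots that have no
**       corresponding SLOAD.
**   #4  non-redundancy is propagated along chains of PHIs until a fixpoint.
**   #5  the surviving (unmarked) PHIs are emitted; the rest are eliminated.
*/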
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
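/* Note: both the old snapshot entries and the loop snapshot entries are
** ordered by slot number. The loop below merges the two lists by slot,
** with the loop snapshot providing fallback substitutions for slots the
** old snapshot does not mention.
*/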
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
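/* Rough order of events below: allocate the substitution table, emit the
** LOOP separator, grow the snapshot buffers, copy-substitute every recorded
** instruction (and snapshot) through the FOLD/CSE pipeline while collecting
** PHI candidates, then hand those candidates to loop_emit_phi().
*/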
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && irr->o == IR_CONV &&
                   ref > invar && irr->op1 < invar) {
          /* May need an extra PHI for a CONV. */
          ref = irr->op1;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}
/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}
/* Loop optimization. */
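/* Returns 0 if loop optimization succeeded. Returns 1 if it failed with a
** recoverable trace error (type instability or a guard that would always
** fail), in which case the partial changes are undone and recording should
** continue by unrolling the loop instead. All other errors are rethrown.
*/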
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}
#undef IR
#undef emitir
#undef emitir_raw

#endif