2 ** LOOP: Loop Optimizations.
3 ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
24 ** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
25 ** of a loop into invariant and variant instructions. The invariant
26 ** instructions are hoisted out of the loop and only the variant
27 ** instructions remain inside the loop body.
29 ** Unfortunately LICM is mostly useless for compiling dynamic languages.
30 ** The IR has many guards and most of the subsequent instructions are
31 ** control-dependent on them. The first non-hoistable guard would
32 ** effectively prevent hoisting of all subsequent instructions.
34 ** That's why we use a special form of unrolling using copy-substitution,
35 ** combined with redundancy elimination:
37 ** The recorded instruction stream is re-emitted to the compiler pipeline
38 ** with substituted operands. The substitution table is filled with the
39 ** refs returned by re-emitting each instruction. This can be done
40 ** on-the-fly, because the IR is in strict SSA form, where every ref is
41 ** defined before its use.
43 ** This approach generates two code sections, separated by the LOOP
46 ** 1. The recorded instructions form a kind of pre-roll for the loop. It
47 ** contains a mix of invariant and variant instructions and performs
48 ** exactly one loop iteration (but not necessarily the 1st iteration).
50 ** 2. The loop body contains only the variant instructions and performs
51 ** all remaining loop iterations.
53 ** On first sight that looks like a waste of space, because the variant
54 ** instructions are present twice. But the key insight is that the
55 ** pre-roll honors the control-dependencies for *both* the pre-roll itself
56 ** *and* the loop body!
58 ** It also means one doesn't have to explicitly model control-dependencies
59 ** (which, BTW, wouldn't help LICM much). And it's much easier to
60 ** integrate sparse snapshotting with this approach.
62 ** One of the nicest aspects of this approach is that all of the
63 ** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
64 ** reused with only minor restrictions (e.g. one should not fold
65 ** instructions across loop-carried dependencies).
67 ** But in general all optimizations can be applied which only need to look
68 ** backwards into the generated instruction stream. At any point in time
69 ** during the copy-substitution process this contains both a static loop
70 ** iteration (the pre-roll) and a dynamic one (from the to-be-copied
71 ** instruction up to the end of the partial loop body).
73 ** Since control-dependencies are implicitly kept, CSE also applies to all
74 ** kinds of guards. The major advantage is that all invariant guards can
75 ** be hoisted, too.
77 ** Load/store forwarding works across loop iterations, too. This is
78 ** important if loop-carried dependencies are kept in upvalues or tables.
79 ** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
80 ** become a forwarded loop-recurrence after inlining.
82 ** Since the IR is in SSA form, loop-carried dependencies have to be
83 ** modeled with PHI instructions. The potential candidates for PHIs are
84 ** collected on-the-fly during copy-substitution. After eliminating the
85 ** redundant ones, PHI instructions are emitted *below* the loop body.
87 ** Note that this departure from traditional SSA form doesn't change the
88 ** semantics of the PHI instructions themselves. But it greatly simplifies
89 ** on-the-fly generation of the IR and the machine code.
92 /* Some local macros to save typing. Undef'd at the end. */
93 #define IR(ref) (&J->cur.ir[(ref)])
95 /* Pass IR on to next optimization in chain (FOLD). */
96 #define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
98 /* Emit raw IR without passing through optimizations. */
99 #define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
101 /* -- PHI elimination ----------------------------------------------------- */
103 /* Emit or eliminate collected PHIs. */
104 static void loop_emit_phi(jit_State
*J
, IRRef1
*subst
, IRRef1
*phi
, IRRef nphi
,
109 IRRef invar
= J
->chain
[IR_LOOP
];
110 /* Pass #1: mark redundant and potentially redundant PHIs. */
111 for (i
= 0, j
= 0; i
< nphi
; i
++) {
113 IRRef rref
= subst
[lref
];
114 if (lref
== rref
|| rref
== REF_DROP
) { /* Invariants are redundant. */
115 irt_clearphi(IR(lref
)->t
);
117 phi
[j
++] = (IRRef1
)lref
;
118 if (!(IR(rref
)->op1
== lref
|| IR(rref
)->op2
== lref
)) {
119 /* Quick check for simple recurrences failed, need pass2. */
120 irt_setmark(IR(lref
)->t
);
126 /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
129 for (i
= J
->cur
.nins
-1; i
> invar
; i
--) {
131 if (!irref_isk(ir
->op2
)) irt_clearmark(IR(ir
->op2
)->t
);
132 if (!irref_isk(ir
->op1
)) {
133 irt_clearmark(IR(ir
->op1
)->t
);
134 if (ir
->op1
< invar
&&
135 ir
->o
>= IR_CALLN
&& ir
->o
<= IR_CARG
) { /* ORDER IR */
137 while (ir
->o
== IR_CARG
) {
138 if (!irref_isk(ir
->op2
)) irt_clearmark(IR(ir
->op2
)->t
);
139 if (irref_isk(ir
->op1
)) break;
141 irt_clearmark(ir
->t
);
146 for (s
= J
->cur
.nsnap
-1; s
>= onsnap
; s
--) {
147 SnapShot
*snap
= &J
->cur
.snap
[s
];
148 SnapEntry
*map
= &J
->cur
.snapmap
[snap
->mapofs
];
149 MSize n
, nent
= snap
->nent
;
150 for (n
= 0; n
< nent
; n
++) {
151 IRRef ref
= snap_ref(map
[n
]);
152 if (!irref_isk(ref
)) irt_clearmark(IR(ref
)->t
);
156 /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
157 nslots
= J
->baseslot
+J
->maxslot
;
158 for (i
= 1; i
< nslots
; i
++) {
159 IRRef ref
= tref_ref(J
->slot
[i
]);
160 while (!irref_isk(ref
) && ref
!= subst
[ref
]) {
162 irt_clearmark(ir
->t
); /* Unmark potential uses, too. */
163 if (irt_isphi(ir
->t
) || irt_ispri(ir
->t
))
166 if (nphi
>= LJ_MAX_PHI
)
167 lj_trace_err(J
, LJ_TRERR_PHIOV
);
168 phi
[nphi
++] = (IRRef1
)ref
;
174 /* Pass #4: propagate non-redundant PHIs. */
177 for (i
= 0; i
< nphi
; i
++) {
179 IRIns
*ir
= IR(lref
);
180 if (!irt_ismarked(ir
->t
)) { /* Propagate only from unmarked PHIs. */
181 IRIns
*irr
= IR(subst
[lref
]);
182 if (irt_ismarked(irr
->t
)) { /* Right ref points to other PHI? */
183 irt_clearmark(irr
->t
); /* Mark that PHI as non-redundant. */
184 passx
= 1; /* Retry. */
189 /* Pass #5: emit PHI instructions or eliminate PHIs. */
190 for (i
= 0; i
< nphi
; i
++) {
192 IRIns
*ir
= IR(lref
);
193 if (!irt_ismarked(ir
->t
)) { /* Emit PHI if not marked. */
194 IRRef rref
= subst
[lref
];
196 irt_setphi(IR(rref
)->t
);
197 emitir_raw(IRT(IR_PHI
, irt_type(ir
->t
)), lref
, rref
);
198 } else { /* Otherwise eliminate PHI. */
199 irt_clearmark(ir
->t
);
205 /* -- Loop unrolling using copy-substitution ------------------------------ */
207 /* Copy-substitute snapshot. */
208 static void loop_subst_snap(jit_State
*J
, SnapShot
*osnap
,
209 SnapEntry
*loopmap
, IRRef1
*subst
)
211 SnapEntry
*nmap
, *omap
= &J
->cur
.snapmap
[osnap
->mapofs
];
212 SnapEntry
*nextmap
= &J
->cur
.snapmap
[snap_nextofs(&J
->cur
, osnap
)];
214 MSize on
, ln
, nn
, onent
= osnap
->nent
;
215 BCReg nslots
= osnap
->nslots
;
216 SnapShot
*snap
= &J
->cur
.snap
[J
->cur
.nsnap
];
217 if (irt_isguard(J
->guardemit
)) { /* Guard inbetween? */
218 nmapofs
= J
->cur
.nsnapmap
;
219 J
->cur
.nsnap
++; /* Add new snapshot. */
220 } else { /* Otherwise overwrite previous snapshot. */
222 nmapofs
= snap
->mapofs
;
224 J
->guardemit
.irt
= 0;
225 /* Setup new snapshot. */
226 snap
->mapofs
= (uint32_t)nmapofs
;
227 snap
->ref
= (IRRef1
)J
->cur
.nins
;
229 snap
->nslots
= nslots
;
230 snap
->topslot
= osnap
->topslot
;
232 nmap
= &J
->cur
.snapmap
[nmapofs
];
233 /* Substitute snapshot slots. */
236 SnapEntry osn
= omap
[on
], lsn
= loopmap
[ln
];
237 if (snap_slot(lsn
) < snap_slot(osn
)) { /* Copy slot from loop map. */
240 } else { /* Copy substituted slot from snapshot map. */
241 if (snap_slot(lsn
) == snap_slot(osn
)) ln
++; /* Shadowed loop slot. */
242 if (!irref_isk(snap_ref(osn
)))
243 osn
= snap_setref(osn
, subst
[snap_ref(osn
)]);
248 while (snap_slot(loopmap
[ln
]) < nslots
) /* Copy remaining loop slots. */
249 nmap
[nn
++] = loopmap
[ln
++];
250 snap
->nent
= (uint8_t)nn
;
253 while (omap
< nextmap
) /* Copy PC + frame links. */
255 J
->cur
.nsnapmap
= (uint32_t)(nmap
- J
->cur
.snapmap
);
258 typedef struct LoopState
{
265 static void loop_unroll(LoopState
*lps
)
267 jit_State
*J
= lps
->J
;
268 IRRef1 phi
[LJ_MAX_PHI
];
272 SnapShot
*osnap
, *loopsnap
;
273 SnapEntry
*loopmap
, *psentinel
;
276 /* Allocate substitution table.
277 ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
280 lps
->sizesubst
= invar
- REF_BIAS
;
281 lps
->subst
= lj_mem_newvec(J
->L
, lps
->sizesubst
, IRRef1
);
282 subst
= lps
->subst
- REF_BIAS
;
283 subst
[REF_BASE
] = REF_BASE
;
285 /* LOOP separates the pre-roll from the loop body. */
286 emitir_raw(IRTG(IR_LOOP
, IRT_NIL
), 0, 0);
288 /* Grow snapshot buffer and map for copy-substituted snapshots.
289 ** Need up to twice the number of snapshots minus #0 and loop snapshot.
290 ** Need up to twice the number of entries plus fallback substitutions
291 ** from the loop snapshot entries for each new snapshot.
292 ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
294 onsnap
= J
->cur
.nsnap
;
295 lj_snap_grow_buf(J
, 2*onsnap
-2);
296 lj_snap_grow_map(J
, J
->cur
.nsnapmap
*2+(onsnap
-2)*J
->cur
.snap
[onsnap
-1].nent
);
298 /* The loop snapshot is used for fallback substitutions. */
299 loopsnap
= &J
->cur
.snap
[onsnap
-1];
300 loopmap
= &J
->cur
.snapmap
[loopsnap
->mapofs
];
301 /* The PC of snapshot #0 and the loop snapshot must match. */
302 psentinel
= &loopmap
[loopsnap
->nent
];
303 lj_assertJ(*psentinel
== J
->cur
.snapmap
[J
->cur
.snap
[0].nent
],
304 "mismatched PC for loop snapshot");
305 *psentinel
= SNAP(255, 0, 0); /* Replace PC with temporary sentinel. */
307 /* Start substitution with snapshot #1 (#0 is empty for root traces). */
308 osnap
= &J
->cur
.snap
[1];
310 /* Copy and substitute all recorded instructions and snapshots. */
311 for (ins
= REF_FIRST
; ins
< invar
; ins
++) {
315 if (ins
>= osnap
->ref
) /* Instruction belongs to next snapshot? */
316 loop_subst_snap(J
, osnap
++, loopmap
, subst
); /* Copy-substitute it. */
318 /* Substitute instruction operands. */
321 if (!irref_isk(op1
)) op1
= subst
[op1
];
323 if (!irref_isk(op2
)) op2
= subst
[op2
];
324 if (irm_kind(lj_ir_mode
[ir
->o
]) == IRM_N
&&
325 op1
== ir
->op1
&& op2
== ir
->op2
) { /* Regular invariant ins? */
326 subst
[ins
] = (IRRef1
)ins
; /* Shortcut. */
328 /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
329 IRType1 t
= ir
->t
; /* Get this first, since emitir may invalidate ir. */
330 IRRef ref
= tref_ref(emitir(ir
->ot
& ~IRT_ISPHI
, op1
, op2
));
331 subst
[ins
] = (IRRef1
)ref
;
333 IRIns
*irr
= IR(ref
);
334 if (ref
< invar
) { /* Loop-carried dependency? */
336 if (!irref_isk(ref
) && !irt_isphi(irr
->t
) && !irt_ispri(irr
->t
)) {
338 if (nphi
>= LJ_MAX_PHI
)
339 lj_trace_err(J
, LJ_TRERR_PHIOV
);
340 phi
[nphi
++] = (IRRef1
)ref
;
342 /* Check all loop-carried dependencies for type instability. */
343 if (!irt_sametype(t
, irr
->t
)) {
344 if (irt_isinteger(t
) && irt_isinteger(irr
->t
))
346 else if (irt_isnum(t
) && irt_isinteger(irr
->t
)) /* Fix int->num. */
347 ref
= tref_ref(emitir(IRTN(IR_CONV
), ref
, IRCONV_NUM_INT
));
348 else if (irt_isnum(irr
->t
) && irt_isinteger(t
)) /* Fix num->int. */
349 ref
= tref_ref(emitir(IRTGI(IR_CONV
), ref
,
350 IRCONV_INT_NUM
|IRCONV_CHECK
));
352 lj_trace_err(J
, LJ_TRERR_TYPEINS
);
353 subst
[ins
] = (IRRef1
)ref
;
357 } else if (ref
!= REF_DROP
&& ref
> invar
&&
358 ((irr
->o
== IR_CONV
&& irr
->op1
< invar
) ||
359 (irr
->o
== IR_ALEN
&& irr
->op2
< invar
&&
360 irr
->op2
!= REF_NIL
))) {
361 /* May need an extra PHI for a CONV or ALEN hint. */
362 ref
= irr
->o
== IR_CONV
? irr
->op1
: irr
->op2
;
365 if (ref
< invar
&& !irref_isk(ref
) && !irt_isphi(irr
->t
)) {
367 if (nphi
>= LJ_MAX_PHI
)
368 lj_trace_err(J
, LJ_TRERR_PHIOV
);
369 phi
[nphi
++] = (IRRef1
)ref
;
375 if (!irt_isguard(J
->guardemit
)) /* Drop redundant snapshot. */
376 J
->cur
.nsnapmap
= (uint32_t)J
->cur
.snap
[--J
->cur
.nsnap
].mapofs
;
377 lj_assertJ(J
->cur
.nsnapmap
<= J
->sizesnapmap
, "bad snapshot map index");
378 *psentinel
= J
->cur
.snapmap
[J
->cur
.snap
[0].nent
]; /* Restore PC. */
380 loop_emit_phi(J
, subst
, phi
, nphi
, onsnap
);
383 /* Undo any partial changes made by the loop optimization. */
384 static void loop_undo(jit_State
*J
, IRRef ins
, SnapNo nsnap
, MSize nsnapmap
)
387 SnapShot
*snap
= &J
->cur
.snap
[nsnap
-1];
388 SnapEntry
*map
= J
->cur
.snapmap
;
389 map
[snap
->mapofs
+ snap
->nent
] = map
[J
->cur
.snap
[0].nent
]; /* Restore PC. */
390 J
->cur
.nsnapmap
= (uint32_t)nsnapmap
;
391 J
->cur
.nsnap
= nsnap
;
392 J
->guardemit
.irt
= 0;
393 lj_ir_rollback(J
, ins
);
394 for (i
= 0; i
< BPROP_SLOTS
; i
++) { /* Remove backprop. cache entries. */
395 BPropEntry
*bp
= &J
->bpropcache
[i
];
399 for (ins
--; ins
>= REF_FIRST
; ins
--) { /* Remove flags. */
402 irt_clearmark(ir
->t
);
406 /* Protected callback for loop optimization. */
407 static TValue
*cploop_opt(lua_State
*L
, lua_CFunction dummy
, void *ud
)
409 UNUSED(L
); UNUSED(dummy
);
410 loop_unroll((LoopState
*)ud
);
414 /* Loop optimization. */
415 int lj_opt_loop(jit_State
*J
)
417 IRRef nins
= J
->cur
.nins
;
418 SnapNo nsnap
= J
->cur
.nsnap
;
419 MSize nsnapmap
= J
->cur
.nsnapmap
;
425 errcode
= lj_vm_cpcall(J
->L
, NULL
, &lps
, cploop_opt
);
426 lj_mem_freevec(J2G(J
), lps
.subst
, lps
.sizesubst
, IRRef1
);
427 if (LJ_UNLIKELY(errcode
)) {
429 if (errcode
== LUA_ERRRUN
&& tvisnumber(L
->top
-1)) { /* Trace error? */
430 int32_t e
= numberVint(L
->top
-1);
431 switch ((TraceError
)e
) {
432 case LJ_TRERR_TYPEINS
: /* Type instability. */
433 case LJ_TRERR_GFAIL
: /* Guard would always fail. */
434 /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
435 if (--J
->instunroll
< 0) /* But do not unroll forever. */
437 L
->top
--; /* Remove error object. */
438 loop_undo(J
, nins
, nsnap
, nsnapmap
);
439 return 1; /* Loop optimization failed, continue recording. */
444 lj_err_throw(L
, errcode
); /* Propagate all other errors. */
446 return 0; /* Loop optimization is ok. */