2 ** LOOP: Loop Optimizations.
3 ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
24 ** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
25 ** of a loop into invariant and variant instructions. The invariant
26 ** instructions are hoisted out of the loop and only the variant
27 ** instructions remain inside the loop body.
29 ** Unfortunately LICM is mostly useless for compiling dynamic languages.
30 ** The IR has many guards and most of the subsequent instructions are
31 ** control-dependent on them. The first non-hoistable guard would
32 ** effectively prevent hoisting of all subsequent instructions.
34 ** That's why we use a special form of unrolling using copy-substitution,
35 ** combined with redundancy elimination:
37 ** The recorded instruction stream is re-emitted to the compiler pipeline
38 ** with substituted operands. The substitution table is filled with the
39 ** refs returned by re-emitting each instruction. This can be done
40 ** on-the-fly, because the IR is in strict SSA form, where every ref is
41 ** defined before its use.
43 ** This approach generates two code sections, separated by the LOOP
46 ** 1. The recorded instructions form a kind of pre-roll for the loop. It
47 ** contains a mix of invariant and variant instructions and performs
48 ** exactly one loop iteration (but not necessarily the 1st iteration).
50 ** 2. The loop body contains only the variant instructions and performs
51 ** all remaining loop iterations.
53 ** On first sight that looks like a waste of space, because the variant
54 ** instructions are present twice. But the key insight is that the
55 ** pre-roll honors the control-dependencies for *both* the pre-roll itself
56 ** *and* the loop body!
58 ** It also means one doesn't have to explicitly model control-dependencies
59 ** (which, BTW, wouldn't help LICM much). And it's much easier to
60 ** integrate sparse snapshotting with this approach.
62 ** One of the nicest aspects of this approach is that all of the
63 ** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
64 ** reused with only minor restrictions (e.g. one should not fold
65 ** instructions across loop-carried dependencies).
67 ** But in general all optimizations can be applied which only need to look
68 ** backwards into the generated instruction stream. At any point in time
69 ** during the copy-substitution process this contains both a static loop
70 ** iteration (the pre-roll) and a dynamic one (from the to-be-copied
71 ** instruction up to the end of the partial loop body).
73 ** Since control-dependencies are implicitly kept, CSE also applies to all
74 ** kinds of guards. The major advantage is that all invariant guards can
75 ** be hoisted, too.
77 ** Load/store forwarding works across loop iterations, too. This is
78 ** important if loop-carried dependencies are kept in upvalues or tables.
79 ** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
80 ** become a forwarded loop-recurrence after inlining.
82 ** Since the IR is in SSA form, loop-carried dependencies have to be
83 ** modeled with PHI instructions. The potential candidates for PHIs are
84 ** collected on-the-fly during copy-substitution. After eliminating the
85 ** redundant ones, PHI instructions are emitted *below* the loop body.
87 ** Note that this departure from traditional SSA form doesn't change the
88 ** semantics of the PHI instructions themselves. But it greatly simplifies
89 ** on-the-fly generation of the IR and the machine code.
92 /* Some local macros to save typing. Undef'd at the end. */
93 #define IR(ref) (&J->cur.ir[(ref)])
95 /* Pass IR on to next optimization in chain (FOLD). */
96 #define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
98 /* Emit raw IR without passing through optimizations. */
99 #define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
101 /* -- PHI elimination ----------------------------------------------------- */
103 /* Emit or eliminate collected PHIs. */
104 static void loop_emit_phi(jit_State
*J
, IRRef1
*subst
, IRRef1
*phi
, IRRef nphi
,
109 IRRef invar
= J
->chain
[IR_LOOP
];
110 /* Pass #1: mark redundant and potentially redundant PHIs. */
111 for (i
= 0, j
= 0; i
< nphi
; i
++) {
113 IRRef rref
= subst
[lref
];
114 if (lref
== rref
|| rref
== REF_DROP
) { /* Invariants are redundant. */
115 irt_clearphi(IR(lref
)->t
);
117 phi
[j
++] = (IRRef1
)lref
;
118 if (!(IR(rref
)->op1
== lref
|| IR(rref
)->op2
== lref
)) {
119 /* Quick check for simple recurrences failed, need pass2. */
120 irt_setmark(IR(lref
)->t
);
126 /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
129 for (i
= J
->cur
.nins
-1; i
> invar
; i
--) {
131 if (!irref_isk(ir
->op2
)) irt_clearmark(IR(ir
->op2
)->t
);
132 if (!irref_isk(ir
->op1
)) {
133 irt_clearmark(IR(ir
->op1
)->t
);
134 if (ir
->op1
< invar
&&
135 ir
->o
>= IR_CALLN
&& ir
->o
<= IR_CARG
) { /* ORDER IR */
137 while (ir
->o
== IR_CARG
) {
138 if (!irref_isk(ir
->op2
)) irt_clearmark(IR(ir
->op2
)->t
);
139 if (irref_isk(ir
->op1
)) break;
141 irt_clearmark(ir
->t
);
146 for (s
= J
->cur
.nsnap
-1; s
>= onsnap
; s
--) {
147 SnapShot
*snap
= &J
->cur
.snap
[s
];
148 SnapEntry
*map
= &J
->cur
.snapmap
[snap
->mapofs
];
149 MSize n
, nent
= snap
->nent
;
150 for (n
= 0; n
< nent
; n
++) {
151 IRRef ref
= snap_ref(map
[n
]);
152 if (!irref_isk(ref
)) irt_clearmark(IR(ref
)->t
);
156 /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
157 nslots
= J
->baseslot
+J
->maxslot
;
158 for (i
= 1; i
< nslots
; i
++) {
159 IRRef ref
= tref_ref(J
->slot
[i
]);
160 while (!irref_isk(ref
) && ref
!= subst
[ref
]) {
162 irt_clearmark(ir
->t
); /* Unmark potential uses, too. */
163 if (irt_isphi(ir
->t
) || irt_ispri(ir
->t
))
166 if (nphi
>= LJ_MAX_PHI
)
167 lj_trace_err(J
, LJ_TRERR_PHIOV
);
168 phi
[nphi
++] = (IRRef1
)ref
;
174 /* Pass #4: propagate non-redundant PHIs. */
177 for (i
= 0; i
< nphi
; i
++) {
179 IRIns
*ir
= IR(lref
);
180 if (!irt_ismarked(ir
->t
)) { /* Propagate only from unmarked PHIs. */
181 IRIns
*irr
= IR(subst
[lref
]);
182 if (irt_ismarked(irr
->t
)) { /* Right ref points to other PHI? */
183 irt_clearmark(irr
->t
); /* Mark that PHI as non-redundant. */
184 passx
= 1; /* Retry. */
189 /* Pass #5: emit PHI instructions or eliminate PHIs. */
190 for (i
= 0; i
< nphi
; i
++) {
192 IRIns
*ir
= IR(lref
);
193 if (!irt_ismarked(ir
->t
)) { /* Emit PHI if not marked. */
194 IRRef rref
= subst
[lref
];
196 irt_setphi(IR(rref
)->t
);
197 emitir_raw(IRT(IR_PHI
, irt_type(ir
->t
)), lref
, rref
);
198 } else { /* Otherwise eliminate PHI. */
199 irt_clearmark(ir
->t
);
205 /* -- Loop unrolling using copy-substitution ------------------------------ */
207 /* Copy-substitute snapshot. */
208 static void loop_subst_snap(jit_State
*J
, SnapShot
*osnap
,
209 SnapEntry
*loopmap
, IRRef1
*subst
)
211 SnapEntry
*nmap
, *omap
= &J
->cur
.snapmap
[osnap
->mapofs
];
212 SnapEntry
*nextmap
= &J
->cur
.snapmap
[snap_nextofs(&J
->cur
, osnap
)];
214 MSize on
, ln
, nn
, onent
= osnap
->nent
;
215 BCReg nslots
= osnap
->nslots
;
216 SnapShot
*snap
= &J
->cur
.snap
[J
->cur
.nsnap
];
217 if (irt_isguard(J
->guardemit
)) { /* Guard inbetween? */
218 nmapofs
= J
->cur
.nsnapmap
;
219 J
->cur
.nsnap
++; /* Add new snapshot. */
220 } else { /* Otherwise overwrite previous snapshot. */
222 nmapofs
= snap
->mapofs
;
224 J
->guardemit
.irt
= 0;
225 /* Setup new snapshot. */
226 snap
->mapofs
= (uint32_t)nmapofs
;
227 snap
->ref
= (IRRef1
)J
->cur
.nins
;
229 snap
->nslots
= nslots
;
230 snap
->topslot
= osnap
->topslot
;
232 nmap
= &J
->cur
.snapmap
[nmapofs
];
233 /* Substitute snapshot slots. */
236 SnapEntry osn
= omap
[on
], lsn
= loopmap
[ln
];
237 if (snap_slot(lsn
) < snap_slot(osn
)) { /* Copy slot from loop map. */
240 } else { /* Copy substituted slot from snapshot map. */
241 if (snap_slot(lsn
) == snap_slot(osn
)) ln
++; /* Shadowed loop slot. */
242 if (!irref_isk(snap_ref(osn
)))
243 osn
= snap_setref(osn
, subst
[snap_ref(osn
)]);
248 while (snap_slot(loopmap
[ln
]) < nslots
) /* Copy remaining loop slots. */
249 nmap
[nn
++] = loopmap
[ln
++];
250 snap
->nent
= (uint8_t)nn
;
253 while (omap
< nextmap
) /* Copy PC + frame links. */
255 J
->cur
.nsnapmap
= (uint32_t)(nmap
- J
->cur
.snapmap
);
258 typedef struct LoopState
{
265 static void loop_unroll(LoopState
*lps
)
267 jit_State
*J
= lps
->J
;
268 IRRef1 phi
[LJ_MAX_PHI
];
272 SnapShot
*osnap
, *loopsnap
;
273 SnapEntry
*loopmap
, *psentinel
;
276 /* Allocate substitution table.
277 ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
280 lps
->sizesubst
= invar
- REF_BIAS
;
281 lps
->subst
= lj_mem_newvec(J
->L
, lps
->sizesubst
, IRRef1
);
282 subst
= lps
->subst
- REF_BIAS
;
283 subst
[REF_BASE
] = REF_BASE
;
285 /* LOOP separates the pre-roll from the loop body. */
286 emitir_raw(IRTG(IR_LOOP
, IRT_NIL
), 0, 0);
288 /* Grow snapshot buffer and map for copy-substituted snapshots.
289 ** Need up to twice the number of snapshots minus #0 and loop snapshot.
290 ** Need up to twice the number of entries plus fallback substitutions
291 ** from the loop snapshot entries for each new snapshot.
292 ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
294 onsnap
= J
->cur
.nsnap
;
295 lj_snap_grow_buf(J
, 2*onsnap
-2);
296 lj_snap_grow_map(J
, J
->cur
.nsnapmap
*2+(onsnap
-2)*J
->cur
.snap
[onsnap
-1].nent
);
298 /* The loop snapshot is used for fallback substitutions. */
299 loopsnap
= &J
->cur
.snap
[onsnap
-1];
300 loopmap
= &J
->cur
.snapmap
[loopsnap
->mapofs
];
301 /* The PC of snapshot #0 and the loop snapshot must match. */
302 psentinel
= &loopmap
[loopsnap
->nent
];
303 lj_assertJ(*psentinel
== J
->cur
.snapmap
[J
->cur
.snap
[0].nent
],
304 "mismatched PC for loop snapshot");
305 *psentinel
= SNAP(255, 0, 0); /* Replace PC with temporary sentinel. */
307 /* Start substitution with snapshot #1 (#0 is empty for root traces). */
308 osnap
= &J
->cur
.snap
[1];
310 /* Copy and substitute all recorded instructions and snapshots. */
311 for (ins
= REF_FIRST
; ins
< invar
; ins
++) {
315 if (ins
>= osnap
->ref
) /* Instruction belongs to next snapshot? */
316 loop_subst_snap(J
, osnap
++, loopmap
, subst
); /* Copy-substitute it. */
318 /* Substitute instruction operands. */
321 if (!irref_isk(op1
)) op1
= subst
[op1
];
323 if (!irref_isk(op2
)) op2
= subst
[op2
];
324 if (irm_kind(lj_ir_mode
[ir
->o
]) == IRM_N
&&
325 op1
== ir
->op1
&& op2
== ir
->op2
) { /* Regular invariant ins? */
326 subst
[ins
] = (IRRef1
)ins
; /* Shortcut. */
328 /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
329 IRType1 t
= ir
->t
; /* Get this first, since emitir may invalidate ir. */
330 IRRef ref
= tref_ref(emitir(ir
->ot
& ~IRT_ISPHI
, op1
, op2
));
331 subst
[ins
] = (IRRef1
)ref
;
333 IRIns
*irr
= IR(ref
);
334 if (ref
< invar
) { /* Loop-carried dependency? */
336 if (!irref_isk(ref
) && !irt_isphi(irr
->t
) && !irt_ispri(irr
->t
)) {
338 if (nphi
>= LJ_MAX_PHI
)
339 lj_trace_err(J
, LJ_TRERR_PHIOV
);
340 phi
[nphi
++] = (IRRef1
)ref
;
342 /* Check all loop-carried dependencies for type instability. */
343 if (!irt_sametype(t
, irr
->t
)) {
344 if (irt_isinteger(t
) && irt_isinteger(irr
->t
))
346 else if (irt_isnum(t
) && irt_isinteger(irr
->t
)) /* Fix int->num. */
347 ref
= tref_ref(emitir(IRTN(IR_CONV
), ref
, IRCONV_NUM_INT
));
348 else if (irt_isnum(irr
->t
) && irt_isinteger(t
)) /* Fix num->int. */
349 ref
= tref_ref(emitir(IRTGI(IR_CONV
), ref
,
350 IRCONV_INT_NUM
|IRCONV_CHECK
));
352 lj_trace_err(J
, LJ_TRERR_TYPEINS
);
353 subst
[ins
] = (IRRef1
)ref
;
357 } else if (ref
!= REF_DROP
&& ref
> invar
&&
358 ((irr
->o
== IR_CONV
&& irr
->op1
< invar
) ||
359 (irr
->o
== IR_ALEN
&& irr
->op2
< invar
&&
360 irr
->op2
!= REF_NIL
))) {
361 /* May need an extra PHI for a CONV or ALEN hint. */
362 ref
= irr
->o
== IR_CONV
? irr
->op1
: irr
->op2
;
365 if (ref
< invar
&& !irref_isk(ref
) && !irt_isphi(irr
->t
)) {
367 if (nphi
>= LJ_MAX_PHI
)
368 lj_trace_err(J
, LJ_TRERR_PHIOV
);
369 phi
[nphi
++] = (IRRef1
)ref
;
375 if (!irt_isguard(J
->guardemit
)) /* Drop redundant snapshot. */
376 J
->cur
.nsnapmap
= (uint32_t)J
->cur
.snap
[--J
->cur
.nsnap
].mapofs
;
377 lj_assertJ(J
->cur
.nsnapmap
<= J
->sizesnapmap
, "bad snapshot map index");
378 *psentinel
= J
->cur
.snapmap
[J
->cur
.snap
[0].nent
]; /* Restore PC. */
380 loop_emit_phi(J
, subst
, phi
, nphi
, onsnap
);
383 /* Undo any partial changes made by the loop optimization. */
384 static void loop_undo(jit_State
*J
, IRRef ins
, SnapNo nsnap
, MSize nsnapmap
)
387 SnapShot
*snap
= &J
->cur
.snap
[nsnap
-1];
388 SnapEntry
*map
= J
->cur
.snapmap
;
389 map
[snap
->mapofs
+ snap
->nent
] = map
[J
->cur
.snap
[0].nent
]; /* Restore PC. */
390 J
->cur
.nsnapmap
= (uint32_t)nsnapmap
;
391 J
->cur
.nsnap
= nsnap
;
392 J
->guardemit
.irt
= 0;
393 lj_ir_rollback(J
, ins
);
394 for (i
= 0; i
< BPROP_SLOTS
; i
++) { /* Remove backprop. cache entries. */
395 BPropEntry
*bp
= &J
->bpropcache
[i
];
399 for (ins
--; ins
>= REF_FIRST
; ins
--) { /* Remove flags. */
402 irt_clearmark(ir
->t
);
406 /* Protected callback for loop optimization. */
407 static TValue
*cploop_opt(lua_State
*L
, lua_CFunction dummy
, void *ud
)
409 UNUSED(L
); UNUSED(dummy
);
410 loop_unroll((LoopState
*)ud
);
414 /* Loop optimization. */
415 int lj_opt_loop(jit_State
*J
)
417 IRRef nins
= J
->cur
.nins
;
418 SnapNo nsnap
= J
->cur
.nsnap
;
419 MSize nsnapmap
= J
->cur
.nsnapmap
;
425 errcode
= lj_vm_cpcall(J
->L
, NULL
, &lps
, cploop_opt
);
426 lj_mem_freevec(J2G(J
), lps
.subst
, lps
.sizesubst
, IRRef1
);
427 if (LJ_UNLIKELY(errcode
)) {
429 if (errcode
== LUA_ERRRUN
&& tvisnumber(L
->top
-1)) { /* Trace error? */
430 int32_t e
= numberVint(L
->top
-1);
431 switch ((TraceError
)e
) {
432 case LJ_TRERR_TYPEINS
: /* Type instability. */
433 case LJ_TRERR_GFAIL
: /* Guard would always fail. */
434 /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
435 if (--J
->instunroll
< 0) /* But do not unroll forever. */
437 L
->top
--; /* Remove error object. */
438 loop_undo(J
, nins
, nsnap
, nsnapmap
);
439 return 1; /* Loop optimization failed, continue recording. */
444 lj_err_throw(L
, errcode
); /* Propagate all other errors. */
446 return 0; /* Loop optimization is ok. */