src/lj_opt_split.c

   1 /*
   2 ** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
   3 ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
   4 */
   5
   6 #define lj_opt_split_c
   7 #define LUA_CORE
   8
   9 #include "lj_obj.h"
  10
  11 #if LJ_HASJIT && LJ_HASFFI && LJ_32
  12
  13 #include "lj_err.h"
  14 #include "lj_str.h"
  15 #include "lj_ir.h"
  16 #include "lj_jit.h"
  17 #include "lj_iropt.h"
  18 #include "lj_vm.h"
  19
  20 /* SPLIT pass:
  21 **
  22 ** This pass splits up 64 bit IR instructions into multiple 32 bit IR
  23 ** instructions. It's only active for 32 bit CPUs which lack native 64 bit
  24 ** operations. The FFI is currently the only emitter for 64 bit
  25 ** instructions, so this pass is disabled if the FFI is disabled.
  26 **
  27 ** Splitting the IR in a separate pass keeps each 32 bit IR assembler
  28 ** backend simple. Only a small amount of extra functionality needs to be
  29 ** implemented. This is much easier than adding support for allocating
  30 ** register pairs to each backend (believe me, I tried). A few simple, but
  31 ** important optimizations can be performed by the SPLIT pass, which would
  32 ** be tedious to do in the backend.
  33 **
  34 ** The basic idea is to replace each 64 bit IR instruction with its 32 bit
  35 ** equivalent plus an extra HIOP instruction. The splitted IR is not passed
  36 ** through FOLD or any other optimizations, so each HIOP is guaranteed to
  37 ** immediately follow it's counterpart. The actual functionality of HIOP is
  38 ** inferred from the previous instruction.
  39 **
  40 ** The operands of HIOP hold the hiword input references. The output of HIOP
  41 ** is the hiword output reference, which is also used to hold the hiword
  42 ** register or spill slot information. The register allocator treats this
  43 ** instruction independent of any other instruction, which improves code
  44 ** quality compared to using fixed register pairs.
  45 **
  46 ** It's easier to split up some instructions into two regular 32 bit
  47 ** instructions. E.g. XLOAD is split up into two XLOADs with two different
  48 ** addresses. Obviously 64 bit constants need to be split up into two 32 bit
  49 ** constants, too. Some hiword instructions can be entirely omitted, e.g.
  50 ** when zero-extending a 32 bit value to 64 bits.
  51 **
  52 ** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
  53 ** two int64_t fields:
  54 **
  55 ** 0100    p32 ADD    base  +8
  56 ** 0101    i64 XLOAD  0100
  57 ** 0102    i64 ADD    0101  +1
  58 ** 0103    p32 ADD    base  +16
  59 ** 0104    i64 XSTORE 0103  0102
  60 **
  61 **         mov rax, [esi+0x8]
  62 **         add rax, +0x01
  63 **         mov [esi+0x10], rax
  64 **
  65 ** Here's the transformed IR and the x86 machine code after the SPLIT pass:
  66 **
  67 ** 0100    p32 ADD    base  +8
  68 ** 0101    int XLOAD  0100
  69 ** 0102    p32 ADD    base  +12
  70 ** 0103    int XLOAD  0102
  71 ** 0104    int ADD    0101  +1
  72 ** 0105    int HIOP   0103  +0
  73 ** 0106    p32 ADD    base  +16
  74 ** 0107    int XSTORE 0106  0104
  75 ** 0108    p32 ADD    base  +20
  76 ** 0109    int XSTORE 0108  0105
  77 **
  78 **         mov eax, [esi+0x8]
  79 **         mov ecx, [esi+0xc]
  80 **         add eax, +0x01
  81 **         adc ecx, +0x00
  82 **         mov [esi+0x10], eax
  83 **         mov [esi+0x14], ecx
  84 **
  85 ** You may notice the reassociated hiword address computation, which is
  86 ** later fused into the mov operands by the assembler.
  87 */
  88
  89 /* Some local macros to save typing. Undef'd at the end. */
  90 #define IR(ref)         (&J->cur.ir[(ref)])
  91
  92 /* Directly emit the transformed IR without updating chains etc. */
  93 static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2)
  94 {
  95   IRRef nref = lj_ir_nextins(J);
  96   IRIns *ir = IR(nref);
  97   ir->ot = ot;
  98   ir->op1 = op1;
  99   ir->op2 = op2;
 100   return nref;
 101 }
 102
 103 /* Emit a CALLN with two split 64 bit arguments. */
 104 static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
 105                           IRIns *ir, IRCallID id)
 106 {
 107   IRRef tmp, op1 = ir->op1, op2 = ir->op2;
 108   J->cur.nins--;
 109 #if LJ_LE
 110   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
 111   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
 112   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
 113 #else
 114   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
 115   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
 116   tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
 117 #endif
 118   ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
 119   return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
 120 }
 121
 122 /* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */
 123 static IRRef split_ptr(jit_State *J, IRRef ref)
 124 {
 125   IRIns *ir = IR(ref);
 126   int32_t ofs = 4;
 127   if (ir->o == IR_ADD && irref_isk(ir->op2)) {  /* Reassociate address. */
 128     ofs += IR(ir->op2)->i;
 129     ref = ir->op1;
 130     if (ofs == 0) return ref;
 131   }
 132   return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs));
 133 }
 134
 135 /* Transform the old IR to the new IR. */
 136 static void split_ir(jit_State *J)
 137 {
 138   IRRef nins = J->cur.nins, nk = J->cur.nk;
 139   MSize irlen = nins - nk;
 140   MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
 141   IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
 142   IRRef1 *hisubst;
 143   IRRef ref;
 144
 145   /* Copy old IR to buffer. */
 146   memcpy(oir, IR(nk), irlen*sizeof(IRIns));
 147   /* Bias hiword substitution table and old IR. Loword kept in field prev. */
 148   hisubst = (IRRef1 *)&oir[irlen] - nk;
 149   oir -= nk;
 150
 151   /* Remove all IR instructions, but retain IR constants. */
 152   J->cur.nins = REF_FIRST;
 153
 154   /* Process constants and fixed references. */
 155   for (ref = nk; ref <= REF_BASE; ref++) {
 156     IRIns *ir = &oir[ref];
 157     if (ir->o == IR_KINT64) {  /* Split up 64 bit constant. */
 158       TValue tv = *ir_k64(ir);
 159       ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
 160       hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
 161     } else {
 162       ir->prev = (IRRef1)ref;  /* Identity substitution for loword. */
 163     }
 164   }
 165
 166   /* Process old IR instructions. */
 167   for (ref = REF_FIRST; ref < nins; ref++) {
 168     IRIns *ir = &oir[ref];
 169     IRRef nref = lj_ir_nextins(J);
 170     IRIns *nir = IR(nref);
 171
 172     /* Copy-substitute old instruction to new instruction. */
 173     nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
 174     nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
 175     ir->prev = nref;  /* Loword substitution. */
 176     nir->o = ir->o;
 177     nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);
 178
 179     /* Split 64 bit instructions. */
 180     if (irt_isint64(ir->t)) {
 181       IRRef hi = hisubst[ir->op1];
 182       nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD);  /* Turn into INT op. */
 183       switch (ir->o) {
 184       case IR_ADD:
 185       case IR_SUB:
 186         /* Use plain op for hiword if loword cannot produce a carry/borrow. */
 187         if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
 188           ir->prev = nir->op1;  /* Pass through loword. */
 189           nir->op1 = hi; nir->op2 = hisubst[ir->op2];
 190           hi = nref;
 191           break;
 192         }
 193         /* fallthrough */
 194       case IR_NEG:
 195         hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
 196         break;
 197       case IR_MUL:
 198         hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
 199         break;
 200       case IR_POWI:
 201         hi = split_call64(J, hisubst, oir, ir,
 202                           irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
 203                                              IRCALL_lj_carith_powu64);
 204         break;
 205       case IR_XLOAD:
 206         hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
 207 #if LJ_BE
 208         ir->prev = hi; hi = nref;
 209 #endif
 210         break;
 211       case IR_XSTORE:
 212 #if LJ_LE
 213         hi = hisubst[ir->op2];
 214 #else
 215         hi = nir->op2; nir->op2 = hisubst[ir->op2];
 216 #endif
 217         split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
 218         continue;
 219       case IR_CONV: {  /* Conversion to 64 bit integer. Others handled below. */
 220         IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
 221         if (st == IRT_NUM || st == IRT_FLOAT) {  /* FP to 64 bit int conv. */
 222           hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
 223         } else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
 224           /* Drop cast, since assembler doesn't care. */
 225           hisubst[ref] = hi;
 226           goto fwdlo;
 227         } else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
 228           IRRef k31 = lj_ir_kint(J, 31);
 229           nir = IR(nref);  /* May have been reallocated. */
 230           ir->prev = nir->op1;  /* Pass through loword. */
 231           nir->o = IR_BSAR;  /* hi = bsar(lo, 31). */
 232           nir->op2 = k31;
 233           hi = nref;
 234         } else {  /* Zero-extend to 64 bit. */
 235           hisubst[ref] = lj_ir_kint(J, 0);
 236           goto fwdlo;
 237         }
 238         break;
 239         }
 240       case IR_PHI: {
 241         IRRef hi2;
 242         if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
 243             nir->op1 == nir->op2)
 244           J->cur.nins--;  /* Drop useless PHIs. */
 245         hi2 = hisubst[ir->op2];
 246         if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
 247           split_emit(J, IRTI(IR_PHI), hi, hi2);
 248         continue;
 249         }
 250       default:
 251         lua_assert(ir->o <= IR_NE);
 252         split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
 253         continue;
 254       }
 255       hisubst[ref] = hi;  /* Store hiword substitution. */
 256     } else if (ir->o == IR_CONV) {  /* See above, too. */
 257       IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
 258       if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
 259         if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
 260           ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
 261                                 hisubst[ir->op1], nref);
 262         } else {  /* Truncate to lower 32 bits. */
 263         fwdlo:
 264           ir->prev = nir->op1;  /* Forward loword. */
 265           /* Replace with NOP to avoid messing up the snapshot logic. */
 266           nir->ot = IRT(IR_NOP, IRT_NIL);
 267           nir->op1 = nir->op2 = 0;
 268         }
 269       }
 270     } else if (ir->o == IR_LOOP) {
 271       J->loopref = nref;  /* Needed by assembler. */
 272     }
 273   }
 274
 275   /* Add PHI marks. */
 276   for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
 277     IRIns *ir = IR(ref);
 278     if (ir->o != IR_PHI) break;
 279     if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
 280     if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
 281   }
 282
 283   /* Substitute snapshot maps. */
 284   oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
 285   {
 286     SnapNo i, nsnap = J->cur.nsnap;
 287     for (i = 0; i < nsnap; i++) {
 288       SnapShot *snap = &J->cur.snap[i];
 289       SnapEntry *map = &J->cur.snapmap[snap->mapofs];
 290       MSize n, nent = snap->nent;
 291       snap->ref = oir[snap->ref].prev;
 292       for (n = 0; n < nent; n++) {
 293         SnapEntry sn = map[n];
 294         map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
 295       }
 296     }
 297   }
 298 }
 299
 300 /* Protected callback for split pass. */
 301 static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
 302 {
 303   jit_State *J = (jit_State *)ud;
 304   split_ir(J);
 305   UNUSED(L); UNUSED(dummy);
 306   return NULL;
 307 }
 308
 309 #ifdef LUA_USE_ASSERT
 310 /* Slow, but sure way to check whether a SPLIT pass is needed. */
 311 static int split_needsplit(jit_State *J)
 312 {
 313   IRIns *ir, *irend;
 314   IRRef ref;
 315   for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++)
 316     if (irt_isint64(ir->t))
 317       return 1;
 318   for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev)
 319     if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 ||
 320         (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64)
 321       return 1;
 322   return 0;  /* Nope. */
 323 }
 324 #endif
 325
 326 /* SPLIT pass. */
 327 void lj_opt_split(jit_State *J)
 328 {
 329   lua_assert(J->needsplit >= split_needsplit(J));  /* Verify flag. */
 330   if (J->needsplit) {
 331     int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit);
 332     if (errcode) {
 333       /* Completely reset the trace to avoid inconsistent dump on abort. */
 334       J->cur.nins = J->cur.nk = REF_BASE;
 335       J->cur.nsnap = 0;
 336       lj_err_throw(J->L, errcode);  /* Propagate errors. */
 337     }
 338   }
 339 }
 340
 341 #undef IR
 342
 343 #endif