nanojit/Assembler.cpp

   1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
   2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is [Open Source Virtual Machine].
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Adobe System Incorporated.
  20  * Portions created by the Initial Developer are Copyright (C) 2004-2007
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Adobe AS3 Team
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either the GNU General Public License Version 2 or later (the "GPL"), or
  28  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 #include "nanojit.h"
  41
  42 #ifdef FEATURE_NANOJIT
  43
  44 #ifdef VTUNE
  45 #include "../core/CodegenLIR.h"
  46 #endif
  47
  48 #ifdef _MSC_VER
  49     // disable some specific warnings which are normally useful, but pervasive in the code-gen macros
  50     #pragma warning(disable:4310) // cast truncates constant value
  51 #endif
  52
  53 namespace nanojit
  54 {
  55     /**
  56      * Need the following:
  57      *
  58      *    - merging paths ( build a graph? ), possibly use external rep to drive codegen
  59      */
  60     Assembler::Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config)
  61         : codeList(NULL)
  62         , alloc(alloc)
  63         , _codeAlloc(codeAlloc)
  64         , _dataAlloc(dataAlloc)
  65         , _thisfrag(NULL)
  66         , _branchStateMap(alloc)
  67         , _patches(alloc)
  68         , _labels(alloc)
  69     #if NJ_USES_QUAD_CONSTANTS
  70         , _quadConstants(alloc)
  71     #endif
  72         , _epilogue(NULL)
  73         , _err(None)
  74     #if PEDANTIC
  75         , pedanticTop(NULL)
  76     #endif
  77     #ifdef VTUNE
  78         , cgen(NULL)
  79     #endif
  80         , _config(config)
  81     {
  82         VMPI_memset(&_stats, 0, sizeof(_stats));
  83         VMPI_memset(lookahead, 0, N_LOOKAHEAD * sizeof(LInsp));
  84         nInit(core);
  85         (void)logc;
  86         verbose_only( _logc = logc; )
  87         verbose_only( _outputCache = 0; )
  88         verbose_only( outline[0] = '\0'; )
  89         verbose_only( outlineEOL[0] = '\0'; )
  90
  91         reset();
  92     }
  93
  94 #ifdef _DEBUG
  95
  96     /*static*/ LIns* const AR::BAD_ENTRY = (LIns*)0xdeadbeef;
  97
  98     void AR::validateQuick()
  99     {
 100         NanoAssert(_highWaterMark < NJ_MAX_STACK_ENTRY);
 101         NanoAssert(_entries[0] == NULL);
 102         // Only check a few entries around _highWaterMark.
 103         uint32_t const RADIUS = 4;
 104         uint32_t const lo = (_highWaterMark > 1 + RADIUS ? _highWaterMark - RADIUS : 1);
 105         uint32_t const hi = (_highWaterMark + 1 + RADIUS < NJ_MAX_STACK_ENTRY ? _highWaterMark + 1 + RADIUS : NJ_MAX_STACK_ENTRY);
 106         for (uint32_t i = lo; i <= _highWaterMark; ++i)
 107             NanoAssert(_entries[i] != BAD_ENTRY);
 108         for (uint32_t i = _highWaterMark+1; i < hi; ++i)
 109             NanoAssert(_entries[i] == BAD_ENTRY);
 110     }
 111
 112     void AR::validateFull()
 113     {
 114         NanoAssert(_highWaterMark < NJ_MAX_STACK_ENTRY);
 115         NanoAssert(_entries[0] == NULL);
 116         for (uint32_t i = 1; i <= _highWaterMark; ++i)
 117             NanoAssert(_entries[i] != BAD_ENTRY);
 118         for (uint32_t i = _highWaterMark+1; i < NJ_MAX_STACK_ENTRY; ++i)
 119             NanoAssert(_entries[i] == BAD_ENTRY);
 120     }
 121
 122     void AR::validate()
 123     {
 124         static uint32_t validateCounter = 0;
 125         if (++validateCounter >= 100)
 126         {
 127             validateFull();
 128             validateCounter = 0;
 129         }
 130         else
 131         {
 132             validateQuick();
 133         }
 134     }
 135
 136 #endif
 137
 138     inline void AR::clear()
 139     {
 140         _highWaterMark = 0;
 141         NanoAssert(_entries[0] == NULL);
 142     #ifdef _DEBUG
 143         for (uint32_t i = 1; i < NJ_MAX_STACK_ENTRY; ++i)
 144             _entries[i] = BAD_ENTRY;
 145     #endif
 146     }
 147
 148     bool AR::Iter::next(LIns*& ins, uint32_t& nStackSlots, int32_t& arIndex)
 149     {
 150         while (_i <= _ar._highWaterMark) {
 151             ins = _ar._entries[_i];
 152             if (ins) {
 153                 arIndex = _i;
 154                 nStackSlots = nStackSlotsFor(ins);
 155                 _i += nStackSlots;
 156                 return true;
 157             }
 158             _i++;
 159         }
 160         ins = NULL;
 161         nStackSlots = 0;
 162         arIndex = 0;
 163         return false;
 164     }
 165
 166     void Assembler::arReset()
 167     {
 168         _activation.clear();
 169         _branchStateMap.clear();
 170         _patches.clear();
 171         _labels.clear();
 172     #if NJ_USES_QUAD_CONSTANTS
 173         _quadConstants.clear();
 174     #endif
 175     }
 176
 177     void Assembler::registerResetAll()
 178     {
 179         nRegisterResetAll(_allocator);
 180
 181         // At start, should have some registers free and none active.
 182         NanoAssert(0 != _allocator.free);
 183         NanoAssert(0 == _allocator.countActive());
 184 #ifdef NANOJIT_IA32
 185         debug_only(_fpuStkDepth = 0; )
 186 #endif
 187     }
 188
 189     // Legend for register sets: A = allowed, P = preferred, F = free, S = SavedReg.
 190     //
 191     // Finds a register in 'setA___' to store the result of 'ins' (one from
 192     // 'set_P__' if possible), evicting one if necessary.  Doesn't consider
 193     // the prior state of 'ins'.
 194     //
 195     // Nb: 'setA___' comes from the instruction's use, 'set_P__' comes from its def.
 196     // Eg. in 'add(call(...), ...)':
 197     //     - the call's use means setA___==GpRegs;
 198     //     - the call's def means set_P__==rmask(retRegs[0]).
 199     //
 200     Register Assembler::registerAlloc(LIns* ins, RegisterMask setA___, RegisterMask set_P__)
 201     {
 202         Register r;
 203         RegisterMask set__F_ = _allocator.free;
 204         RegisterMask setA_F_ = setA___ & set__F_;
 205
 206         if (setA_F_) {
 207             RegisterMask set___S = SavedRegs;
 208             RegisterMask setA_FS = setA_F_ & set___S;
 209             RegisterMask setAPF_ = setA_F_ & set_P__;
 210             RegisterMask setAPFS = setA_FS & set_P__;
 211             RegisterMask set;
 212
 213             if      (setAPFS) set = setAPFS;
 214             else if (setAPF_) set = setAPF_;
 215             else if (setA_FS) set = setA_FS;
 216             else              set = setA_F_;
 217
 218             r = nRegisterAllocFromSet(set);
 219             _allocator.addActive(r, ins);
 220             ins->setReg(r);
 221         } else {
 222             counter_increment(steals);
 223
 224             // Nothing free, steal one.
 225             // LSRA says pick the one with the furthest use.
 226             LIns* vic = findVictim(setA___);
 227             NanoAssert(vic->isInReg());
 228             r = vic->getReg();
 229
 230             evict(vic);
 231
 232             // r ends up staying active, but the LIns defining it changes.
 233             _allocator.removeFree(r);
 234             _allocator.addActive(r, ins);
 235             ins->setReg(r);
 236         }
 237
 238         return r;
 239     }
 240
 241     // Finds a register in 'allow' to store a temporary value (one not
 242     // associated with a particular LIns), evicting one if necessary.  The
 243     // returned register is marked as being free and so can only be safely
 244     // used for code generation purposes until the regstate is next inspected
 245     // or updated.
 246     Register Assembler::registerAllocTmp(RegisterMask allow)
 247     {
 248         LIns dummyIns;
 249         Register r = registerAlloc(&dummyIns, allow, /*prefer*/0);
 250
 251         // Mark r as free, ready for use as a temporary value.
 252         _allocator.removeActive(r);
 253         _allocator.addFree(r);
 254         return r;
 255      }
 256
 257     /**
 258      * these instructions don't have to be saved & reloaded to spill,
 259      * they can just be recalculated w/out any inputs.
 260      */
 261     bool Assembler::canRemat(LIns *i) {
 262         return i->isImmAny() || i->isop(LIR_alloc);
 263     }
 264
 265     void Assembler::codeAlloc(NIns *&start, NIns *&end, NIns *&eip
 266                               verbose_only(, size_t &nBytes))
 267     {
 268         // save the block we just filled
 269         if (start)
 270             CodeAlloc::add(codeList, start, end);
 271
 272         // CodeAlloc contract: allocations never fail
 273         _codeAlloc.alloc(start, end);
 274         verbose_only( nBytes += (end - start) * sizeof(NIns); )
 275         NanoAssert(uintptr_t(end) - uintptr_t(start) >= (size_t)LARGEST_UNDERRUN_PROT);
 276         eip = end;
 277
 278         #ifdef VTUNE
 279         if (_nIns && _nExitIns) {
 280             //cgen->jitAddRecord((uintptr_t)list->code, 0, 0, true); // add placeholder record for top of page
 281             cgen->jitCodePosUpdate((uintptr_t)list->code);
 282             cgen->jitPushInfo(); // new page requires new entry
 283         }
 284         #endif
 285     }
 286
 287     void Assembler::reset()
 288     {
 289         _nIns = 0;
 290         _nExitIns = 0;
 291         codeStart = codeEnd = 0;
 292         exitStart = exitEnd = 0;
 293         _stats.pages = 0;
 294         codeList = 0;
 295
 296         nativePageReset();
 297         registerResetAll();
 298         arReset();
 299     }
 300
 301     #ifdef _DEBUG
 302     void Assembler::pageValidate()
 303     {
 304         if (error()) return;
 305         // This may be a normal code chunk or an exit code chunk.
 306         NanoAssertMsg(codeStart <= _nIns && _nIns <= codeEnd,
 307                      "Native instruction pointer overstep paging bounds; check overrideProtect for last instruction");
 308     }
 309     #endif
 310
 311     #ifdef _DEBUG
 312
 313     bool AR::isValidEntry(uint32_t idx, LIns* ins) const
 314     {
 315         return idx > 0 && idx <= _highWaterMark && _entries[idx] == ins;
 316     }
 317
 318     void AR::checkForResourceConsistency(const RegAlloc& regs)
 319     {
 320         validate();
 321         for (uint32_t i = 1; i <= _highWaterMark; ++i)
 322         {
 323             LIns* ins = _entries[i];
 324             if (!ins)
 325                 continue;
 326             uint32_t arIndex = ins->getArIndex();
 327             NanoAssert(arIndex != 0);
 328             if (ins->isop(LIR_alloc)) {
 329                 int const n = i + (ins->size()>>2);
 330                 for (int j=i+1; j < n; j++) {
 331                     NanoAssert(_entries[j]==ins);
 332                 }
 333                 NanoAssert(arIndex == (uint32_t)n-1);
 334                 i = n-1;
 335             }
 336             else if (ins->isN64()) {
 337                 NanoAssert(_entries[i + 1]==ins);
 338                 i += 1; // skip high word
 339             }
 340             else {
 341                 NanoAssertMsg(arIndex == i, "Stack record index mismatch");
 342             }
 343             NanoAssertMsg(!ins->isInReg() || regs.isConsistent(ins->getReg(), ins),
 344                           "Register record mismatch");
 345         }
 346     }
 347
 348     void Assembler::resourceConsistencyCheck()
 349     {
 350         NanoAssert(!error());
 351
 352 #ifdef NANOJIT_IA32
 353         NanoAssert((_allocator.active[FST0] && _fpuStkDepth == -1) ||
 354             (!_allocator.active[FST0] && _fpuStkDepth == 0));
 355 #endif
 356
 357         _activation.checkForResourceConsistency(_allocator);
 358
 359         registerConsistencyCheck();
 360     }
 361
 362     void Assembler::registerConsistencyCheck()
 363     {
 364         RegisterMask managed = _allocator.managed;
 365         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
 366             if (rmask(r) & managed) {
 367                 // A register managed by register allocation must be either
 368                 // free or active, but not both.
 369                 if (_allocator.isFree(r)) {
 370                     NanoAssertMsgf(_allocator.getActive(r)==0,
 371                         "register %s is free but assigned to ins", gpn(r));
 372                 } else {
 373                     // An LIns defining a register must have that register in
 374                     // its reservation.
 375                     LIns* ins = _allocator.getActive(r);
 376                     NanoAssert(ins);
 377                     NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
 378                 }
 379             } else {
 380                 // A register not managed by register allocation must be
 381                 // neither free nor active.
 382                 NanoAssert(!_allocator.isFree(r));
 383                 NanoAssert(!_allocator.getActive(r));
 384             }
 385         }
 386     }
 387     #endif /* _DEBUG */
 388
 389     void Assembler::findRegFor2(RegisterMask allowa, LIns* ia, Register& ra,
 390                                 RegisterMask allowb, LIns* ib, Register& rb)
 391     {
 392         // There should be some overlap between 'allowa' and 'allowb', else
 393         // there's no point calling this function.
 394         NanoAssert(allowa & allowb);
 395
 396         if (ia == ib) {
 397             ra = rb = findRegFor(ia, allowa & allowb);  // use intersection(allowa, allowb)
 398
 399         } else if (ib->isInRegMask(allowb)) {
 400             // 'ib' is already in an allowable reg -- don't let it get evicted
 401             // when finding 'ra'.
 402             rb = ib->getReg();
 403             ra = findRegFor(ia, allowa & ~rmask(rb));
 404
 405         } else {
 406             ra = findRegFor(ia, allowa);
 407             rb = findRegFor(ib, allowb & ~rmask(ra));
 408         }
 409     }
 410
 411     Register Assembler::findSpecificRegFor(LIns* i, Register w)
 412     {
 413         return findRegFor(i, rmask(w));
 414     }
 415
 416     // Like findRegFor(), but called when the LIns is used as a pointer.  It
 417     // doesn't have to be called, findRegFor() can still be used, but it can
 418     // optimize the LIR_alloc case by indexing off FP, thus saving the use of
 419     // a GpReg.
 420     //
 421     Register Assembler::getBaseReg(LInsp base, int &d, RegisterMask allow)
 422     {
 423     #if !PEDANTIC
 424         if (base->isop(LIR_alloc)) {
 425             // The value of a LIR_alloc is a pointer to its stack memory,
 426             // which is always relative to FP.  So we can just return FP if we
 427             // also adjust 'd' (and can do so in a valid manner).  Or, in the
 428             // PEDANTIC case, we can just assign a register as normal;
 429             // findRegFor() will allocate the stack memory for LIR_alloc if
 430             // necessary.
 431             d += findMemFor(base);
 432             return FP;
 433         }
 434     #else
 435         (void) d;
 436     #endif
 437         return findRegFor(base, allow);
 438     }
 439
 440     // Like findRegFor2(), but used for stores where the base value has the
 441     // same type as the stored value, eg. in asm_store32() on 32-bit platforms
 442     // and asm_store64() on 64-bit platforms.  Similar to getBaseReg(),
 443     // findRegFor2() can be called instead, but this function can optimize the
 444     // case where the base value is a LIR_alloc.
 445     void Assembler::getBaseReg2(RegisterMask allowValue, LIns* value, Register& rv,
 446                                 RegisterMask allowBase, LIns* base, Register& rb, int &d)
 447     {
 448     #if !PEDANTIC
 449         if (base->isop(LIR_alloc)) {
 450             rb = FP;
 451             d += findMemFor(base);
 452             rv = findRegFor(value, allowValue);
 453             return;
 454         }
 455     #else
 456         (void) d;
 457     #endif
 458         findRegFor2(allowValue, value, rv, allowBase, base, rb);
 459     }
 460
 461     // Finds a register in 'allow' to hold the result of 'ins'.  Used when we
 462     // encounter a use of 'ins'.  The actions depend on the prior regstate of
 463     // 'ins':
 464     // - If the result of 'ins' is not in any register, we find an allowed
 465     //   one, evicting one if necessary.
 466     // - If the result of 'ins' is already in an allowed register, we use that.
 467     // - If the result of 'ins' is already in a not-allowed register, we find an
 468     //   allowed one and move it.
 469     //
 470     Register Assembler::findRegFor(LIns* ins, RegisterMask allow)
 471     {
 472         if (ins->isop(LIR_alloc)) {
 473             // Never allocate a reg for this without stack space too.
 474             findMemFor(ins);
 475         }
 476
 477         Register r;
 478
 479         if (!ins->isInReg()) {
 480             // 'ins' isn't in a register (must be in a spill slot or nowhere).
 481             r = registerAlloc(ins, allow, hint(ins));
 482
 483         } else if (rmask(r = ins->getReg()) & allow) {
 484             // 'ins' is in an allowed register.
 485             _allocator.useActive(r);
 486
 487         } else {
 488             // 'ins' is in a register (r) that's not in 'allow'.
 489 #ifdef NANOJIT_IA32
 490             if (((rmask(r)&XmmRegs) && !(allow&XmmRegs)) ||
 491                 ((rmask(r)&x87Regs) && !(allow&x87Regs)))
 492             {
 493                 // x87 <-> xmm copy required
 494                 //_nvprof("fpu-evict",1);
 495                 evict(ins);
 496                 r = registerAlloc(ins, allow, hint(ins));
 497             } else
 498 #elif defined(NANOJIT_PPC) || defined(NANOJIT_MIPS)
 499             if (((rmask(r)&GpRegs) && !(allow&GpRegs)) ||
 500                 ((rmask(r)&FpRegs) && !(allow&FpRegs)))
 501             {
 502                 evict(ins);
 503                 r = registerAlloc(ins, allow, hint(ins));
 504             } else
 505 #endif
 506             {
 507                 // The post-state register holding 'ins' is 's', the pre-state
 508                 // register holding 'ins' is 'r'.  For example, if s=eax and
 509                 // r=ecx:
 510                 //
 511                 // pre-state:   ecx(ins)
 512                 // instruction: mov eax, ecx
 513                 // post-state:  eax(ins)
 514                 //
 515                 Register s = r;
 516                 _allocator.retire(r);
 517                 r = registerAlloc(ins, allow, hint(ins));
 518
 519                 // 'ins' is in 'allow', in register r (different to the old r);
 520                 //  s is the old r.
 521                 if ((rmask(s) & GpRegs) && (rmask(r) & GpRegs)) {
 522                     MR(s, r);   // move 'ins' from its pre-state reg (r) to its post-state reg (s)
 523                 } else {
 524                     asm_nongp_copy(s, r);
 525                 }
 526             }
 527         }
 528
 529         return r;
 530     }
 531
 532     // Like findSpecificRegFor(), but only for when 'r' is known to be free
 533     // and 'ins' is known to not already have a register allocated.  Updates
 534     // the regstate (maintaining the invariants) but does not generate any
 535     // code.  The return value is redundant, always being 'r', but it's
 536     // sometimes useful to have it there for assignments.
 537     Register Assembler::findSpecificRegForUnallocated(LIns* ins, Register r)
 538     {
 539         if (ins->isop(LIR_alloc)) {
 540             // never allocate a reg for this w/out stack space too
 541             findMemFor(ins);
 542         }
 543
 544         NanoAssert(!ins->isInReg());
 545         NanoAssert(_allocator.free & rmask(r));
 546
 547         ins->setReg(r);
 548         _allocator.removeFree(r);
 549         _allocator.addActive(r, ins);
 550
 551         return r;
 552     }
 553
 554 #if NJ_USES_QUAD_CONSTANTS
 555     const uint64_t* Assembler::findQuadConstant(uint64_t q)
 556     {
 557         uint64_t* p = _quadConstants.get(q);
 558         if (!p)
 559         {
 560             p = new (_dataAlloc) uint64_t;
 561             *p = q;
 562             _quadConstants.put(q, p);
 563         }
 564         return p;
 565     }
 566 #endif
 567
 568     int Assembler::findMemFor(LIns *ins)
 569     {
 570 #if NJ_USES_QUAD_CONSTANTS
 571         NanoAssert(!ins->isconstf());
 572 #endif
 573         if (!ins->isInAr()) {
 574             uint32_t const arIndex = arReserve(ins);
 575             ins->setArIndex(arIndex);
 576             NanoAssert(_activation.isValidEntry(ins->getArIndex(), ins) == (arIndex != 0));
 577         }
 578         return arDisp(ins);
 579     }
 580
 581     // XXX: this function is dangerous and should be phased out;
 582     // See bug 513615.  Calls to it should replaced it with a
 583     // prepareResultReg() / generate code / freeResourcesOf() sequence.
 584     Register Assembler::deprecated_prepResultReg(LIns *ins, RegisterMask allow)
 585     {
 586 #ifdef NANOJIT_IA32
 587         // We used to have to worry about possibly popping the x87 stack here.
 588         // But this function is no longer used on i386, and this assertion
 589         // ensures that.
 590         NanoAssert(0);
 591 #endif
 592         Register r = findRegFor(ins, allow);
 593         deprecated_freeRsrcOf(ins);
 594         return r;
 595     }
 596
 597     // Finds a register in 'allow' to hold the result of 'ins'.  Also
 598     // generates code to spill the result if necessary.  Called just prior to
 599     // generating the code for 'ins' (because we generate code backwards).
 600     //
 601     // An example where no spill is necessary.  Lines marked '*' are those
 602     // done by this function.
 603     //
 604     //   regstate:  R
 605     //   asm:       define res into r
 606     // * regstate:  R + r(res)
 607     //              ...
 608     //   asm:       use res in r
 609     //
 610     // An example where a spill is necessary.
 611     //
 612     //   regstate:  R
 613     //   asm:       define res into r
 614     // * regstate:  R + r(res)
 615     // * asm:       spill res from r
 616     //   regstate:  R
 617     //              ...
 618     //   asm:       restore res into r2
 619     //   regstate:  R + r2(res) + other changes from "..."
 620     //   asm:       use res in r2
 621     //
 622     Register Assembler::prepareResultReg(LIns *ins, RegisterMask allow)
 623     {
 624         // At this point, we know the result of 'ins' result has a use later
 625         // in the code.  (Exception: if 'ins' is a call to an impure function
 626         // the return value may not be used, but 'ins' will still be present
 627         // because it has side-effects.)  It may have had to be evicted, in
 628         // which case the restore will have already been generated, so we now
 629         // generate the spill (unless the restore was actually a
 630         // rematerialize, in which case it's not necessary).
 631 #ifdef NANOJIT_IA32
 632         // If 'allow' includes FST0 we have to pop if 'ins' isn't in FST0 in
 633         // the post-regstate.  This could be because 'ins' is unused, 'ins' is
 634         // in a spill slot, or 'ins' is in an XMM register.
 635         const bool pop = (allow & rmask(FST0)) &&
 636                          (!ins->isInReg() || ins->getReg() != FST0);
 637 #else
 638         const bool pop = false;
 639 #endif
 640         Register r = findRegFor(ins, allow);
 641         asm_maybe_spill(ins, pop);
 642 #ifdef NANOJIT_IA32
 643         if (!ins->isInAr() && pop && r == FST0) {
 644             // This can only happen with a LIR_fcall to an impure function
 645             // whose return value was ignored (ie. if ins->isInReg() was false
 646             // prior to the findRegFor() call).
 647             FSTP(FST0);     // pop the fpu result since it isn't used
 648         }
 649 #endif
 650         return r;
 651     }
 652
 653     void Assembler::asm_maybe_spill(LInsp ins, bool pop)
 654     {
 655         int d = ins->isInAr() ? arDisp(ins) : 0;
 656         Register r = ins->getReg();
 657         if (ins->isInAr()) {
 658             verbose_only( RefBuf b;
 659                           if (_logc->lcbits & LC_Assembly) {
 660                              setOutputForEOL("  <= spill %s",
 661                              _thisfrag->lirbuf->printer->formatRef(&b, ins)); } )
 662             asm_spill(r, d, pop, ins->isN64());
 663         }
 664     }
 665
 666     // XXX: This function is error-prone and should be phased out; see bug 513615.
 667     void Assembler::deprecated_freeRsrcOf(LIns *ins)
 668     {
 669         if (ins->isInReg()) {
 670             asm_maybe_spill(ins, /*pop*/false);
 671             _allocator.retire(ins->getReg());   // free any register associated with entry
 672             ins->clearReg();
 673         }
 674         if (ins->isInAr()) {
 675             arFree(ins);                        // free any AR space associated with entry
 676             ins->clearArIndex();
 677         }
 678     }
 679
 680     // Frees all record of registers and spill slots used by 'ins'.
 681     void Assembler::freeResourcesOf(LIns *ins)
 682     {
 683         if (ins->isInReg()) {
 684             _allocator.retire(ins->getReg());   // free any register associated with entry
 685             ins->clearReg();
 686         }
 687         if (ins->isInAr()) {
 688             arFree(ins);                        // free any AR space associated with entry
 689             ins->clearArIndex();
 690         }
 691     }
 692
 693     // Frees 'r' in the RegAlloc regstate, if it's not already free.
 694     void Assembler::evictIfActive(Register r)
 695     {
 696         if (LIns* vic = _allocator.getActive(r)) {
 697             NanoAssert(vic->getReg() == r);
 698             evict(vic);
 699         }
 700     }
 701
 702     // Frees 'r' (which currently holds the result of 'vic') in the regstate.
 703     // An example:
 704     //
 705     //   pre-regstate:  eax(ld1)
 706     //   instruction:   mov ebx,-4(ebp) <= restore add1   # %ebx is dest
 707     //   post-regstate: eax(ld1) ebx(add1)
 708     //
 709     // At run-time we are *restoring* 'add1' into %ebx, hence the call to
 710     // asm_restore().  But at regalloc-time we are moving backwards through
 711     // the code, so in that sense we are *evicting* 'add1' from %ebx.
 712     //
 713     void Assembler::evict(LIns* vic)
 714     {
 715         // Not free, need to steal.
 716         counter_increment(steals);
 717
 718         Register r = vic->getReg();
 719
 720         NanoAssert(!_allocator.isFree(r));
 721         NanoAssert(vic == _allocator.getActive(r));
 722
 723         verbose_only( RefBuf b;
 724                       if (_logc->lcbits & LC_Assembly) {
 725                         setOutputForEOL("  <= restore %s",
 726                         _thisfrag->lirbuf->printer->formatRef(&b, vic)); } )
 727         asm_restore(vic, r);
 728
 729         _allocator.retire(r);
 730         vic->clearReg();
 731
 732         // At this point 'vic' is unused (if rematerializable), or in a spill
 733         // slot (if not).
 734     }
 735
 736     void Assembler::patch(GuardRecord *lr)
 737     {
 738         if (!lr->jmp) // the guard might have been eliminated as redundant
 739             return;
 740         Fragment *frag = lr->exit->target;
 741         NanoAssert(frag->fragEntry != 0);
 742         nPatchBranch((NIns*)lr->jmp, frag->fragEntry);
 743         CodeAlloc::flushICache(lr->jmp, LARGEST_BRANCH_PATCH);
 744         verbose_only(verbose_outputf("patching jump at %p to target %p\n",
 745             lr->jmp, frag->fragEntry);)
 746     }
 747
 748     void Assembler::patch(SideExit *exit)
 749     {
 750         GuardRecord *rec = exit->guards;
 751         NanoAssert(rec);
 752         while (rec) {
 753             patch(rec);
 754             rec = rec->next;
 755         }
 756     }
 757
 758 #ifdef NANOJIT_IA32
 759     void Assembler::patch(SideExit* exit, SwitchInfo* si)
 760     {
 761         for (GuardRecord* lr = exit->guards; lr; lr = lr->next) {
 762             Fragment *frag = lr->exit->target;
 763             NanoAssert(frag->fragEntry != 0);
 764             si->table[si->index] = frag->fragEntry;
 765         }
 766     }
 767 #endif
 768
 769     NIns* Assembler::asm_exit(LInsp guard)
 770     {
 771         SideExit *exit = guard->record()->exit;
 772         NIns* at = 0;
 773         if (!_branchStateMap.get(exit))
 774         {
 775             at = asm_leave_trace(guard);
 776         }
 777         else
 778         {
 779             RegAlloc* captured = _branchStateMap.get(exit);
 780             intersectRegisterState(*captured);
 781             at = exit->target->fragEntry;
 782             NanoAssert(at != 0);
 783             _branchStateMap.remove(exit);
 784         }
 785         return at;
 786     }
 787
 788     NIns* Assembler::asm_leave_trace(LInsp guard)
 789     {
 790         verbose_only( int32_t nativeSave = _stats.native );
 791         verbose_only( verbose_outputf("----------------------------------- ## END exit block %p", guard);)
 792
 793         // This point is unreachable.  So free all the registers.  If an
 794         // instruction has a stack entry we will leave it alone, otherwise we
 795         // free it entirely.  intersectRegisterState() will restore.
 796         RegAlloc capture = _allocator;
 797         releaseRegisters();
 798
 799         swapCodeChunks();
 800         _inExit = true;
 801
 802 #ifdef NANOJIT_IA32
 803         debug_only( _sv_fpuStkDepth = _fpuStkDepth; _fpuStkDepth = 0; )
 804 #endif
 805
 806         nFragExit(guard);
 807
 808         // Restore the callee-saved register and parameters.
 809         assignSavedRegs();
 810         assignParamRegs();
 811
 812         intersectRegisterState(capture);
 813
 814         // this can be useful for breaking whenever an exit is taken
 815         //INT3();
 816         //NOP();
 817
 818         // we are done producing the exit logic for the guard so demark where our exit block code begins
 819         NIns* jmpTarget = _nIns;     // target in exit path for our mainline conditional jump
 820
 821         // swap back pointers, effectively storing the last location used in the exit path
 822         swapCodeChunks();
 823         _inExit = false;
 824
 825         //verbose_only( verbose_outputf("         LIR_xt/xf swapCodeChunks, _nIns is now %08X(%08X), _nExitIns is now %08X(%08X)",_nIns, *_nIns,_nExitIns,*_nExitIns) );
 826         verbose_only( verbose_outputf("%010lx:", (unsigned long)jmpTarget);)
 827         verbose_only( verbose_outputf("----------------------------------- ## BEGIN exit block (LIR_xt|LIR_xf)") );
 828
 829 #ifdef NANOJIT_IA32
 830         NanoAssertMsgf(_fpuStkDepth == _sv_fpuStkDepth, "LIR_xtf, _fpuStkDepth=%d, expect %d",_fpuStkDepth, _sv_fpuStkDepth);
 831         debug_only( _fpuStkDepth = _sv_fpuStkDepth; _sv_fpuStkDepth = 9999; )
 832 #endif
 833
 834         verbose_only(_stats.exitnative += (_stats.native-nativeSave));
 835
 836         return jmpTarget;
 837     }
 838
 839     void Assembler::compile(Fragment* frag, Allocator& alloc, bool optimize verbose_only(, LInsPrinter* printer))
 840     {
 841         verbose_only(
 842         bool anyVerb = (_logc->lcbits & 0xFFFF & ~LC_FragProfile) > 0;
 843         bool asmVerb = (_logc->lcbits & 0xFFFF & LC_Assembly) > 0;
 844         bool liveVerb = (_logc->lcbits & 0xFFFF & LC_Liveness) > 0;
 845         )
 846
 847         /* BEGIN decorative preamble */
 848         verbose_only(
 849         if (anyVerb) {
 850             _logc->printf("========================================"
 851                           "========================================\n");
 852             _logc->printf("=== BEGIN LIR::compile(%p, %p)\n",
 853                           (void*)this, (void*)frag);
 854             _logc->printf("===\n");
 855         })
 856         /* END decorative preamble */
 857
 858         verbose_only( if (liveVerb) {
 859             _logc->printf("\n");
 860             _logc->printf("=== Results of liveness analysis:\n");
 861             _logc->printf("===\n");
 862             LirReader br(frag->lastIns);
 863             LirFilter* lir = &br;
 864             if (optimize) {
 865                 StackFilter* sf = new (alloc) StackFilter(lir, alloc, frag->lirbuf->sp);
 866                 lir = sf;
 867             }
 868             live(lir, alloc, frag, _logc);
 869         })
 870
 871         /* Set up the generic text output cache for the assembler */
 872         verbose_only( StringList asmOutput(alloc); )
 873         verbose_only( _outputCache = &asmOutput; )
 874
 875         beginAssembly(frag);
 876         if (error())
 877             return;
 878
 879         //_logc->printf("recompile trigger %X kind %d\n", (int)frag, frag->kind);
 880
 881         verbose_only( if (anyVerb) {
 882             _logc->printf("=== Translating LIR fragments into assembly:\n");
 883         })
 884
 885         // now the the main trunk
 886         verbose_only( RefBuf b; )
 887         verbose_only( if (anyVerb) {
 888             _logc->printf("=== -- Compile trunk %s: begin\n", printer->formatAddr(&b, frag));
 889         })
 890
 891         // Used for debug printing, if needed
 892         debug_only(ValidateReader *validate = NULL;)
 893         verbose_only(
 894         ReverseLister *pp_init = NULL;
 895         ReverseLister *pp_after_sf = NULL;
 896         )
 897
 898         // The LIR passes through these filters as listed in this
 899         // function, viz, top to bottom.
 900
 901         // set up backwards pipeline: assembler <- StackFilter <- LirReader
 902         LirFilter* lir = new (alloc) LirReader(frag->lastIns);
 903
 904 #ifdef DEBUG
 905         // VALIDATION
 906         validate = new (alloc) ValidateReader(lir);
 907         lir = validate;
 908 #endif
 909
 910         // INITIAL PRINTING
 911         verbose_only( if (_logc->lcbits & LC_ReadLIR) {
 912         pp_init = new (alloc) ReverseLister(lir, alloc, frag->lirbuf->printer, _logc,
 913                                     "Initial LIR");
 914         lir = pp_init;
 915         })
 916
 917         // STACKFILTER
 918         if (optimize) {
 919             StackFilter* stackfilter = new (alloc) StackFilter(lir, alloc, frag->lirbuf->sp);
 920             lir = stackfilter;
 921         }
 922
 923         verbose_only( if (_logc->lcbits & LC_AfterSF) {
 924         pp_after_sf = new (alloc) ReverseLister(lir, alloc, frag->lirbuf->printer, _logc,
 925                                                 "After StackFilter");
 926         lir = pp_after_sf;
 927         })
 928
 929         assemble(frag, lir);
 930
 931         // If we were accumulating debug info in the various ReverseListers,
 932         // call finish() to emit whatever contents they have accumulated.
 933         verbose_only(
 934         if (pp_init)        pp_init->finish();
 935         if (pp_after_sf)    pp_after_sf->finish();
 936         )
 937
 938         verbose_only( if (anyVerb) {
 939             _logc->printf("=== -- Compile trunk %s: end\n", printer->formatAddr(&b, frag));
 940         })
 941
 942         verbose_only(
 943             if (asmVerb)
 944                 outputf("## compiling trunk %s", printer->formatAddr(&b, frag));
 945         )
 946         endAssembly(frag);
 947
 948         // Reverse output so that assembly is displayed low-to-high.
 949         // Up to this point, _outputCache has been non-NULL, and so has been
 950         // accumulating output.  Now we set it to NULL, traverse the entire
 951         // list of stored strings, and hand them a second time to output.
 952         // Since _outputCache is now NULL, outputf just hands these strings
 953         // directly onwards to _logc->printf.
 954         verbose_only( if (anyVerb) {
 955             _logc->printf("\n");
 956             _logc->printf("=== Aggregated assembly output: BEGIN\n");
 957             _logc->printf("===\n");
 958             _outputCache = 0;
 959             for (Seq<char*>* p = asmOutput.get(); p != NULL; p = p->tail) {
 960                 char *str = p->head;
 961                 outputf("  %s", str);
 962             }
 963             _logc->printf("===\n");
 964             _logc->printf("=== Aggregated assembly output: END\n");
 965         });
 966
 967         if (error())
 968             frag->fragEntry = 0;
 969
 970         verbose_only( frag->nCodeBytes += codeBytes; )
 971         verbose_only( frag->nExitBytes += exitBytes; )
 972
 973         /* BEGIN decorative postamble */
 974         verbose_only( if (anyVerb) {
 975             _logc->printf("\n");
 976             _logc->printf("===\n");
 977             _logc->printf("=== END LIR::compile(%p, %p)\n",
 978                           (void*)this, (void*)frag);
 979             _logc->printf("========================================"
 980                           "========================================\n");
 981             _logc->printf("\n");
 982         });
 983         /* END decorative postamble */
 984     }
 985
 986     void Assembler::beginAssembly(Fragment *frag)
 987     {
 988         verbose_only( codeBytes = 0; )
 989         verbose_only( exitBytes = 0; )
 990
 991         reset();
 992
 993         NanoAssert(codeList == 0);
 994         NanoAssert(codeStart == 0);
 995         NanoAssert(codeEnd == 0);
 996         NanoAssert(exitStart == 0);
 997         NanoAssert(exitEnd == 0);
 998         NanoAssert(_nIns == 0);
 999         NanoAssert(_nExitIns == 0);
1000
1001         _thisfrag = frag;
1002         _inExit = false;
1003
1004         counter_reset(native);
1005         counter_reset(exitnative);
1006         counter_reset(steals);
1007         counter_reset(spills);
1008         counter_reset(remats);
1009
1010         setError(None);
1011
1012         // native code gen buffer setup
1013         nativePageSetup();
1014
1015         // make sure we got memory at least one page
1016         if (error()) return;
1017
1018 #ifdef PERFM
1019         _stats.pages = 0;
1020         _stats.codeStart = _nIns-1;
1021         _stats.codeExitStart = _nExitIns-1;
1022 #endif /* PERFM */
1023
1024         _epilogue = NULL;
1025
1026         nBeginAssembly();
1027     }
1028
1029     void Assembler::assemble(Fragment* frag, LirFilter* reader)
1030     {
1031         if (error()) return;
1032         _thisfrag = frag;
1033
1034         // check the fragment is starting out with a sane profiling state
1035         verbose_only( NanoAssert(frag->nStaticExits == 0); )
1036         verbose_only( NanoAssert(frag->nCodeBytes == 0); )
1037         verbose_only( NanoAssert(frag->nExitBytes == 0); )
1038         verbose_only( NanoAssert(frag->profCount == 0); )
1039         verbose_only( if (_logc->lcbits & LC_FragProfile)
1040                           NanoAssert(frag->profFragID > 0);
1041                       else
1042                           NanoAssert(frag->profFragID == 0); )
1043
1044         _inExit = false;
1045
1046         gen(reader);
1047
1048         if (!error()) {
1049             // patch all branches
1050             NInsMap::Iter iter(_patches);
1051             while (iter.next()) {
1052                 NIns* where = iter.key();
1053                 LIns* target = iter.value();
1054                 if (target->isop(LIR_jtbl)) {
1055                     // Need to patch up a whole jump table, 'where' is the table.
1056                     LIns *jtbl = target;
1057                     NIns** native_table = (NIns**) where;
1058                     for (uint32_t i = 0, n = jtbl->getTableSize(); i < n; i++) {
1059                         LabelState* lstate = _labels.get(jtbl->getTarget(i));
1060                         NIns* ntarget = lstate->addr;
1061                         if (ntarget) {
1062                             native_table[i] = ntarget;
1063                         } else {
1064                             setError(UnknownBranch);
1065                             break;
1066                         }
1067                     }
1068                 } else {
1069                     // target is a label for a single-target branch
1070                     LabelState *lstate = _labels.get(target);
1071                     NIns* ntarget = lstate->addr;
1072                     if (ntarget) {
1073                         nPatchBranch(where, ntarget);
1074                     } else {
1075                         setError(UnknownBranch);
1076                         break;
1077                     }
1078                 }
1079             }
1080         }
1081     }
1082
1083     void Assembler::endAssembly(Fragment* frag)
1084     {
1085         // don't try to patch code if we are in an error state since we might have partially
1086         // overwritten the code cache already
1087         if (error()) {
1088             // something went wrong, release all allocated code memory
1089             _codeAlloc.freeAll(codeList);
1090             if (_nExitIns)
1091                 _codeAlloc.free(exitStart, exitEnd);
1092             _codeAlloc.free(codeStart, codeEnd);
1093             codeList = NULL;
1094             return;
1095         }
1096
1097         NIns* fragEntry = genPrologue();
1098         verbose_only( asm_output("[prologue]"); )
1099
1100         debug_only(_activation.checkForResourceLeaks());
1101
1102         NanoAssert(!_inExit);
1103         // save used parts of current block on fragment's code list, free the rest
1104 #if defined(NANOJIT_ARM) || defined(NANOJIT_MIPS)
1105         // [codeStart, _nSlot) ... gap ... [_nIns, codeEnd)
1106         if (_nExitIns) {
1107             _codeAlloc.addRemainder(codeList, exitStart, exitEnd, _nExitSlot, _nExitIns);
1108             verbose_only( exitBytes -= (_nExitIns - _nExitSlot) * sizeof(NIns); )
1109         }
1110         _codeAlloc.addRemainder(codeList, codeStart, codeEnd, _nSlot, _nIns);
1111         verbose_only( codeBytes -= (_nIns - _nSlot) * sizeof(NIns); )
1112 #else
1113         // [codeStart ... gap ... [_nIns, codeEnd))
1114         if (_nExitIns) {
1115             _codeAlloc.addRemainder(codeList, exitStart, exitEnd, exitStart, _nExitIns);
1116             verbose_only( exitBytes -= (_nExitIns - exitStart) * sizeof(NIns); )
1117         }
1118         _codeAlloc.addRemainder(codeList, codeStart, codeEnd, codeStart, _nIns);
1119         verbose_only( codeBytes -= (_nIns - codeStart) * sizeof(NIns); )
1120 #endif
1121
1122         // at this point all our new code is in the d-cache and not the i-cache,
1123         // so flush the i-cache on cpu's that need it.
1124         CodeAlloc::flushICache(codeList);
1125
1126         // save entry point pointers
1127         frag->fragEntry = fragEntry;
1128         frag->setCode(_nIns);
1129         PERFM_NVPROF("code", CodeAlloc::size(codeList));
1130
1131 #ifdef NANOJIT_IA32
1132         NanoAssertMsgf(_fpuStkDepth == 0,"_fpuStkDepth %d\n",_fpuStkDepth);
1133 #endif
1134
1135         debug_only( pageValidate(); )
1136         NanoAssert(_branchStateMap.isEmpty());
1137     }
1138
1139     void Assembler::releaseRegisters()
1140     {
1141         for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
1142         {
1143             LIns *ins = _allocator.getActive(r);
1144             if (ins) {
1145                 // Clear reg allocation, preserve stack allocation.
1146                 _allocator.retire(r);
1147                 NanoAssert(r == ins->getReg());
1148                 ins->clearReg();
1149             }
1150         }
1151     }
1152
1153 #ifdef PERFM
1154 #define countlir_live() _nvprof("lir-live",1)
1155 #define countlir_ret() _nvprof("lir-ret",1)
1156 #define countlir_alloc() _nvprof("lir-alloc",1)
1157 #define countlir_var() _nvprof("lir-var",1)
1158 #define countlir_use() _nvprof("lir-use",1)
1159 #define countlir_def() _nvprof("lir-def",1)
1160 #define countlir_imm() _nvprof("lir-imm",1)
1161 #define countlir_param() _nvprof("lir-param",1)
1162 #define countlir_cmov() _nvprof("lir-cmov",1)
1163 #define countlir_ld() _nvprof("lir-ld",1)
1164 #define countlir_ldq() _nvprof("lir-ldq",1)
1165 #define countlir_alu() _nvprof("lir-alu",1)
1166 #define countlir_qjoin() _nvprof("lir-qjoin",1)
1167 #define countlir_qlo() _nvprof("lir-qlo",1)
1168 #define countlir_qhi() _nvprof("lir-qhi",1)
1169 #define countlir_fpu() _nvprof("lir-fpu",1)
1170 #define countlir_st() _nvprof("lir-st",1)
1171 #define countlir_stq() _nvprof("lir-stq",1)
1172 #define countlir_jmp() _nvprof("lir-jmp",1)
1173 #define countlir_jcc() _nvprof("lir-jcc",1)
1174 #define countlir_label() _nvprof("lir-label",1)
1175 #define countlir_xcc() _nvprof("lir-xcc",1)
1176 #define countlir_x() _nvprof("lir-x",1)
1177 #define countlir_call() _nvprof("lir-call",1)
1178 #define countlir_jtbl() _nvprof("lir-jtbl",1)
1179 #else
1180 #define countlir_live()
1181 #define countlir_ret()
1182 #define countlir_alloc()
1183 #define countlir_var()
1184 #define countlir_use()
1185 #define countlir_def()
1186 #define countlir_imm()
1187 #define countlir_param()
1188 #define countlir_cmov()
1189 #define countlir_ld()
1190 #define countlir_ldq()
1191 #define countlir_alu()
1192 #define countlir_qjoin()
1193 #define countlir_qlo()
1194 #define countlir_qhi()
1195 #define countlir_fpu()
1196 #define countlir_st()
1197 #define countlir_stq()
1198 #define countlir_jmp()
1199 #define countlir_jcc()
1200 #define countlir_label()
1201 #define countlir_xcc()
1202 #define countlir_x()
1203 #define countlir_call()
1204 #define countlir_jtbl()
1205 #endif
1206
1207     void Assembler::gen(LirFilter* reader)
1208     {
1209         NanoAssert(_thisfrag->nStaticExits == 0);
1210
1211         // The trace must end with one of these opcodes.
1212         NanoAssert(reader->finalIns()->isop(LIR_x)    ||
1213                    reader->finalIns()->isop(LIR_xtbl) ||
1214                    reader->finalIns()->isRet()        ||
1215                    reader->finalIns()->isLive());
1216
1217         InsList pending_lives(alloc);
1218
1219         NanoAssert(!error());
1220
1221         // What's going on here: we're visiting all the LIR instructions in
1222         // the buffer, working strictly backwards in buffer-order, and
1223         // generating machine instructions for them as we go.
1224         //
1225         // For each LIns, we first determine whether it's actually necessary,
1226         // and if not skip it.  Otherwise we generate code for it.  There are
1227         // two kinds of "necessary" instructions:
1228         //
1229         // - "Statement" instructions, which have side effects.  Anything that
1230         //   could change control flow or the state of memory.
1231         //
1232         // - "Value" or "expression" instructions, which compute a value based
1233         //   only on the operands to the instruction (and, in the case of
1234         //   loads, the state of memory).  Because we visit instructions in
1235         //   reverse order, if some previously visited instruction uses the
1236         //   value computed by this instruction, then this instruction will
1237         //   already have a register assigned to hold that value.  Hence we
1238         //   can consult the instruction to detect whether its value is in
1239         //   fact used (i.e. not dead).
1240         //
1241         // Note that the backwards code traversal can make register allocation
1242         // confusing.  (For example, we restore a value before we spill it!)
1243         // In particular, words like "before" and "after" must be used very
1244         // carefully -- their meaning at regalloc-time is opposite to their
1245         // meaning at run-time.  We use the term "pre-regstate" to refer to
1246         // the register allocation state that occurs prior to an instruction's
1247         // execution, and "post-regstate" to refer to the state that occurs
1248         // after an instruction's execution, e.g.:
1249         //
1250         //   pre-regstate:  ebx(ins)
1251         //   instruction:   mov eax, ebx     // mov dst, src
1252         //   post-regstate: eax(ins)
1253         //
1254         // At run-time, the instruction updates the pre-regstate into the
1255         // post-regstate (and these states are the real machine's regstates).
1256         // But when allocating registers, because we go backwards, the
1257         // pre-regstate is constructed from the post-regstate (and these
1258         // regstates are those stored in RegAlloc).
1259         //
1260         // One consequence of generating code backwards is that we tend to
1261         // both spill and restore registers as early (at run-time) as
1262         // possible;  this is good for tolerating memory latency.  If we
1263         // generated code forwards, we would expect to both spill and restore
1264         // registers as late (at run-time) as possible;  this might be better
1265         // for reducing register pressure.
1266         //
1267         // Another thing to note: we provide N_LOOKAHEAD instruction's worth
1268         // of lookahead because it's useful for backends.  This is nice and
1269         // easy because once read() gets to the LIR_start at the beginning of
1270         // the buffer it'll just keep regetting it.
1271
1272         for (int32_t i = 0; i < N_LOOKAHEAD; i++)
1273             lookahead[i] = reader->read();
1274
1275         while (!lookahead[0]->isop(LIR_start))
1276         {
1277             LInsp ins = lookahead[0];   // give it a shorter name for local use
1278             LOpcode op = ins->opcode();
1279
1280             bool required = ins->isStmt() || ins->isUsed();
1281             if (!required)
1282                 goto end_of_loop;
1283
1284 #ifdef NJ_VERBOSE
1285             // Output the post-regstate (registers and/or activation).
1286             // Because asm output comes in reverse order, doing it now means
1287             // it is printed after the LIR and asm, exactly when the
1288             // post-regstate should be shown.
1289             if ((_logc->lcbits & LC_Assembly) && (_logc->lcbits & LC_Activation))
1290                 printActivationState();
1291             if ((_logc->lcbits & LC_Assembly) && (_logc->lcbits & LC_RegAlloc))
1292                 printRegState();
1293 #endif
1294
1295             switch (op)
1296             {
1297                 default:
1298                     NanoAssertMsgf(false, "unsupported LIR instruction: %d\n", op);
1299                     break;
1300
1301                 case LIR_regfence:
1302                     evictAllActiveRegs();
1303                     break;
1304
1305                 case LIR_live:
1306                 case LIR_flive:
1307                 CASE64(LIR_qlive:) {
1308                     countlir_live();
1309                     LInsp op1 = ins->oprnd1();
1310                     // alloca's are meant to live until the point of the LIR_live instruction, marking
1311                     // other expressions as live ensures that they remain so at loop bottoms.
1312                     // alloca areas require special treatment because they are accessed indirectly and
1313                     // the indirect accesses are invisible to the assembler, other than via LIR_live.
1314                     // other expression results are only accessed directly in ways that are visible to
1315                     // the assembler, so extending those expression's lifetimes past the last loop edge
1316                     // isn't necessary.
1317                     if (op1->isop(LIR_alloc)) {
1318                         findMemFor(op1);
1319                     } else {
1320                         pending_lives.add(ins);
1321                     }
1322                     break;
1323                 }
1324
1325                 case LIR_ret:
1326                 case LIR_fret:
1327                 CASE64(LIR_qret:) {
1328                     countlir_ret();
1329                     asm_ret(ins);
1330                     break;
1331                 }
1332
1333                 // Allocate some stack space.  The value of this instruction
1334                 // is the address of the stack space.
1335                 case LIR_alloc: {
1336                     countlir_alloc();
1337                     NanoAssert(ins->isInAr());
1338                     if (ins->isInReg()) {
1339                         Register r = ins->getReg();
1340                         asm_restore(ins, r);
1341                         _allocator.retire(r);
1342                         ins->clearReg();
1343                     }
1344                     freeResourcesOf(ins);
1345                     break;
1346                 }
1347                 case LIR_int:
1348                 {
1349                     countlir_imm();
1350                     asm_immi(ins);
1351                     break;
1352                 }
1353 #ifdef NANOJIT_64BIT
1354                 case LIR_quad:
1355                 {
1356                     countlir_imm();
1357                     asm_immq(ins);
1358                     break;
1359                 }
1360 #endif
1361                 case LIR_float:
1362                 {
1363                     countlir_imm();
1364                     asm_immf(ins);
1365                     break;
1366                 }
1367                 case LIR_param:
1368                 {
1369                     countlir_param();
1370                     asm_param(ins);
1371                     break;
1372                 }
1373 #if NJ_SOFTFLOAT_SUPPORTED
1374                 case LIR_callh:
1375                 {
1376                     // return result of quad-call in register
1377                     deprecated_prepResultReg(ins, rmask(retRegs[1]));
1378                     // if hi half was used, we must use the call to ensure it happens
1379                     findSpecificRegFor(ins->oprnd1(), retRegs[0]);
1380                     break;
1381                 }
1382                 case LIR_qlo:
1383                 {
1384                     countlir_qlo();
1385                     asm_qlo(ins);
1386                     break;
1387                 }
1388                 case LIR_qhi:
1389                 {
1390                     countlir_qhi();
1391                     asm_qhi(ins);
1392                     break;
1393                 }
1394                 case LIR_qjoin:
1395                 {
1396                     countlir_qjoin();
1397                     asm_qjoin(ins);
1398                     break;
1399                 }
1400 #endif
1401                 CASE64(LIR_qcmov:)
1402                 case LIR_cmov:
1403                 {
1404                     countlir_cmov();
1405                     asm_cmov(ins);
1406                     break;
1407                 }
1408                 case LIR_ldzb:
1409                 case LIR_ldzs:
1410                 case LIR_ldsb:
1411                 case LIR_ldss:
1412                 case LIR_ld:
1413                 {
1414                     countlir_ld();
1415                     asm_load32(ins);
1416                     break;
1417                 }
1418
1419                 case LIR_ld32f:
1420                 case LIR_ldf:
1421                 CASE64(LIR_ldq:)
1422                 {
1423                     countlir_ldq();
1424                     asm_load64(ins);
1425                     break;
1426                 }
1427                 case LIR_neg:
1428                 case LIR_not:
1429                 {
1430                     countlir_alu();
1431                     asm_neg_not(ins);
1432                     break;
1433                 }
1434
1435 #if defined NANOJIT_64BIT
1436                 case LIR_qiadd:
1437                 case LIR_qiand:
1438                 case LIR_qilsh:
1439                 case LIR_qursh:
1440                 case LIR_qirsh:
1441                 case LIR_qior:
1442                 case LIR_qxor:
1443                 {
1444                     asm_qbinop(ins);
1445                     break;
1446                 }
1447 #endif
1448
1449                 case LIR_add:
1450                 case LIR_sub:
1451                 case LIR_mul:
1452                 case LIR_and:
1453                 case LIR_or:
1454                 case LIR_xor:
1455                 case LIR_lsh:
1456                 case LIR_rsh:
1457                 case LIR_ush:
1458                 CASE86(LIR_div:)
1459                 CASE86(LIR_mod:)
1460                 {
1461                     countlir_alu();
1462                     asm_arith(ins);
1463                     break;
1464                 }
1465                 case LIR_fneg:
1466                 {
1467                     countlir_fpu();
1468                     asm_fneg(ins);
1469                     break;
1470                 }
1471                 case LIR_fadd:
1472                 case LIR_fsub:
1473                 case LIR_fmul:
1474                 case LIR_fdiv:
1475                 {
1476                     countlir_fpu();
1477                     asm_fop(ins);
1478                     break;
1479                 }
1480                 case LIR_i2f:
1481                 {
1482                     countlir_fpu();
1483                     asm_i2f(ins);
1484                     break;
1485                 }
1486                 case LIR_u2f:
1487                 {
1488                     countlir_fpu();
1489                     asm_u2f(ins);
1490                     break;
1491                 }
1492                 case LIR_f2i:
1493                 {
1494                     countlir_fpu();
1495                     asm_f2i(ins);
1496                     break;
1497                 }
1498 #ifdef NANOJIT_64BIT
1499                 case LIR_i2q:
1500                 case LIR_u2q:
1501                 {
1502                     countlir_alu();
1503                     asm_promote(ins);
1504                     break;
1505                 }
1506                 case LIR_q2i:
1507                 {
1508                     countlir_alu();
1509                     asm_q2i(ins);
1510                     break;
1511                 }
1512 #endif
1513                 case LIR_stb:
1514                 case LIR_sts:
1515                 case LIR_sti:
1516                 {
1517                     countlir_st();
1518                     asm_store32(op, ins->oprnd1(), ins->disp(), ins->oprnd2());
1519                     break;
1520                 }
1521                 case LIR_st32f:
1522                 case LIR_stfi:
1523                 CASE64(LIR_stqi:)
1524                 {
1525                     countlir_stq();
1526                     LIns* value = ins->oprnd1();
1527                     LIns* base = ins->oprnd2();
1528                     int dr = ins->disp();
1529 #if NJ_SOFTFLOAT_SUPPORTED
1530                     if (value->isop(LIR_qjoin) && op == LIR_stfi)
1531                     {
1532                         // This is correct for little-endian only.
1533                         asm_store32(LIR_sti, value->oprnd1(), dr, base);
1534                         asm_store32(LIR_sti, value->oprnd2(), dr+4, base);
1535                     }
1536                     else
1537 #endif
1538                     {
1539                         asm_store64(op, value, dr, base);
1540                     }
1541                     break;
1542                 }
1543
1544                 case LIR_j:
1545                 {
1546                     countlir_jmp();
1547                     LInsp to = ins->getTarget();
1548                     LabelState *label = _labels.get(to);
1549                     // the jump is always taken so whatever register state we
1550                     // have from downstream code, is irrelevant to code before
1551                     // this jump.  so clear it out.  we will pick up register
1552                     // state from the jump target, if we have seen that label.
1553                     releaseRegisters();
1554                     if (label && label->addr) {
1555                         // forward jump - pick up register state from target.
1556                         unionRegisterState(label->regs);
1557                         JMP(label->addr);
1558                     }
1559                     else {
1560                         // backwards jump
1561                         handleLoopCarriedExprs(pending_lives);
1562                         if (!label) {
1563                             // save empty register state at loop header
1564                             _labels.add(to, 0, _allocator);
1565                         }
1566                         else {
1567                             intersectRegisterState(label->regs);
1568                         }
1569                         JMP(0);
1570                         _patches.put(_nIns, to);
1571                     }
1572                     break;
1573                 }
1574
1575                 case LIR_jt:
1576                 case LIR_jf:
1577                 {
1578                     countlir_jcc();
1579                     LInsp to = ins->getTarget();
1580                     LIns* cond = ins->oprnd1();
1581                     LabelState *label = _labels.get(to);
1582                     if (label && label->addr) {
1583                         // forward jump to known label.  need to merge with label's register state.
1584                         unionRegisterState(label->regs);
1585                         asm_branch(op == LIR_jf, cond, label->addr);
1586                     }
1587                     else {
1588                         // back edge.
1589                         handleLoopCarriedExprs(pending_lives);
1590                         if (!label) {
1591                             // evict all registers, most conservative approach.
1592                             evictAllActiveRegs();
1593                             _labels.add(to, 0, _allocator);
1594                         }
1595                         else {
1596                             // evict all registers, most conservative approach.
1597                             intersectRegisterState(label->regs);
1598                         }
1599                         NIns *branch = asm_branch(op == LIR_jf, cond, 0);
1600                         _patches.put(branch,to);
1601                     }
1602                     break;
1603                 }
1604
1605                 #if NJ_JTBL_SUPPORTED
1606                 case LIR_jtbl:
1607                 {
1608                     countlir_jtbl();
1609                     // Multiway jump can contain both forward and backward jumps.
1610                     // Out of range indices aren't allowed or checked.
1611                     // Code after this jtbl instruction is unreachable.
1612                     releaseRegisters();
1613                     NanoAssert(_allocator.countActive() == 0);
1614
1615                     uint32_t count = ins->getTableSize();
1616                     bool has_back_edges = false;
1617
1618                     // Merge the regstates of labels we have already seen.
1619                     for (uint32_t i = count; i-- > 0;) {
1620                         LIns* to = ins->getTarget(i);
1621                         LabelState *lstate = _labels.get(to);
1622                         if (lstate) {
1623                             unionRegisterState(lstate->regs);
1624                             verbose_only( RefBuf b; )
1625                             asm_output("   %u: [&%s]", i, _thisfrag->lirbuf->printer->formatRef(&b, to));
1626                         } else {
1627                             has_back_edges = true;
1628                         }
1629                     }
1630                     asm_output("forward edges");
1631
1632                     // In a multi-way jump, the register allocator has no ability to deal
1633                     // with two existing edges that have conflicting register assignments, unlike
1634                     // a conditional branch where code can be inserted on the fall-through path
1635                     // to reconcile registers.  So, frontends *must* insert LIR_regfence at labels of
1636                     // forward jtbl jumps.  Check here to make sure no registers were picked up from
1637                     // any forward edges.
1638                     NanoAssert(_allocator.countActive() == 0);
1639
1640                     if (has_back_edges) {
1641                         handleLoopCarriedExprs(pending_lives);
1642                         // save merged (empty) register state at target labels we haven't seen yet
1643                         for (uint32_t i = count; i-- > 0;) {
1644                             LIns* to = ins->getTarget(i);
1645                             LabelState *lstate = _labels.get(to);
1646                             if (!lstate) {
1647                                 _labels.add(to, 0, _allocator);
1648                                 verbose_only( RefBuf b; )
1649                                 asm_output("   %u: [&%s]", i, _thisfrag->lirbuf->printer->formatRef(&b, to));
1650                             }
1651                         }
1652                         asm_output("backward edges");
1653                     }
1654
1655                     // Emit the jump instruction, which allocates 1 register for the jump index.
1656                     NIns** native_table = new (_dataAlloc) NIns*[count];
1657                     asm_output("[%p]:", (void*)native_table);
1658                     _patches.put((NIns*)native_table, ins);
1659                     asm_jtbl(ins, native_table);
1660                     break;
1661                 }
1662                 #endif
1663
1664                 case LIR_label:
1665                 {
1666                     countlir_label();
1667                     LabelState *label = _labels.get(ins);
1668                     // add profiling inc, if necessary.
1669                     verbose_only( if (_logc->lcbits & LC_FragProfile) {
1670                         if (ins == _thisfrag->loopLabel)
1671                             asm_inc_m32(& _thisfrag->profCount);
1672                     })
1673                     if (!label) {
1674                         // label seen first, normal target of forward jump, save addr & allocator
1675                         _labels.add(ins, _nIns, _allocator);
1676                     }
1677                     else {
1678                         // we're at the top of a loop
1679                         NanoAssert(label->addr == 0);
1680                         //evictAllActiveRegs();
1681                         intersectRegisterState(label->regs);
1682                         label->addr = _nIns;
1683                     }
1684                     verbose_only(
1685                         RefBuf b;
1686                         if (_logc->lcbits & LC_Assembly) {
1687                             asm_output("[%s]", _thisfrag->lirbuf->printer->formatRef(&b, ins));
1688                     })
1689                     break;
1690                 }
1691                 case LIR_xbarrier: {
1692                     break;
1693                 }
1694 #ifdef NANOJIT_IA32
1695                 case LIR_xtbl: {
1696                     NIns* exit = asm_exit(ins); // does intersectRegisterState()
1697                     asm_switch(ins, exit);
1698                     break;
1699                 }
1700 #else
1701                  case LIR_xtbl:
1702                     NanoAssertMsg(0, "Not supported for this architecture");
1703                     break;
1704 #endif
1705                 case LIR_xt:
1706                 case LIR_xf:
1707                 {
1708                     verbose_only( _thisfrag->nStaticExits++; )
1709                     countlir_xcc();
1710                     // we only support cmp with guard right now, also assume it is 'close' and only emit the branch
1711                     NIns* exit = asm_exit(ins); // does intersectRegisterState()
1712                     LIns* cond = ins->oprnd1();
1713                     asm_branch(op == LIR_xf, cond, exit);
1714                     break;
1715                 }
1716                 case LIR_x:
1717                 {
1718                     verbose_only( _thisfrag->nStaticExits++; )
1719                     countlir_x();
1720                     // generate the side exit branch on the main trace.
1721                     NIns *exit = asm_exit(ins);
1722                     JMP( exit );
1723                     break;
1724                 }
1725                 case LIR_addxov:
1726                 case LIR_subxov:
1727                 case LIR_mulxov:
1728                 {
1729                     verbose_only( _thisfrag->nStaticExits++; )
1730                     countlir_xcc();
1731                     countlir_alu();
1732                     NIns* exit = asm_exit(ins); // does intersectRegisterState()
1733                     asm_branch_xov(op, exit);
1734                     asm_arith(ins);
1735                     break;
1736                 }
1737
1738                 case LIR_feq:
1739                 case LIR_fle:
1740                 case LIR_flt:
1741                 case LIR_fgt:
1742                 case LIR_fge:
1743                 {
1744                     countlir_fpu();
1745                     asm_fcond(ins);
1746                     break;
1747                 }
1748                 case LIR_eq:
1749                 case LIR_le:
1750                 case LIR_lt:
1751                 case LIR_gt:
1752                 case LIR_ge:
1753                 case LIR_ult:
1754                 case LIR_ule:
1755                 case LIR_ugt:
1756                 case LIR_uge:
1757 #ifdef NANOJIT_64BIT
1758                 case LIR_qeq:
1759                 case LIR_qle:
1760                 case LIR_qlt:
1761                 case LIR_qgt:
1762                 case LIR_qge:
1763                 case LIR_qult:
1764                 case LIR_qule:
1765                 case LIR_qugt:
1766                 case LIR_quge:
1767 #endif
1768                 {
1769                     countlir_alu();
1770                     asm_cond(ins);
1771                     break;
1772                 }
1773
1774                 case LIR_fcall:
1775             #ifdef NANOJIT_64BIT
1776                 case LIR_qcall:
1777             #endif
1778                 case LIR_icall:
1779                 {
1780                     countlir_call();
1781                     asm_call(ins);
1782                     break;
1783                 }
1784
1785                 #ifdef VTUNE
1786                 case LIR_file:
1787                 {
1788                     // we traverse backwards so we are now hitting the file
1789                     // that is associated with a bunch of LIR_lines we already have seen
1790                     uintptr_t currentFile = ins->oprnd1()->imm32();
1791                     cgen->jitFilenameUpdate(currentFile);
1792                     break;
1793                 }
1794                 case LIR_line:
1795                 {
1796                     // add a new table entry, we don't yet knwo which file it belongs
1797                     // to so we need to add it to the update table too
1798                     // note the alloc, actual act is delayed; see above
1799                     uint32_t currentLine = (uint32_t) ins->oprnd1()->imm32();
1800                     cgen->jitLineNumUpdate(currentLine);
1801                     cgen->jitAddRecord((uintptr_t)_nIns, 0, currentLine, true);
1802                     break;
1803                 }
1804                 #endif // VTUNE
1805             }
1806
1807 #ifdef NJ_VERBOSE
1808             // We have to do final LIR printing inside this loop.  If we do it
1809             // before this loop, we we end up printing a lot of dead LIR
1810             // instructions.
1811             //
1812             // We print the LIns after generating the code.  This ensures that
1813             // the LIns will appear in debug output *before* the generated
1814             // code, because Assembler::outputf() prints everything in reverse.
1815             //
1816             // Note that some live LIR instructions won't be printed.  Eg. an
1817             // immediate won't be printed unless it is explicitly loaded into
1818             // a register (as opposed to being incorporated into an immediate
1819             // field in another machine instruction).
1820             //
1821             if (_logc->lcbits & LC_Assembly) {
1822                 InsBuf b;
1823                 LInsPrinter* printer = _thisfrag->lirbuf->printer;
1824                 outputf("    %s", printer->formatIns(&b, ins));
1825                 if (ins->isGuard() && ins->oprnd1() && ins->oprnd1()->isCmp()) {
1826                     // Special case: code is generated for guard conditions at
1827                     // the same time that code is generated for the guard
1828                     // itself.  If the condition is only used by the guard, we
1829                     // must print it now otherwise it won't get printed.  So
1830                     // we do print it now, with an explanatory comment.  If
1831                     // the condition *is* used again we'll end up printing it
1832                     // twice, but that's ok.
1833                     outputf("    %s       # codegen'd with the %s",
1834                             printer->formatIns(&b, ins->oprnd1()), lirNames[op]);
1835
1836                 } else if (ins->isCmov()) {
1837                     // Likewise for cmov conditions.
1838                     outputf("    %s       # codegen'd with the %s",
1839                             printer->formatIns(&b, ins->oprnd1()), lirNames[op]);
1840
1841                 }
1842 #if defined NANOJIT_IA32 || defined NANOJIT_X64
1843                 else if (ins->isop(LIR_mod)) {
1844                     // There's a similar case when a div feeds into a mod.
1845                     outputf("    %s       # codegen'd with the mod",
1846                             printer->formatIns(&b, ins->oprnd1()));
1847                 }
1848 #endif
1849             }
1850 #endif
1851
1852             if (error())
1853                 return;
1854
1855         #ifdef VTUNE
1856             cgen->jitCodePosUpdate((uintptr_t)_nIns);
1857         #endif
1858
1859             // check that all is well (don't check in exit paths since its more complicated)
1860             debug_only( pageValidate(); )
1861             debug_only( resourceConsistencyCheck();  )
1862
1863           end_of_loop:
1864             for (int32_t i = 1; i < N_LOOKAHEAD; i++)
1865                 lookahead[i-1] = lookahead[i];
1866             lookahead[N_LOOKAHEAD-1] = reader->read();
1867         }
1868     }
1869
1870     /*
1871      * Write a jump table for the given SwitchInfo and store the table
1872      * address in the SwitchInfo. Every entry will initially point to
1873      * target.
1874      */
1875     void Assembler::emitJumpTable(SwitchInfo* si, NIns* target)
1876     {
1877         si->table = (NIns **) alloc.alloc(si->count * sizeof(NIns*));
1878         for (uint32_t i = 0; i < si->count; ++i)
1879             si->table[i] = target;
1880     }
1881
1882     void Assembler::assignSavedRegs()
1883     {
1884         // Restore saved regsters.
1885         LirBuffer *b = _thisfrag->lirbuf;
1886         for (int i=0, n = NumSavedRegs; i < n; i++) {
1887             LIns *p = b->savedRegs[i];
1888             if (p)
1889                 findSpecificRegForUnallocated(p, savedRegs[p->paramArg()]);
1890         }
1891     }
1892
1893     void Assembler::reserveSavedRegs()
1894     {
1895         LirBuffer *b = _thisfrag->lirbuf;
1896         for (int i = 0, n = NumSavedRegs; i < n; i++) {
1897             LIns *ins = b->savedRegs[i];
1898             if (ins)
1899                 findMemFor(ins);
1900         }
1901     }
1902
1903     void Assembler::assignParamRegs()
1904     {
1905         LInsp state = _thisfrag->lirbuf->state;
1906         if (state)
1907             findSpecificRegForUnallocated(state, argRegs[state->paramArg()]);
1908         LInsp param1 = _thisfrag->lirbuf->param1;
1909         if (param1)
1910             findSpecificRegForUnallocated(param1, argRegs[param1->paramArg()]);
1911     }
1912
1913     void Assembler::handleLoopCarriedExprs(InsList& pending_lives)
1914     {
1915         // ensure that exprs spanning the loop are marked live at the end of the loop
1916         reserveSavedRegs();
1917         for (Seq<LIns*> *p = pending_lives.get(); p != NULL; p = p->tail) {
1918             LIns *ins = p->head;
1919             NanoAssert(ins->isLive());
1920             LIns *op1 = ins->oprnd1();
1921             // Must findMemFor even if we're going to findRegFor; loop-carried
1922             // operands may spill on another edge, and we need them to always
1923             // spill to the same place.
1924 #if NJ_USES_QUAD_CONSTANTS
1925             // Exception: if float constants are true constants, we should
1926             // never call findMemFor on those ops.
1927             if (!op1->isconstf())
1928 #endif
1929             {
1930                 findMemFor(op1);
1931             }
1932             if (!op1->isImmAny())
1933                 findRegFor(op1, ins->isop(LIR_flive) ? FpRegs : GpRegs);
1934         }
1935
1936         // clear this list since we have now dealt with those lifetimes.  extending
1937         // their lifetimes again later (earlier in the code) serves no purpose.
1938         pending_lives.clear();
1939     }
1940
1941     void AR::freeEntryAt(uint32_t idx)
1942     {
1943         NanoAssert(idx > 0 && idx <= _highWaterMark);
1944
1945         // NB: this loop relies on using entry[0] being NULL,
1946         // so that we are guaranteed to terminate
1947         // without access negative entries.
1948         LIns* i = _entries[idx];
1949         NanoAssert(i != NULL);
1950         do {
1951             _entries[idx] = NULL;
1952             idx--;
1953         } while (_entries[idx] == i);
1954     }
1955
1956 #ifdef NJ_VERBOSE
1957     void Assembler::printRegState()
1958     {
1959         char* s = &outline[0];
1960         VMPI_memset(s, ' ', 26);  s[26] = '\0';
1961         s += VMPI_strlen(s);
1962         VMPI_sprintf(s, "RR");
1963         s += VMPI_strlen(s);
1964
1965         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
1966             LIns *ins = _allocator.getActive(r);
1967             if (ins) {
1968                 NanoAssertMsg(!_allocator.isFree(r),
1969                               "Coding error; register is both free and active! " );
1970                 RefBuf b;
1971                 const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
1972
1973                 if (ins->isop(LIR_param) && ins->paramKind()==1 &&
1974                     r == Assembler::savedRegs[ins->paramArg()])
1975                 {
1976                     // dont print callee-saved regs that arent used
1977                     continue;
1978                 }
1979
1980                 VMPI_sprintf(s, " %s(%s)", gpn(r), n);
1981                 s += VMPI_strlen(s);
1982             }
1983         }
1984         output();
1985     }
1986
1987     void Assembler::printActivationState()
1988     {
1989         char* s = &outline[0];
1990         VMPI_memset(s, ' ', 26);  s[26] = '\0';
1991         s += VMPI_strlen(s);
1992         VMPI_sprintf(s, "AR");
1993         s += VMPI_strlen(s);
1994
1995         LIns* ins = 0;
1996         uint32_t nStackSlots = 0;
1997         int32_t arIndex = 0;
1998         for (AR::Iter iter(_activation); iter.next(ins, nStackSlots, arIndex); )
1999         {
2000             RefBuf b;
2001             const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
2002             if (nStackSlots > 1) {
2003                 VMPI_sprintf(s," %d-%d(%s)", 4*arIndex, 4*(arIndex+nStackSlots-1), n);
2004             }
2005             else {
2006                 VMPI_sprintf(s," %d(%s)", 4*arIndex, n);
2007             }
2008             s += VMPI_strlen(s);
2009         }
2010         output();
2011     }
2012 #endif
2013
2014     inline bool AR::isEmptyRange(uint32_t start, uint32_t nStackSlots) const
2015     {
2016         for (uint32_t i=0; i < nStackSlots; i++)
2017         {
2018             if (_entries[start-i] != NULL)
2019                 return false;
2020         }
2021         return true;
2022     }
2023
2024     uint32_t AR::reserveEntry(LIns* ins)
2025     {
2026         uint32_t const nStackSlots = nStackSlotsFor(ins);
2027
2028         if (nStackSlots == 1)
2029         {
2030             for (uint32_t i = 1; i <= _highWaterMark; i++)
2031             {
2032                 if (_entries[i] == NULL)
2033                 {
2034                     _entries[i] = ins;
2035                     return i;
2036                 }
2037             }
2038             if (_highWaterMark < NJ_MAX_STACK_ENTRY - 1)
2039             {
2040                 NanoAssert(_entries[_highWaterMark+1] == BAD_ENTRY);
2041                 _highWaterMark++;
2042                 _entries[_highWaterMark] = ins;
2043                 return _highWaterMark;
2044              }
2045         }
2046         else
2047         {
2048             // alloc larger block on 8byte boundary.
2049             uint32_t const start = nStackSlots + (nStackSlots & 1);
2050             for (uint32_t i = start; i <= _highWaterMark; i += 2)
2051             {
2052                 if (isEmptyRange(i, nStackSlots))
2053                 {
2054                     // place the entry in the table and mark the instruction with it
2055                     for (uint32_t j=0; j < nStackSlots; j++)
2056                     {
2057                         NanoAssert(i-j <= _highWaterMark);
2058                         NanoAssert(_entries[i-j] == NULL);
2059                         _entries[i-j] = ins;
2060                     }
2061                     return i;
2062                 }
2063             }
2064
2065             // Be sure to account for any 8-byte-round-up when calculating spaceNeeded.
2066             uint32_t const spaceLeft = NJ_MAX_STACK_ENTRY - _highWaterMark - 1;
2067             uint32_t const spaceNeeded = nStackSlots + (_highWaterMark & 1);
2068             if (spaceLeft >= spaceNeeded)
2069             {
2070                 if (_highWaterMark & 1)
2071                 {
2072                     NanoAssert(_entries[_highWaterMark+1] == BAD_ENTRY);
2073                     _entries[_highWaterMark+1] = NULL;
2074                 }
2075                 _highWaterMark += spaceNeeded;
2076                 for (uint32_t j = 0; j < nStackSlots; j++)
2077                 {
2078                     NanoAssert(_highWaterMark-j < NJ_MAX_STACK_ENTRY);
2079                     NanoAssert(_entries[_highWaterMark-j] == BAD_ENTRY);
2080                     _entries[_highWaterMark-j] = ins;
2081                 }
2082                 return _highWaterMark;
2083             }
2084         }
2085         // no space. oh well.
2086         return 0;
2087     }
2088
2089     #ifdef _DEBUG
2090     void AR::checkForResourceLeaks() const
2091     {
2092         for (uint32_t i = 1; i <= _highWaterMark; i++) {
2093             NanoAssertMsgf(_entries[i] == NULL, "frame entry %d wasn't freed\n",4*i);
2094         }
2095     }
2096     #endif
2097
2098     uint32_t Assembler::arReserve(LIns* ins)
2099     {
2100         uint32_t i = _activation.reserveEntry(ins);
2101         if (!i)
2102             setError(StackFull);
2103         return i;
2104     }
2105
2106     void Assembler::arFree(LIns* ins)
2107     {
2108         NanoAssert(ins->isInAr());
2109         uint32_t arIndex = ins->getArIndex();
2110         NanoAssert(arIndex);
2111         NanoAssert(_activation.isValidEntry(arIndex, ins));
2112         _activation.freeEntryAt(arIndex);        // free any stack stack space associated with entry
2113     }
2114
2115     /**
2116      * Move regs around so the SavedRegs contains the highest priority regs.
2117      */
2118     void Assembler::evictScratchRegsExcept(RegisterMask ignore)
2119     {
2120         // Find the top GpRegs that are candidates to put in SavedRegs.
2121
2122         // 'tosave' is a binary heap stored in an array.  The root is tosave[0],
2123         // left child is at i+1, right child is at i+2.
2124
2125         Register tosave[LastReg-FirstReg+1];
2126         int len=0;
2127         RegAlloc *regs = &_allocator;
2128         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
2129             if (rmask(r) & GpRegs & ~ignore) {
2130                 LIns *ins = regs->getActive(r);
2131                 if (ins) {
2132                     if (canRemat(ins)) {
2133                         NanoAssert(ins->getReg() == r);
2134                         evict(ins);
2135                     }
2136                     else {
2137                         int32_t pri = regs->getPriority(r);
2138                         // add to heap by adding to end and bubbling up
2139                         int j = len++;
2140                         while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
2141                             tosave[j] = tosave[j/2];
2142                             j /= 2;
2143                         }
2144                         NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
2145                         tosave[j] = r;
2146                     }
2147                 }
2148             }
2149         }
2150
2151         // Now primap has the live exprs in priority order.
2152         // Allocate each of the top priority exprs to a SavedReg.
2153
2154         RegisterMask allow = SavedRegs;
2155         while (allow && len > 0) {
2156             // get the highest priority var
2157             Register hi = tosave[0];
2158             if (!(rmask(hi) & SavedRegs)) {
2159                 LIns *ins = regs->getActive(hi);
2160                 Register r = findRegFor(ins, allow);
2161                 allow &= ~rmask(r);
2162             }
2163             else {
2164                 // hi is already in a saved reg, leave it alone.
2165                 allow &= ~rmask(hi);
2166             }
2167
2168             // remove from heap by replacing root with end element and bubbling down.
2169             if (allow && --len > 0) {
2170                 Register last = tosave[len];
2171                 int j = 0;
2172                 while (j+1 < len) {
2173                     int child = j+1;
2174                     if (j+2 < len && regs->getPriority(tosave[j+2]) > regs->getPriority(tosave[j+1]))
2175                         child++;
2176                     if (regs->getPriority(last) > regs->getPriority(tosave[child]))
2177                         break;
2178                     tosave[j] = tosave[child];
2179                     j = child;
2180                 }
2181                 tosave[j] = last;
2182             }
2183         }
2184
2185         // now evict everything else.
2186         evictSomeActiveRegs(~(SavedRegs | ignore));
2187     }
2188
2189     void Assembler::evictAllActiveRegs()
2190     {
2191         // generate code to restore callee saved registers
2192         // @todo speed this up
2193         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
2194             evictIfActive(r);
2195         }
2196     }
2197
2198     void Assembler::evictSomeActiveRegs(RegisterMask regs)
2199     {
2200         // generate code to restore callee saved registers
2201         // @todo speed this up
2202         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
2203             if ((rmask(r) & regs)) {
2204                 evictIfActive(r);
2205             }
2206         }
2207     }
2208
2209     /**
2210      * Merge the current regstate with a previously stored version.
2211      *
2212      * Situation                            Change to _allocator
2213      * ---------                            --------------------
2214      * !current & !saved
2215      * !current &  saved                    add saved
2216      *  current & !saved                    evict current (unionRegisterState does nothing)
2217      *  current &  saved & current==saved
2218      *  current &  saved & current!=saved   evict current, add saved
2219      */
2220     void Assembler::intersectRegisterState(RegAlloc& saved)
2221     {
2222         Register regsTodo[LastReg + 1];
2223         LIns* insTodo[LastReg + 1];
2224         int nTodo = 0;
2225
2226         // Do evictions and pops first.
2227         verbose_only(bool shouldMention=false; )
2228         // The obvious thing to do here is to iterate from FirstReg to LastReg.
2229         // viz: for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) ...
2230         // However, on ARM that causes lower-numbered integer registers
2231         // to be be saved at higher addresses, which inhibits the formation
2232         // of load/store multiple instructions.  Hence iterate the loop the
2233         // other way.  The "r <= LastReg" guards against wraparound in
2234         // the case where Register is treated as unsigned and FirstReg is zero.
2235         //
2236         // Note, the loop var is deliberately typed as int (*not* Register)
2237         // to outsmart compilers that will otherwise report
2238         // "error: comparison is always true due to limited range of data type".
2239         for (int ri = LastReg; ri >= FirstReg && ri <= LastReg; ri = int(prevreg(Register(ri))))
2240         {
2241             Register const r = Register(ri);
2242             LIns* curins = _allocator.getActive(r);
2243             LIns* savedins = saved.getActive(r);
2244             if (curins != savedins)
2245             {
2246                 if (savedins) {
2247                     regsTodo[nTodo] = r;
2248                     insTodo[nTodo] = savedins;
2249                     nTodo++;
2250                 }
2251                 if (curins) {
2252                     //_nvprof("intersect-evict",1);
2253                     verbose_only( shouldMention=true; )
2254                     NanoAssert(curins->getReg() == r);
2255                     evict(curins);
2256                 }
2257
2258                 #ifdef NANOJIT_IA32
2259                 if (savedins && (rmask(r) & x87Regs)) {
2260                     verbose_only( shouldMention=true; )
2261                     FSTP(r);
2262                 }
2263                 #endif
2264             }
2265         }
2266         // Now reassign mainline registers.
2267         for (int i = 0; i < nTodo; i++) {
2268             findSpecificRegFor(insTodo[i], regsTodo[i]);
2269         }
2270         verbose_only(
2271             if (shouldMention)
2272                 verbose_outputf("## merging registers (intersect) with existing edge");
2273         )
2274     }
2275
2276     /**
2277      * Merge the current state of the registers with a previously stored version.
2278      *
2279      * Situation                            Change to _allocator
2280      * ---------                            --------------------
2281      * !current & !saved                    none
2282      * !current &  saved                    add saved
2283      *  current & !saved                    none (intersectRegisterState evicts current)
2284      *  current &  saved & current==saved   none
2285      *  current &  saved & current!=saved   evict current, add saved
2286      */
2287     void Assembler::unionRegisterState(RegAlloc& saved)
2288     {
2289         Register regsTodo[LastReg + 1];
2290         LIns* insTodo[LastReg + 1];
2291         int nTodo = 0;
2292
2293         // Do evictions and pops first.
2294         verbose_only(bool shouldMention=false; )
2295         for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
2296         {
2297             LIns* curins = _allocator.getActive(r);
2298             LIns* savedins = saved.getActive(r);
2299             if (curins != savedins)
2300             {
2301                 if (savedins) {
2302                     regsTodo[nTodo] = r;
2303                     insTodo[nTodo] = savedins;
2304                     nTodo++;
2305                 }
2306                 if (curins && savedins) {
2307                     //_nvprof("union-evict",1);
2308                     verbose_only( shouldMention=true; )
2309                     NanoAssert(curins->getReg() == r);
2310                     evict(curins);
2311                 }
2312
2313                 #ifdef NANOJIT_IA32
2314                 if (rmask(r) & x87Regs) {
2315                     if (savedins) {
2316                         FSTP(r);
2317                     }
2318                     else if (curins) {
2319                         // saved state did not have fpu reg allocated,
2320                         // so we must evict here to keep x87 stack balanced.
2321                         evict(curins);
2322                     }
2323                     verbose_only( shouldMention=true; )
2324                 }
2325                 #endif
2326             }
2327         }
2328         // Now reassign mainline registers.
2329         for (int i = 0; i < nTodo; i++) {
2330             findSpecificRegFor(insTodo[i], regsTodo[i]);
2331         }
2332         verbose_only(
2333             if (shouldMention)
2334                 verbose_outputf("## merging registers (union) with existing edge");
2335         )
2336     }
2337
2338     // Scan table for instruction with the lowest priority, meaning it is used
2339     // furthest in the future.
2340     LIns* Assembler::findVictim(RegisterMask allow)
2341     {
2342         NanoAssert(allow);
2343         LIns *ins, *vic = 0;
2344         int allow_pri = 0x7fffffff;
2345         for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
2346         {
2347             if ((allow & rmask(r)) && (ins = _allocator.getActive(r)) != 0)
2348             {
2349                 int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
2350                 if (!vic || pri < allow_pri) {
2351                     vic = ins;
2352                     allow_pri = pri;
2353                 }
2354             }
2355         }
2356         NanoAssert(vic != 0);
2357         return vic;
2358     }
2359
2360 #ifdef NJ_VERBOSE
2361     char Assembler::outline[8192];
2362     char Assembler::outlineEOL[512];
2363
2364     void Assembler::output()
2365     {
2366         // The +1 is for the terminating NUL char.
2367         VMPI_strncat(outline, outlineEOL, sizeof(outline)-(strlen(outline)+1));
2368
2369         if (_outputCache) {
2370             char* str = new (alloc) char[VMPI_strlen(outline)+1];
2371             VMPI_strcpy(str, outline);
2372             _outputCache->insert(str);
2373         } else {
2374             _logc->printf("%s\n", outline);
2375         }
2376
2377         outline[0] = '\0';
2378         outlineEOL[0] = '\0';
2379     }
2380
2381     void Assembler::outputf(const char* format, ...)
2382     {
2383         va_list args;
2384         va_start(args, format);
2385
2386         outline[0] = '\0';
2387         vsprintf(outline, format, args);
2388         output();
2389     }
2390
2391     void Assembler::setOutputForEOL(const char* format, ...)
2392     {
2393         va_list args;
2394         va_start(args, format);
2395
2396         outlineEOL[0] = '\0';
2397         vsprintf(outlineEOL, format, args);
2398     }
2399 #endif // NJ_VERBOSE
2400
2401     void LabelStateMap::add(LIns *label, NIns *addr, RegAlloc &regs) {
2402         LabelState *st = new (alloc) LabelState(addr, regs);
2403         labels.put(label, st);
2404     }
2405
2406     LabelState* LabelStateMap::get(LIns *label) {
2407         return labels.get(label);
2408     }
2409 }
2410 #endif /* FEATURE_NANOJIT */