nanojit / Assembler.cpp
1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is [Open Source Virtual Machine].
18 * The Initial Developer of the Original Code is
19 * Adobe System Incorporated.
20 * Portions created by the Initial Developer are Copyright (C) 2004-2007
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Adobe AS3 Team
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 #include "nanojit.h"
42 #ifdef FEATURE_NANOJIT
44 #ifdef VMCFG_VTUNE
45 #include "../core/CodegenLIR.h"
46 #endif
48 #ifdef _MSC_VER
49 // disable some specific warnings which are normally useful, but pervasive in the code-gen macros
50 #pragma warning(disable:4310) // cast truncates constant value
51 #endif
53 #ifdef VMCFG_VTUNE
54 namespace vtune {
55 using namespace nanojit;
56 void vtuneStart(void*, NIns*);
57 void vtuneEnd(void*, NIns*);
58 void vtuneLine(void*, int, NIns*);
59 void vtuneFile(void*, void*);
61 using namespace vtune;
62 #endif // VMCFG_VTUNE
65 namespace nanojit
67 /**
68 * Need the following:
70 * - merging paths ( build a graph? ), possibly use external rep to drive codegen
72 Assembler::Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config)
73 : codeList(NULL)
74 , alloc(alloc)
75 , _codeAlloc(codeAlloc)
76 , _dataAlloc(dataAlloc)
77 , _thisfrag(NULL)
78 , _branchStateMap(alloc)
79 , _patches(alloc)
80 , _labels(alloc)
81 #if NJ_USES_IMMD_POOL
82 , _immDPool(alloc)
83 #endif
84 , _epilogue(NULL)
85 , _err(None)
86 #if PEDANTIC
87 , pedanticTop(NULL)
88 #endif
89 #ifdef VMCFG_VTUNE
90 , vtuneHandle(NULL)
91 #endif
92 , _config(config)
94 nInit(core);
95 (void)logc;
96 verbose_only( _logc = logc; )
97 verbose_only( _outputCache = 0; )
98 verbose_only( outline[0] = '\0'; )
99 verbose_only( outlineEOL[0] = '\0'; )
101 reset();
104 // Per-opcode register hint table. Default to no hints for all
105 // instructions. It's not marked const because individual back-ends can
106 // install hint values for opcodes of interest in nInit().
107 RegisterMask Assembler::nHints[LIR_sentinel+1] = {
108 #define OP___(op, number, repKind, retType, isCse) \
110 #include "LIRopcode.tbl"
111 #undef OP___
115 #ifdef _DEBUG
117 /*static*/ LIns* const AR::BAD_ENTRY = (LIns*)0xdeadbeef;
119 void AR::validateQuick()
121 NanoAssert(_highWaterMark < NJ_MAX_STACK_ENTRY);
122 NanoAssert(_entries[0] == NULL);
123 // Only check a few entries around _highWaterMark.
124 uint32_t const RADIUS = 4;
125 uint32_t const lo = (_highWaterMark > 1 + RADIUS ? _highWaterMark - RADIUS : 1);
126 uint32_t const hi = (_highWaterMark + 1 + RADIUS < NJ_MAX_STACK_ENTRY ? _highWaterMark + 1 + RADIUS : NJ_MAX_STACK_ENTRY);
127 for (uint32_t i = lo; i <= _highWaterMark; ++i)
128 NanoAssert(_entries[i] != BAD_ENTRY);
129 for (uint32_t i = _highWaterMark+1; i < hi; ++i)
130 NanoAssert(_entries[i] == BAD_ENTRY);
133 void AR::validateFull()
135 NanoAssert(_highWaterMark < NJ_MAX_STACK_ENTRY);
136 NanoAssert(_entries[0] == NULL);
137 for (uint32_t i = 1; i <= _highWaterMark; ++i)
138 NanoAssert(_entries[i] != BAD_ENTRY);
139 for (uint32_t i = _highWaterMark+1; i < NJ_MAX_STACK_ENTRY; ++i)
140 NanoAssert(_entries[i] == BAD_ENTRY);
143 void AR::validate()
145 static uint32_t validateCounter = 0;
146 if (++validateCounter >= 100)
148 validateFull();
149 validateCounter = 0;
151 else
153 validateQuick();
157 #endif
159 inline void AR::clear()
161 _highWaterMark = 0;
162 NanoAssert(_entries[0] == NULL);
163 #ifdef _DEBUG
164 for (uint32_t i = 1; i < NJ_MAX_STACK_ENTRY; ++i)
165 _entries[i] = BAD_ENTRY;
166 #endif
169 bool AR::Iter::next(LIns*& ins, uint32_t& nStackSlots, int32_t& arIndex)
171 while (_i <= _ar._highWaterMark) {
172 ins = _ar._entries[_i];
173 if (ins) {
174 arIndex = _i;
175 nStackSlots = nStackSlotsFor(ins);
176 _i += nStackSlots;
177 return true;
179 _i++;
181 ins = NULL;
182 nStackSlots = 0;
183 arIndex = 0;
184 return false;
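// Usage sketch (illustrative; assumes an Iter is constructed over the AR
// being walked):
//     AR::Iter iter(_activation);
//     LIns* ins; uint32_t slots; int32_t idx;
//     while (iter.next(ins, slots, idx)) {
//         // 'ins' occupies 'slots' stack slots starting at index 'idx'
//     }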
187 void Assembler::arReset()
189 _activation.clear();
190 _branchStateMap.clear();
191 _patches.clear();
192 _labels.clear();
193 #if NJ_USES_IMMD_POOL
194 _immDPool.clear();
195 #endif
198 void Assembler::registerResetAll()
200 nRegisterResetAll(_allocator);
201 _allocator.managed = _allocator.free;
203 // At start, should have some registers free and none active.
204 NanoAssert(0 != _allocator.free);
205 NanoAssert(0 == _allocator.activeMask());
206 #ifdef NANOJIT_IA32
207 debug_only(_fpuStkDepth = 0; )
208 #endif
211 // Legend for register sets: A = allowed, P = preferred, F = free, S = SavedReg.
213 // Finds a register in 'setA___' to store the result of 'ins' (one from
214 // 'set_P__' if possible), evicting one if necessary. Doesn't consider
215 // the prior state of 'ins'.
217 // Nb: 'setA___' comes from the instruction's use, 'set_P__' comes from its def.
218 // Eg. in 'add(call(...), ...)':
219 // - the call's use means setA___==GpRegs;
220 // - the call's def means set_P__==rmask(retRegs[0]).
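// A worked example with hypothetical masks, for illustration only: if
// setA___ = 0b1111, set_P__ = 0b0010, free = 0b1100 and SavedRegs = 0b1000,
// then setA_F_ = 0b1100, setAPF_ = setAPFS = 0 (the preferred register is not
// free) and setA_FS = 0b1000, so the code below picks from setA_FS: a free,
// allowed, callee-saved register wins over a merely free one.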
222 Register Assembler::registerAlloc(LIns* ins, RegisterMask setA___, RegisterMask set_P__)
224 Register r;
225 RegisterMask set__F_ = _allocator.free;
226 RegisterMask setA_F_ = setA___ & set__F_;
228 if (setA_F_) {
229 RegisterMask set___S = SavedRegs;
230 RegisterMask setA_FS = setA_F_ & set___S;
231 RegisterMask setAPF_ = setA_F_ & set_P__;
232 RegisterMask setAPFS = setA_FS & set_P__;
233 RegisterMask set;
235 if (setAPFS) set = setAPFS;
236 else if (setAPF_) set = setAPF_;
237 else if (setA_FS) set = setA_FS;
238 else set = setA_F_;
240 r = nRegisterAllocFromSet(set);
241 _allocator.addActive(r, ins);
242 ins->setReg(r);
243 } else {
244 // Nothing free, steal one.
245 // LSRA says pick the one with the furthest use.
246 LIns* vic = findVictim(setA___);
247 NanoAssert(vic->isInReg());
248 r = vic->getReg();
250 evict(vic);
252 // r ends up staying active, but the LIns defining it changes.
253 _allocator.removeFree(r);
254 _allocator.addActive(r, ins);
255 ins->setReg(r);
258 return r;
261 // Finds a register in 'allow' to store a temporary value (one not
262 // associated with a particular LIns), evicting one if necessary. The
263 // returned register is marked as being free and so can only be safely
264 // used for code generation purposes until the regstate is next inspected
265 // or updated.
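// For example (illustrative): a back-end needing a scratch GpReg while
// expanding a single LIR instruction can call registerAllocTmp(GpRegs);
// because the returned register remains marked free, it must not be relied
// upon across any later call that inspects or updates the regstate.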
266 Register Assembler::registerAllocTmp(RegisterMask allow)
268 LIns dummyIns;
269 Register r = registerAlloc(&dummyIns, allow, /*prefer*/0);
271 // Mark r as free, ready for use as a temporary value.
272 _allocator.removeActive(r);
273 _allocator.addFree(r);
274 return r;
277 void Assembler::codeAlloc(NIns *&start, NIns *&end, NIns *&eip
278 verbose_only(, size_t &nBytes))
280 // save the block we just filled
281 if (start)
282 CodeAlloc::add(codeList, start, end);
284 // CodeAlloc contract: allocations never fail
285 _codeAlloc.alloc(start, end);
286 verbose_only( nBytes += (end - start) * sizeof(NIns); )
287 NanoAssert(uintptr_t(end) - uintptr_t(start) >= (size_t)LARGEST_UNDERRUN_PROT);
288 eip = end;
291 void Assembler::reset()
293 _nIns = 0;
294 _nExitIns = 0;
295 codeStart = codeEnd = 0;
296 exitStart = exitEnd = 0;
297 codeList = 0;
299 nativePageReset();
300 registerResetAll();
301 arReset();
304 #ifdef _DEBUG
305 void Assembler::pageValidate()
307 if (error()) return;
308 // This may be a normal code chunk or an exit code chunk.
309 NanoAssertMsg(codeStart <= _nIns && _nIns <= codeEnd,
310 "Native instruction pointer overstep paging bounds; check overrideProtect for last instruction");
312 #endif
314 #ifdef _DEBUG
316 bool AR::isValidEntry(uint32_t idx, LIns* ins) const
318 return idx > 0 && idx <= _highWaterMark && _entries[idx] == ins;
321 void AR::checkForResourceConsistency(const RegAlloc& regs)
323 validate();
324 for (uint32_t i = 1; i <= _highWaterMark; ++i)
326 LIns* ins = _entries[i];
327 if (!ins)
328 continue;
329 uint32_t arIndex = ins->getArIndex();
330 NanoAssert(arIndex != 0);
331 if (ins->isop(LIR_allocp)) {
332 int const n = i + (ins->size()>>2);
333 for (int j=i+1; j < n; j++) {
334 NanoAssert(_entries[j]==ins);
336 NanoAssert(arIndex == (uint32_t)n-1);
337 i = n-1;
339 else if (ins->isQorD()) {
340 NanoAssert(_entries[i + 1]==ins);
341 i += 1; // skip high word
343 else {
344 NanoAssertMsg(arIndex == i, "Stack record index mismatch");
346 NanoAssertMsg(!ins->isInReg() || regs.isConsistent(ins->getReg(), ins),
347 "Register record mismatch");
351 void Assembler::resourceConsistencyCheck()
353 NanoAssert(!error());
354 #ifdef NANOJIT_IA32
355 // Within the expansion of a single LIR instruction, we may use the x87
356 // stack for unmanaged temporaries. Otherwise, we do not use the x87 stack
357 // as such, but use the top element alone as a single allocatable FP register.
358 // Compensation code must be inserted to keep the stack balanced and avoid
359 // overflow, and the mechanisms for this are rather fragile and IA32-specific.
360 // The predicate below should hold between any pair of instructions within
361 // a basic block, at labels, and just after a conditional branch. Currently,
362 // we enforce this condition between all pairs of instructions, but this is
363 // overly restrictive, and would fail if we did not generate unreachable x87
364 // stack pops following unconditional branches.
365 NanoAssert((_allocator.active[FST0] && _fpuStkDepth == -1) ||
366 (!_allocator.active[FST0] && _fpuStkDepth == 0));
367 #endif
368 _activation.checkForResourceConsistency(_allocator);
369 registerConsistencyCheck();
372 void Assembler::registerConsistencyCheck()
374 RegisterMask managed = _allocator.managed;
375 for (Register r = lsReg(managed); managed; r = nextLsReg(managed, r)) {
376 // A register managed by register allocation must be either
377 // free or active, but not both.
378 if (_allocator.isFree(r)) {
379 NanoAssertMsgf(_allocator.getActive(r)==0,
380 "register %s is free but assigned to ins", gpn(r));
381 } else {
382 // An LIns defining a register must have that register in
383 // its reservation.
384 LIns* ins = _allocator.getActive(r);
385 NanoAssert(ins);
386 NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
390 RegisterMask not_managed = ~_allocator.managed;
391 for (Register r = lsReg(not_managed); not_managed; r = nextLsReg(not_managed, r)) {
392 // A register not managed by register allocation must be
393 // neither free nor active.
394 if (r <= LastReg) {
395 NanoAssert(!_allocator.isFree(r));
396 NanoAssert(!_allocator.getActive(r));
400 #endif /* _DEBUG */
402 void Assembler::findRegFor2(RegisterMask allowa, LIns* ia, Register& ra,
403 RegisterMask allowb, LIns* ib, Register& rb)
405 // There should be some overlap between 'allowa' and 'allowb', else
406 // there's no point calling this function.
407 NanoAssert(allowa & allowb);
409 if (ia == ib) {
410 ra = rb = findRegFor(ia, allowa & allowb); // use intersection(allowa, allowb)
412 } else if (ib->isInRegMask(allowb)) {
413 // 'ib' is already in an allowable reg -- don't let it get evicted
414 // when finding 'ra'.
415 rb = ib->getReg();
416 ra = findRegFor(ia, allowa & ~rmask(rb));
418 } else {
419 ra = findRegFor(ia, allowa);
420 rb = findRegFor(ib, allowb & ~rmask(ra));
424 Register Assembler::findSpecificRegFor(LIns* i, Register w)
426 return findRegFor(i, rmask(w));
429 // Like findRegFor(), but called when the LIns is used as a pointer. It
430 // doesn't have to be called; findRegFor() can still be used, but it can
431 // optimize the LIR_allocp case by indexing off FP, thus saving the use of
432 // a GpReg.
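// For instance, a load from a LIR_allocp base with displacement 8 can be
// emitted as an FP-relative access at (the alloc's AR offset + 8), so no
// GpReg is consumed for the base address.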
434 Register Assembler::getBaseReg(LIns* base, int &d, RegisterMask allow)
436 #if !PEDANTIC
437 if (base->isop(LIR_allocp)) {
438 // The value of a LIR_allocp is a pointer to its stack memory,
439 // which is always relative to FP. So we can just return FP if we
440 // also adjust 'd' (and can do so in a valid manner). Or, in the
441 // PEDANTIC case, we can just assign a register as normal;
442 // findRegFor() will allocate the stack memory for LIR_allocp if
443 // necessary.
444 d += findMemFor(base);
445 return FP;
447 #else
448 (void) d;
449 #endif
450 return findRegFor(base, allow);
453 // Like findRegFor2(), but used for stores where the base value has the
454 // same type as the stored value, eg. in asm_store32() on 32-bit platforms
455 // and asm_store64() on 64-bit platforms. Similar to getBaseReg(),
456 // findRegFor2() can be called instead, but this function can optimize the
457 // case where the base value is a LIR_allocp.
458 void Assembler::getBaseReg2(RegisterMask allowValue, LIns* value, Register& rv,
459 RegisterMask allowBase, LIns* base, Register& rb, int &d)
461 #if !PEDANTIC
462 if (base->isop(LIR_allocp)) {
463 rb = FP;
464 d += findMemFor(base);
465 rv = findRegFor(value, allowValue);
466 return;
468 #else
469 (void) d;
470 #endif
471 findRegFor2(allowValue, value, rv, allowBase, base, rb);
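// Register hints: nHints[] (defined above) supplies a per-opcode preferred
// register mask, and PREFER_SPECIAL entries defer to the back-end's nHint()
// hook. For example (illustrative only), a back-end's nInit() might install
//     nHints[LIR_calli] = rmask(retRegs[0]);
// so that call results prefer the return-value register.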
474 RegisterMask Assembler::hint(LIns* ins)
476 RegisterMask prefer = nHints[ins->opcode()];
477 return (prefer == PREFER_SPECIAL) ? nHint(ins) : prefer;
480 // Finds a register in 'allow' to hold the result of 'ins'. Used when we
481 // encounter a use of 'ins'. The actions depend on the prior regstate of
482 // 'ins':
483 // - If the result of 'ins' is not in any register, we find an allowed
484 // one, evicting one if necessary.
485 // - If the result of 'ins' is already in an allowed register, we use that.
486 // - If the result of 'ins' is already in a not-allowed register, we find an
487 // allowed one and move it.
489 Register Assembler::findRegFor(LIns* ins, RegisterMask allow)
491 if (ins->isop(LIR_allocp)) {
492 // Never allocate a reg for this without stack space too.
493 findMemFor(ins);
496 Register r;
498 if (!ins->isInReg()) {
499 // 'ins' isn't in a register (must be in a spill slot or nowhere).
500 r = registerAlloc(ins, allow, hint(ins));
502 } else if (rmask(r = ins->getReg()) & allow) {
503 // 'ins' is in an allowed register.
504 _allocator.useActive(r);
506 } else {
507 // 'ins' is in a register (r) that's not in 'allow'.
508 #ifdef NANOJIT_IA32
509 if (((rmask(r)&XmmRegs) && !(allow&XmmRegs)) ||
510 ((rmask(r)&x87Regs) && !(allow&x87Regs)))
512 // x87 <-> xmm copy required
513 //_nvprof("fpu-evict",1);
514 evict(ins);
515 r = registerAlloc(ins, allow, hint(ins));
516 } else
517 #elif defined(NANOJIT_PPC) || defined(NANOJIT_MIPS)
518 if (((rmask(r)&GpRegs) && !(allow&GpRegs)) ||
519 ((rmask(r)&FpRegs) && !(allow&FpRegs)))
521 evict(ins);
522 r = registerAlloc(ins, allow, hint(ins));
523 } else
524 #endif
526 // The post-state register holding 'ins' is 's', the pre-state
527 // register holding 'ins' is 'r'. For example, if s=eax and
528 // r=ecx:
530 // pre-state: ecx(ins)
531 // instruction: mov eax, ecx
532 // post-state: eax(ins)
534 Register s = r;
535 _allocator.retire(r);
536 r = registerAlloc(ins, allow, hint(ins));
538 // 'ins' is in 'allow', in register r (different to the old r);
539 // s is the old r.
540 if ((rmask(s) & GpRegs) && (rmask(r) & GpRegs)) {
541 MR(s, r); // move 'ins' from its pre-state reg (r) to its post-state reg (s)
542 } else {
543 asm_nongp_copy(s, r);
548 return r;
551 // Like findSpecificRegFor(), but only for when 'r' is known to be free
552 // and 'ins' is known to not already have a register allocated. Updates
553 // the regstate (maintaining the invariants) but does not generate any
554 // code. The return value is redundant, always being 'r', but it's
555 // sometimes useful to have it there for assignments.
556 Register Assembler::findSpecificRegForUnallocated(LIns* ins, Register r)
558 if (ins->isop(LIR_allocp)) {
559 // never allocate a reg for this w/out stack space too
560 findMemFor(ins);
563 NanoAssert(!ins->isInReg());
564 NanoAssert(_allocator.free & rmask(r));
566 ins->setReg(r);
567 _allocator.removeFree(r);
568 _allocator.addActive(r, ins);
570 return r;
573 #if NJ_USES_IMMD_POOL
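// findImmDFromPool returns a pointer to a pooled copy of the 64-bit
// immediate 'q', allocating one from the data allocator on first use, so
// generated code can reference the constant through a stable address.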
574 const uint64_t* Assembler::findImmDFromPool(uint64_t q)
576 uint64_t* p = _immDPool.get(q);
577 if (!p)
579 p = new (_dataAlloc) uint64_t;
580 *p = q;
581 _immDPool.put(q, p);
583 return p;
585 #endif
587 int Assembler::findMemFor(LIns *ins)
589 #if NJ_USES_IMMD_POOL
590 NanoAssert(!ins->isImmD());
591 #endif
592 if (!ins->isInAr()) {
593 uint32_t const arIndex = arReserve(ins);
594 ins->setArIndex(arIndex);
595 NanoAssert(_activation.isValidEntry(ins->getArIndex(), ins) == (arIndex != 0));
597 return arDisp(ins);
600 // XXX: this function is dangerous and should be phased out;
601 // See bug 513615. Calls to it should be replaced with a
602 // prepareResultReg() / generate code / freeResourcesOf() sequence.
603 Register Assembler::deprecated_prepResultReg(LIns *ins, RegisterMask allow)
605 #ifdef NANOJIT_IA32
606 // We used to have to worry about possibly popping the x87 stack here.
607 // But this function is no longer used on i386, and this assertion
608 // ensures that.
609 NanoAssert(0);
610 #endif
611 Register r = findRegFor(ins, allow);
612 deprecated_freeRsrcOf(ins);
613 return r;
616 // Finds a register in 'allow' to hold the result of 'ins'. Also
617 // generates code to spill the result if necessary. Called just prior to
618 // generating the code for 'ins' (because we generate code backwards).
620 // An example where no spill is necessary. Lines marked '*' are those
621 // done by this function.
623 // regstate: R
624 // asm: define res into r
625 // * regstate: R + r(res)
626 // ...
627 // asm: use res in r
629 // An example where a spill is necessary.
631 // regstate: R
632 // asm: define res into r
633 // * regstate: R + r(res)
634 // * asm: spill res from r
635 // regstate: R
636 // ...
637 // asm: restore res into r2
638 // regstate: R + r2(res) + other changes from "..."
639 // asm: use res in r2
641 Register Assembler::prepareResultReg(LIns *ins, RegisterMask allow)
643 // At this point, we know the result of 'ins' is used later in the
644 // code, unless it is a call to an impure function that must be
645 // included for effect even though its result is ignored. It may have
646 // had to be evicted, in which case the restore will have already been
647 // generated, so we now generate the spill. QUERY: Is there any attempt
648 // to elide the spill if we know that all restores can be rematerialized?
649 #ifdef NANOJIT_IA32
650 const bool notInFST0 = (!ins->isInReg() || ins->getReg() != FST0);
651 Register r = findRegFor(ins, allow);
652 // If the result register is FST0, but FST0 is not in the post-regstate,
653 // then we must pop the x87 stack. This may occur because the result is
654 // unused, or because it has been stored to a spill slot or an XMM register.
655 const bool needPop = notInFST0 && (r == FST0);
656 const bool didSpill = asm_maybe_spill(ins, needPop);
657 if (!didSpill && needPop) {
658 // If the instruction is spilled, then the pop will have already
659 // been performed by the store to the stack slot. Otherwise, we
660 // must pop now. This may occur when the result of a LIR_calld
661 // to an impure (side-effecting) function is not used.
662 FSTP(FST0);
664 #else
665 Register r = findRegFor(ins, allow);
666 asm_maybe_spill(ins, false);
667 #endif
668 return r;
671 bool Assembler::asm_maybe_spill(LIns* ins, bool pop)
673 if (ins->isInAr()) {
674 int d = arDisp(ins);
675 Register r = ins->getReg();
676 verbose_only( RefBuf b;
677 if (_logc->lcbits & LC_Native) {
678 setOutputForEOL(" <= spill %s",
679 _thisfrag->lirbuf->printer->formatRef(&b, ins)); } )
680 #ifdef NANOJIT_IA32
681 asm_spill(r, d, pop);
682 #else
683 (void)pop;
684 asm_spill(r, d, ins->isQorD());
685 #endif
686 return true;
688 return false;
691 // XXX: This function is error-prone and should be phased out; see bug 513615.
692 void Assembler::deprecated_freeRsrcOf(LIns *ins)
694 if (ins->isInReg()) {
695 asm_maybe_spill(ins, /*pop*/false);
696 _allocator.retire(ins->getReg()); // free any register associated with entry
697 ins->clearReg();
699 if (ins->isInAr()) {
700 arFree(ins); // free any AR space associated with entry
701 ins->clearArIndex();
705 // Frees all record of registers and spill slots used by 'ins'.
706 void Assembler::freeResourcesOf(LIns *ins)
708 if (ins->isInReg()) {
709 _allocator.retire(ins->getReg()); // free any register associated with entry
710 ins->clearReg();
712 if (ins->isInAr()) {
713 arFree(ins); // free any AR space associated with entry
714 ins->clearArIndex();
718 // Frees 'r' in the RegAlloc regstate, if it's not already free.
719 void Assembler::evictIfActive(Register r)
721 if (LIns* vic = _allocator.getActive(r)) {
722 NanoAssert(vic->getReg() == r);
723 evict(vic);
727 // Frees 'r' (which currently holds the result of 'vic') in the regstate.
728 // An example:
730 // pre-regstate: eax(ld1)
731 // instruction: mov ebx,-4(ebp) <= restore add1 # %ebx is dest
732 // post-regstate: eax(ld1) ebx(add1)
734 // At run-time we are *restoring* 'add1' into %ebx, hence the call to
735 // asm_restore(). But at regalloc-time we are moving backwards through
736 // the code, so in that sense we are *evicting* 'add1' from %ebx.
738 void Assembler::evict(LIns* vic)
740 // Not free, need to steal.
741 Register r = vic->getReg();
743 NanoAssert(!_allocator.isFree(r));
744 NanoAssert(vic == _allocator.getActive(r));
746 verbose_only( RefBuf b;
747 if (_logc->lcbits & LC_Native) {
748 setOutputForEOL(" <= restore %s",
749 _thisfrag->lirbuf->printer->formatRef(&b, vic)); } )
750 asm_restore(vic, r);
752 _allocator.retire(r);
753 vic->clearReg();
755 // At this point 'vic' is unused (if rematerializable), or in a spill
756 // slot (if not).
759 void Assembler::patch(GuardRecord *lr)
761 if (!lr->jmp) // the guard might have been eliminated as redundant
762 return;
763 Fragment *frag = lr->exit->target;
764 NanoAssert(frag->fragEntry != 0);
765 nPatchBranch((NIns*)lr->jmp, frag->fragEntry);
766 CodeAlloc::flushICache(lr->jmp, LARGEST_BRANCH_PATCH);
767 verbose_only(verbose_outputf("patching jump at %p to target %p\n",
768 lr->jmp, frag->fragEntry);)
771 void Assembler::patch(SideExit *exit)
773 GuardRecord *rec = exit->guards;
774 NanoAssert(rec);
775 while (rec) {
776 patch(rec);
777 rec = rec->next;
781 #ifdef NANOJIT_IA32
782 void Assembler::patch(SideExit* exit, SwitchInfo* si)
784 for (GuardRecord* lr = exit->guards; lr; lr = lr->next) {
785 Fragment *frag = lr->exit->target;
786 NanoAssert(frag->fragEntry != 0);
787 si->table[si->index] = frag->fragEntry;
790 #endif
792 NIns* Assembler::asm_exit(LIns* guard)
794 SideExit *exit = guard->record()->exit;
795 NIns* at = 0;
796 if (!_branchStateMap.get(exit))
798 at = asm_leave_trace(guard);
800 else
802 RegAlloc* captured = _branchStateMap.get(exit);
803 intersectRegisterState(*captured);
804 at = exit->target->fragEntry;
805 NanoAssert(at != 0);
806 _branchStateMap.remove(exit);
808 return at;
811 NIns* Assembler::asm_leave_trace(LIns* guard)
813 verbose_only( verbose_outputf("----------------------------------- ## END exit block %p", guard);)
815 // This point is unreachable. So free all the registers. If an
816 // instruction has a stack entry we will leave it alone, otherwise we
817 // free it entirely. intersectRegisterState() will restore.
818 RegAlloc capture = _allocator;
819 releaseRegisters();
821 swapCodeChunks();
822 _inExit = true;
824 #ifdef NANOJIT_IA32
825 debug_only( _sv_fpuStkDepth = _fpuStkDepth; _fpuStkDepth = 0; )
826 #endif
828 nFragExit(guard);
830 // Restore the callee-saved registers and parameters.
831 assignSavedRegs();
832 assignParamRegs();
834 intersectRegisterState(capture);
836 // this can be useful for breaking whenever an exit is taken
837 //INT3();
838 //NOP();
840 // we are done producing the exit logic for the guard so mark where our exit block code begins
841 NIns* jmpTarget = _nIns; // target in exit path for our mainline conditional jump
843 // swap back pointers, effectively storing the last location used in the exit path
844 swapCodeChunks();
845 _inExit = false;
847 //verbose_only( verbose_outputf(" LIR_xt/xf swapCodeChunks, _nIns is now %08X(%08X), _nExitIns is now %08X(%08X)",_nIns, *_nIns,_nExitIns,*_nExitIns) );
848 verbose_only( verbose_outputf("%p:", jmpTarget);)
849 verbose_only( verbose_outputf("----------------------------------- ## BEGIN exit block (LIR_xt|LIR_xf)") );
851 #ifdef NANOJIT_IA32
852 NanoAssertMsgf(_fpuStkDepth == _sv_fpuStkDepth, "LIR_xtf, _fpuStkDepth=%d, expect %d",_fpuStkDepth, _sv_fpuStkDepth);
853 debug_only( _fpuStkDepth = _sv_fpuStkDepth; _sv_fpuStkDepth = 9999; )
854 #endif
856 return jmpTarget;
859 void Assembler::compile(Fragment* frag, Allocator& alloc, bool optimize verbose_only(, LInsPrinter* printer))
861 verbose_only(
862 bool anyVerb = (_logc->lcbits & 0xFFFF & ~LC_FragProfile) > 0;
863 bool liveVerb = (_logc->lcbits & 0xFFFF & LC_Liveness) > 0;
866 /* BEGIN decorative preamble */
867 verbose_only(
868 if (anyVerb) {
869 _logc->printf("========================================"
870 "========================================\n");
871 _logc->printf("=== BEGIN LIR::compile(%p, %p)\n",
872 (void*)this, (void*)frag);
873 _logc->printf("===\n");
875 /* END decorative preamble */
877 verbose_only( if (liveVerb) {
878 _logc->printf("\n");
879 _logc->printf("=== Results of liveness analysis:\n");
880 _logc->printf("===\n");
881 LirReader br(frag->lastIns);
882 LirFilter* lir = &br;
883 if (optimize) {
884 StackFilter* sf = new (alloc) StackFilter(lir, alloc, frag->lirbuf->sp);
885 lir = sf;
887 live(lir, alloc, frag, _logc);
890 /* Set up the generic text output cache for the assembler */
891 verbose_only( StringList asmOutput(alloc); )
892 verbose_only( _outputCache = &asmOutput; )
894 beginAssembly(frag);
895 if (error())
896 return;
898 //_logc->printf("recompile trigger %X kind %d\n", (int)frag, frag->kind);
900 verbose_only( if (anyVerb) {
901 _logc->printf("=== Translating LIR fragments into assembly:\n");
904 // now the main trunk
905 verbose_only( RefBuf b; )
906 verbose_only( if (anyVerb) {
907 _logc->printf("=== -- Compile trunk %s: begin\n", printer->formatAddr(&b, frag));
910 // Used for debug printing, if needed
911 debug_only(ValidateReader *validate = NULL;)
912 verbose_only(
913 ReverseLister *pp_init = NULL;
914 ReverseLister *pp_after_sf = NULL;
917 // The LIR passes through these filters as listed in this
918 // function, viz, top to bottom.
920 // set up backwards pipeline: assembler <- StackFilter <- LirReader
921 LirFilter* lir = new (alloc) LirReader(frag->lastIns);
923 #ifdef DEBUG
924 // VALIDATION
925 validate = new (alloc) ValidateReader(lir);
926 lir = validate;
927 #endif
929 // INITIAL PRINTING
930 verbose_only( if (_logc->lcbits & LC_ReadLIR) {
931 pp_init = new (alloc) ReverseLister(lir, alloc, frag->lirbuf->printer, _logc,
932 "Initial LIR");
933 lir = pp_init;
936 // STACKFILTER
937 if (optimize) {
938 StackFilter* stackfilter = new (alloc) StackFilter(lir, alloc, frag->lirbuf->sp);
939 lir = stackfilter;
942 verbose_only( if (_logc->lcbits & LC_AfterSF) {
943 pp_after_sf = new (alloc) ReverseLister(lir, alloc, frag->lirbuf->printer, _logc,
944 "After StackFilter");
945 lir = pp_after_sf;
948 assemble(frag, lir);
950 // If we were accumulating debug info in the various ReverseListers,
951 // call finish() to emit whatever contents they have accumulated.
952 verbose_only(
953 if (pp_init) pp_init->finish();
954 if (pp_after_sf) pp_after_sf->finish();
957 verbose_only( if (anyVerb) {
958 _logc->printf("=== -- Compile trunk %s: end\n", printer->formatAddr(&b, frag));
961 endAssembly(frag);
963 // Reverse output so that assembly is displayed low-to-high.
964 // Up to this point, _outputCache has been non-NULL, and so has been
965 // accumulating output. Now we set it to NULL, traverse the entire
966 // list of stored strings, and hand them a second time to output.
967 // Since _outputCache is now NULL, outputf just hands these strings
968 // directly onwards to _logc->printf.
969 verbose_only( if (anyVerb) {
970 _logc->printf("\n");
971 _logc->printf("=== Aggregated assembly output: BEGIN\n");
972 _logc->printf("===\n");
973 _outputCache = 0;
974 for (Seq<char*>* p = asmOutput.get(); p != NULL; p = p->tail) {
975 char *str = p->head;
976 outputf(" %s", str);
978 _logc->printf("===\n");
979 _logc->printf("=== Aggregated assembly output: END\n");
982 if (error())
983 frag->fragEntry = 0;
985 verbose_only( frag->nCodeBytes += codeBytes; )
986 verbose_only( frag->nExitBytes += exitBytes; )
988 /* BEGIN decorative postamble */
989 verbose_only( if (anyVerb) {
990 _logc->printf("\n");
991 _logc->printf("===\n");
992 _logc->printf("=== END LIR::compile(%p, %p)\n",
993 (void*)this, (void*)frag);
994 _logc->printf("========================================"
995 "========================================\n");
996 _logc->printf("\n");
998 /* END decorative postamble */
1001 void Assembler::beginAssembly(Fragment *frag)
1003 verbose_only( codeBytes = 0; )
1004 verbose_only( exitBytes = 0; )
1006 reset();
1008 NanoAssert(codeList == 0);
1009 NanoAssert(codeStart == 0);
1010 NanoAssert(codeEnd == 0);
1011 NanoAssert(exitStart == 0);
1012 NanoAssert(exitEnd == 0);
1013 NanoAssert(_nIns == 0);
1014 NanoAssert(_nExitIns == 0);
1016 _thisfrag = frag;
1017 _inExit = false;
1019 setError(None);
1021 // native code gen buffer setup
1022 nativePageSetup();
1024 // make sure we got at least one page of memory
1025 if (error()) return;
1027 _epilogue = NULL;
1029 nBeginAssembly();
1032 void Assembler::assemble(Fragment* frag, LirFilter* reader)
1034 if (error()) return;
1035 _thisfrag = frag;
1037 // check the fragment is starting out with a sane profiling state
1038 verbose_only( NanoAssert(frag->nStaticExits == 0); )
1039 verbose_only( NanoAssert(frag->nCodeBytes == 0); )
1040 verbose_only( NanoAssert(frag->nExitBytes == 0); )
1041 verbose_only( NanoAssert(frag->profCount == 0); )
1042 verbose_only( if (_logc->lcbits & LC_FragProfile)
1043 NanoAssert(frag->profFragID > 0);
1044 else
1045 NanoAssert(frag->profFragID == 0); )
1047 _inExit = false;
1049 gen(reader);
1051 if (!error()) {
1052 // patch all branches
1053 NInsMap::Iter iter(_patches);
1054 while (iter.next()) {
1055 NIns* where = iter.key();
1056 LIns* target = iter.value();
1057 if (target->isop(LIR_jtbl)) {
1058 // Need to patch up a whole jump table, 'where' is the table.
1059 LIns *jtbl = target;
1060 NIns** native_table = (NIns**) (void *) where;
1061 for (uint32_t i = 0, n = jtbl->getTableSize(); i < n; i++) {
1062 LabelState* lstate = _labels.get(jtbl->getTarget(i));
1063 NIns* ntarget = lstate->addr;
1064 if (ntarget) {
1065 native_table[i] = ntarget;
1066 } else {
1067 setError(UnknownBranch);
1068 break;
1071 } else {
1072 // target is a label for a single-target branch
1073 LabelState *lstate = _labels.get(target);
1074 NIns* ntarget = lstate->addr;
1075 if (ntarget) {
1076 nPatchBranch(where, ntarget);
1077 } else {
1078 setError(UnknownBranch);
1079 break;
1086 void Assembler::endAssembly(Fragment* frag)
1088 // don't try to patch code if we are in an error state since we might have partially
1089 // overwritten the code cache already
1090 if (error()) {
1091 // something went wrong, release all allocated code memory
1092 _codeAlloc.freeAll(codeList);
1093 if (_nExitIns)
1094 _codeAlloc.free(exitStart, exitEnd);
1095 _codeAlloc.free(codeStart, codeEnd);
1096 codeList = NULL;
1097 return;
1100 NIns* fragEntry = genPrologue();
1101 verbose_only( asm_output("[prologue]"); )
1103 debug_only(_activation.checkForResourceLeaks());
1105 NanoAssert(!_inExit);
1106 // save used parts of current block on fragment's code list, free the rest
1107 #if defined(NANOJIT_ARM) || defined(NANOJIT_MIPS)
1108 // [codeStart, _nSlot) ... gap ... [_nIns, codeEnd)
1109 if (_nExitIns) {
1110 _codeAlloc.addRemainder(codeList, exitStart, exitEnd, _nExitSlot, _nExitIns);
1111 verbose_only( exitBytes -= (_nExitIns - _nExitSlot) * sizeof(NIns); )
1113 _codeAlloc.addRemainder(codeList, codeStart, codeEnd, _nSlot, _nIns);
1114 verbose_only( codeBytes -= (_nIns - _nSlot) * sizeof(NIns); )
1115 #else
1116 // [codeStart ... gap ... [_nIns, codeEnd))
1117 if (_nExitIns) {
1118 _codeAlloc.addRemainder(codeList, exitStart, exitEnd, exitStart, _nExitIns);
1119 verbose_only( exitBytes -= (_nExitIns - exitStart) * sizeof(NIns); )
1121 _codeAlloc.addRemainder(codeList, codeStart, codeEnd, codeStart, _nIns);
1122 verbose_only( codeBytes -= (_nIns - codeStart) * sizeof(NIns); )
1123 #endif
1125 // at this point all our new code is in the d-cache and not the i-cache,
1126 // so flush the i-cache on CPUs that need it.
1127 CodeAlloc::flushICache(codeList);
1129 // save entry point pointers
1130 frag->fragEntry = fragEntry;
1131 frag->setCode(_nIns);
1133 #ifdef VMCFG_VTUNE
1134 if (vtuneHandle)
1136 vtuneEnd(vtuneHandle, codeEnd);
1137 vtuneStart(vtuneHandle, _nIns);
1139 #endif
1141 PERFM_NVPROF("code", CodeAlloc::size(codeList));
1143 #ifdef NANOJIT_IA32
1144 NanoAssertMsgf(_fpuStkDepth == 0,"_fpuStkDepth %d\n",_fpuStkDepth);
1145 #endif
1147 debug_only( pageValidate(); )
1148 NanoAssert(_branchStateMap.isEmpty());
1151 void Assembler::releaseRegisters()
1153 RegisterMask active = _allocator.activeMask();
1154 for (Register r = lsReg(active); active; r = nextLsReg(active, r))
1156 LIns *ins = _allocator.getActive(r);
1157 // Clear reg allocation, preserve stack allocation.
1158 _allocator.retire(r);
1159 NanoAssert(r == ins->getReg());
1160 ins->clearReg();
1164 #ifdef PERFM
1165 #define countlir_live() _nvprof("lir-live",1)
1166 #define countlir_ret() _nvprof("lir-ret",1)
1167 #define countlir_alloc() _nvprof("lir-alloc",1)
1168 #define countlir_var() _nvprof("lir-var",1)
1169 #define countlir_use() _nvprof("lir-use",1)
1170 #define countlir_def() _nvprof("lir-def",1)
1171 #define countlir_imm() _nvprof("lir-imm",1)
1172 #define countlir_param() _nvprof("lir-param",1)
1173 #define countlir_cmov() _nvprof("lir-cmov",1)
1174 #define countlir_ld() _nvprof("lir-ld",1)
1175 #define countlir_ldq() _nvprof("lir-ldq",1)
1176 #define countlir_alu() _nvprof("lir-alu",1)
1177 #define countlir_qjoin() _nvprof("lir-qjoin",1)
1178 #define countlir_qlo() _nvprof("lir-qlo",1)
1179 #define countlir_qhi() _nvprof("lir-qhi",1)
1180 #define countlir_fpu() _nvprof("lir-fpu",1)
1181 #define countlir_st() _nvprof("lir-st",1)
1182 #define countlir_stq() _nvprof("lir-stq",1)
1183 #define countlir_jmp() _nvprof("lir-jmp",1)
1184 #define countlir_jcc() _nvprof("lir-jcc",1)
1185 #define countlir_label() _nvprof("lir-label",1)
1186 #define countlir_xcc() _nvprof("lir-xcc",1)
1187 #define countlir_x() _nvprof("lir-x",1)
1188 #define countlir_call() _nvprof("lir-call",1)
1189 #define countlir_jtbl() _nvprof("lir-jtbl",1)
1190 #else
1191 #define countlir_live()
1192 #define countlir_ret()
1193 #define countlir_alloc()
1194 #define countlir_var()
1195 #define countlir_use()
1196 #define countlir_def()
1197 #define countlir_imm()
1198 #define countlir_param()
1199 #define countlir_cmov()
1200 #define countlir_ld()
1201 #define countlir_ldq()
1202 #define countlir_alu()
1203 #define countlir_qjoin()
1204 #define countlir_qlo()
1205 #define countlir_qhi()
1206 #define countlir_fpu()
1207 #define countlir_st()
1208 #define countlir_stq()
1209 #define countlir_jmp()
1210 #define countlir_jcc()
1211 #define countlir_label()
1212 #define countlir_xcc()
1213 #define countlir_x()
1214 #define countlir_call()
1215 #define countlir_jtbl()
1216 #endif
1218 void Assembler::asm_jmp(LIns* ins, InsList& pending_lives)
1220 NanoAssert((ins->isop(LIR_j) && !ins->oprnd1()) ||
1221 (ins->isop(LIR_jf) && ins->oprnd1()->isImmI(0)) ||
1222 (ins->isop(LIR_jt) && ins->oprnd1()->isImmI(1)));
1224 countlir_jmp();
1225 LIns* to = ins->getTarget();
1226 LabelState *label = _labels.get(to);
1227 // The jump is always taken so whatever register state we
1228 // have from downstream code is irrelevant to code before
1229 // this jump. So clear it out. We will pick up register
1230 // state from the jump target, if we have seen that label.
1231 releaseRegisters();
1232 #ifdef NANOJIT_IA32
1233 // Unreachable, so assume correct stack depth.
1234 debug_only( _fpuStkDepth = 0; )
1235 #endif
1236 if (label && label->addr) {
1237 // Forward jump - pick up register state from target.
1238 unionRegisterState(label->regs);
1239 #ifdef NANOJIT_IA32
1240 // Set stack depth according to the register state we just loaded,
1241 // negating the effect of any unreachable x87 stack pop that might
1242 // have been emitted by unionRegisterState().
1243 debug_only( _fpuStkDepth = (_allocator.getActive(FST0) ? -1 : 0); )
1244 #endif
1245 JMP(label->addr);
1247 else {
1248 // Backwards jump.
1249 handleLoopCarriedExprs(pending_lives);
1250 if (!label) {
1251 // save empty register state at loop header
1252 _labels.add(to, 0, _allocator);
1254 else {
1255 intersectRegisterState(label->regs);
1256 #ifdef NANOJIT_IA32
1257 debug_only( _fpuStkDepth = (_allocator.getActive(FST0) ? -1 : 0); )
1258 #endif
1260 JMP(0);
1261 _patches.put(_nIns, to);
1265 void Assembler::asm_jcc(LIns* ins, InsList& pending_lives)
1267 bool branchOnFalse = (ins->opcode() == LIR_jf);
1268 LIns* cond = ins->oprnd1();
1269 if (cond->isImmI()) {
1270 if ((!branchOnFalse && !cond->immI()) || (branchOnFalse && cond->immI())) {
1271 // jmp never taken, not needed
1272 } else {
1273 asm_jmp(ins, pending_lives); // jmp always taken
1275 return;
1278 // Changes to the logic below will likely need to be propagated to Assembler::asm_jov().
1280 countlir_jcc();
1281 LIns* to = ins->getTarget();
1282 LabelState *label = _labels.get(to);
1283 if (label && label->addr) {
1284 // Forward jump to known label. Need to merge with label's register state.
1285 unionRegisterState(label->regs);
1286 asm_branch(branchOnFalse, cond, label->addr);
1288 else {
1289 // Back edge.
1290 handleLoopCarriedExprs(pending_lives);
1291 if (!label) {
1292 // Evict all registers, most conservative approach.
1293 evictAllActiveRegs();
1294 _labels.add(to, 0, _allocator);
1296 else {
1297 // Evict all registers, most conservative approach.
1298 intersectRegisterState(label->regs);
1300 NIns *branch = asm_branch(branchOnFalse, cond, 0);
1301 _patches.put(branch,to);
1305 void Assembler::asm_jov(LIns* ins, InsList& pending_lives)
1307 // The caller is responsible for countlir_* profiling, unlike
1308 // asm_jcc above. The reason for this is that asm_jov may not be
1309 // called if the instruction is dead, and it is our convention
1310 // to count such instructions anyway.
1311 LOpcode op = ins->opcode();
1312 LIns* to = ins->getTarget();
1313 LabelState *label = _labels.get(to);
1314 if (label && label->addr) {
1315 // forward jump to known label. need to merge with label's register state.
1316 unionRegisterState(label->regs);
1317 asm_branch_ov(op, label->addr);
1319 else {
1320 // back edge.
1321 handleLoopCarriedExprs(pending_lives);
1322 if (!label) {
1323 // evict all registers, most conservative approach.
1324 evictAllActiveRegs();
1325 _labels.add(to, 0, _allocator);
1327 else {
1328 // evict all registers, most conservative approach.
1329 intersectRegisterState(label->regs);
1331 NIns *branch = asm_branch_ov(op, 0);
1332 _patches.put(branch,to);
1336 void Assembler::asm_x(LIns* ins)
1338 verbose_only( _thisfrag->nStaticExits++; )
1339 countlir_x();
1340 // Generate the side exit branch on the main trace.
1341 NIns *exit = asm_exit(ins);
1342 JMP(exit);
1345 void Assembler::asm_xcc(LIns* ins)
1347 LIns* cond = ins->oprnd1();
1348 if (cond->isImmI()) {
1349 if ((ins->isop(LIR_xt) && !cond->immI()) || (ins->isop(LIR_xf) && cond->immI())) {
1350 // guard never taken, not needed
1351 } else {
1352 asm_x(ins); // guard always taken
1354 return;
1357 verbose_only( _thisfrag->nStaticExits++; )
1358 countlir_xcc();
1359 // We only support cmp with guard right now, also assume it is 'close'
1360 // and only emit the branch.
1361 NIns* exit = asm_exit(ins); // does intersectRegisterState()
1362 asm_branch(ins->opcode() == LIR_xf, cond, exit);
1365 void Assembler::gen(LirFilter* reader)
1367 NanoAssert(_thisfrag->nStaticExits == 0);
1369 InsList pending_lives(alloc);
1371 NanoAssert(!error());
1373 // What's going on here: we're visiting all the LIR instructions in
1374 // the buffer, working strictly backwards in buffer-order, and
1375 // generating machine instructions for them as we go.
1377 // For each LIns, we first check if it's live. If so we mark its
1378 // operands as also live, and then generate code for it *if
1379 // necessary*. It may not be necessary if the instruction is an
1380 // expression and code has already been generated for all its uses in
1381 // combination with previously handled instructions (ins->isExtant()
1382 // will return false if this is so).
1384 // Note that the backwards code traversal can make register allocation
1385 // confusing. (For example, we restore a value before we spill it!)
1386 // In particular, words like "before" and "after" must be used very
1387 // carefully -- their meaning at regalloc-time is opposite to their
1388 // meaning at run-time. We use the term "pre-regstate" to refer to
1389 // the register allocation state that occurs prior to an instruction's
1390 // execution, and "post-regstate" to refer to the state that occurs
1391 // after an instruction's execution, e.g.:
1393 // pre-regstate: ebx(ins)
1394 // instruction: mov eax, ebx // mov dst, src
1395 // post-regstate: eax(ins)
1397 // At run-time, the instruction updates the pre-regstate into the
1398 // post-regstate (and these states are the real machine's regstates).
1399 // But when allocating registers, because we go backwards, the
1400 // pre-regstate is constructed from the post-regstate (and these
1401 // regstates are those stored in RegAlloc).
1403 // One consequence of generating code backwards is that we tend to
1404 // both spill and restore registers as early (at run-time) as
1405 // possible; this is good for tolerating memory latency. If we
1406 // generated code forwards, we would expect to both spill and restore
1407 // registers as late (at run-time) as possible; this might be better
1408 // for reducing register pressure.
1410 // The trace must end with one of these opcodes. Mark it as live.
1411 NanoAssert(reader->finalIns()->isop(LIR_x) ||
1412 reader->finalIns()->isop(LIR_xtbl) ||
1413 reader->finalIns()->isRet() ||
1414 isLiveOpcode(reader->finalIns()->opcode()));
1416 for (currIns = reader->read(); !currIns->isop(LIR_start); currIns = reader->read())
1418 LIns* ins = currIns; // give it a shorter name for local use
1420 if (!ins->isLive()) {
1421 NanoAssert(!ins->isExtant());
1422 continue;
1425 #ifdef NJ_VERBOSE
1426 // Output the post-regstate (registers and/or activation).
1427 // Because asm output comes in reverse order, doing it now means
1428 // it is printed after the LIR and native code, exactly when the
1429 // post-regstate should be shown.
1430 if ((_logc->lcbits & LC_Native) && (_logc->lcbits & LC_Activation))
1431 printActivationState();
1432 if ((_logc->lcbits & LC_Native) && (_logc->lcbits & LC_RegAlloc))
1433 printRegState();
1434 #endif
1436 LOpcode op = ins->opcode();
1437 switch (op)
1439 default:
1440 NanoAssertMsgf(false, "unsupported LIR instruction: %d\n", op);
1441 break;
1443 case LIR_regfence:
1444 evictAllActiveRegs();
1445 break;
1447 case LIR_livei:
1448 CASE64(LIR_liveq:)
1449 case LIR_lived: {
1450 countlir_live();
1451 LIns* op1 = ins->oprnd1();
1452 op1->setResultLive();
1453 // LIR_allocp's are meant to live until the point of the
1454 // LIR_livep instruction; marking other expressions as
1455 // live ensures that they remain so at loop bottoms.
1456 // LIR_allocp areas require special treatment because they
1457 // are accessed indirectly and the indirect accesses are
1458 // invisible to the assembler, other than via LIR_livep.
1459 // Other expression results are only accessed directly in
1460 // ways that are visible to the assembler, so extending
1461 // those expressions' lifetimes past the last loop edge
1462 // isn't necessary.
1463 if (op1->isop(LIR_allocp)) {
1464 findMemFor(op1);
1465 } else {
1466 pending_lives.add(ins);
1468 break;
1471 case LIR_reti:
1472 CASE64(LIR_retq:)
1473 case LIR_retd:
1474 countlir_ret();
1475 ins->oprnd1()->setResultLive();
1476 asm_ret(ins);
1477 break;
1479 // Allocate some stack space. The value of this instruction
1480 // is the address of the stack space.
1481 case LIR_allocp:
1482 countlir_alloc();
1483 if (ins->isExtant()) {
1484 NanoAssert(ins->isInAr());
1485 if (ins->isInReg())
1486 evict(ins);
1487 freeResourcesOf(ins);
1489 break;
1491 case LIR_immi:
1492 countlir_imm();
1493 if (ins->isExtant()) {
1494 asm_immi(ins);
1496 break;
1498 #ifdef NANOJIT_64BIT
1499 case LIR_immq:
1500 countlir_imm();
1501 if (ins->isExtant()) {
1502 asm_immq(ins);
1504 break;
1505 #endif
1506 case LIR_immd:
1507 countlir_imm();
1508 if (ins->isExtant()) {
1509 asm_immd(ins);
1511 break;
1513 case LIR_paramp:
1514 countlir_param();
1515 if (ins->isExtant()) {
1516 asm_param(ins);
1518 break;
1520 #if NJ_SOFTFLOAT_SUPPORTED
1521 case LIR_hcalli: {
1522 LIns* op1 = ins->oprnd1();
1523 op1->setResultLive();
1524 if (ins->isExtant()) {
1525 // Return result of quad-call in register.
1526 deprecated_prepResultReg(ins, rmask(retRegs[1]));
1527 // If hi half was used, we must use the call to ensure it happens.
1528 findSpecificRegFor(op1, retRegs[0]);
1530 break;
1533 case LIR_dlo2i:
1534 countlir_qlo();
1535 ins->oprnd1()->setResultLive();
1536 if (ins->isExtant()) {
1537 asm_qlo(ins);
1539 break;
1541 case LIR_dhi2i:
1542 countlir_qhi();
1543 ins->oprnd1()->setResultLive();
1544 if (ins->isExtant()) {
1545 asm_qhi(ins);
1547 break;
1549 case LIR_ii2d:
1550 countlir_qjoin();
1551 ins->oprnd1()->setResultLive();
1552 ins->oprnd2()->setResultLive();
1553 if (ins->isExtant()) {
1554 asm_qjoin(ins);
1556 break;
1557 #endif
1558 case LIR_cmovi:
1559 CASE64(LIR_cmovq:)
1560 case LIR_cmovd:
1561 countlir_cmov();
1562 ins->oprnd1()->setResultLive();
1563 ins->oprnd2()->setResultLive();
1564 ins->oprnd3()->setResultLive();
1565 if (ins->isExtant()) {
1566 asm_cmov(ins);
1568 break;
1570 case LIR_lduc2ui:
1571 case LIR_ldus2ui:
1572 case LIR_ldc2i:
1573 case LIR_lds2i:
1574 case LIR_ldi:
1575 countlir_ld();
1576 ins->oprnd1()->setResultLive();
1577 if (ins->isExtant()) {
1578 asm_load32(ins);
1580 break;
1582 CASE64(LIR_ldq:)
1583 case LIR_ldd:
1584 case LIR_ldf2d:
1585 countlir_ldq();
1586 ins->oprnd1()->setResultLive();
1587 if (ins->isExtant()) {
1588 asm_load64(ins);
1590 break;
1592 case LIR_negi:
1593 case LIR_noti:
1594 countlir_alu();
1595 ins->oprnd1()->setResultLive();
1596 if (ins->isExtant()) {
1597 asm_neg_not(ins);
1599 break;
1601 #if defined NANOJIT_64BIT
1602 case LIR_addq:
1603 case LIR_subq:
1604 case LIR_andq:
1605 case LIR_lshq:
1606 case LIR_rshuq:
1607 case LIR_rshq:
1608 case LIR_orq:
1609 case LIR_xorq:
1610 countlir_alu();
1611 ins->oprnd1()->setResultLive();
1612 ins->oprnd2()->setResultLive();
1613 if (ins->isExtant()) {
1614 asm_qbinop(ins);
1616 break;
1617 #endif
1619 case LIR_addi:
1620 case LIR_subi:
1621 case LIR_muli:
1622 case LIR_andi:
1623 case LIR_ori:
1624 case LIR_xori:
1625 case LIR_lshi:
1626 case LIR_rshi:
1627 case LIR_rshui:
1628 CASE86(LIR_divi:)
1629 countlir_alu();
1630 ins->oprnd1()->setResultLive();
1631 ins->oprnd2()->setResultLive();
1632 if (ins->isExtant()) {
1633 asm_arith(ins);
1635 break;
1637 #if defined NANOJIT_IA32 || defined NANOJIT_X64
1638 CASE86(LIR_modi:)
1639 countlir_alu();
1640 ins->oprnd1()->setResultLive();
1641 if (ins->isExtant()) {
1642 asm_arith(ins);
1644 break;
1645 #endif
1647 case LIR_negd:
1648 countlir_fpu();
1649 ins->oprnd1()->setResultLive();
1650 if (ins->isExtant()) {
1651 asm_fneg(ins);
1653 break;
1655 case LIR_addd:
1656 case LIR_subd:
1657 case LIR_muld:
1658 case LIR_divd:
1659 countlir_fpu();
1660 ins->oprnd1()->setResultLive();
1661 ins->oprnd2()->setResultLive();
1662 if (ins->isExtant()) {
1663 asm_fop(ins);
1665 break;
1667 case LIR_i2d:
1668 countlir_fpu();
1669 ins->oprnd1()->setResultLive();
1670 if (ins->isExtant()) {
1671 asm_i2d(ins);
1673 break;
1675 case LIR_ui2d:
1676 countlir_fpu();
1677 ins->oprnd1()->setResultLive();
1678 if (ins->isExtant()) {
1679 asm_ui2d(ins);
1681 break;
1683 case LIR_d2i:
1684 countlir_fpu();
1685 ins->oprnd1()->setResultLive();
1686 if (ins->isExtant()) {
1687 asm_d2i(ins);
1689 break;
1691 #ifdef NANOJIT_64BIT
1692 case LIR_i2q:
1693 case LIR_ui2uq:
1694 countlir_alu();
1695 ins->oprnd1()->setResultLive();
1696 if (ins->isExtant()) {
1697 asm_ui2uq(ins);
1699 break;
1701 case LIR_q2i:
1702 countlir_alu();
1703 ins->oprnd1()->setResultLive();
1704 if (ins->isExtant()) {
1705 asm_q2i(ins);
1707 break;
1709 case LIR_dasq:
1710 countlir_alu();
1711 ins->oprnd1()->setResultLive();
1712 if (ins->isExtant()) {
1713 asm_dasq(ins);
1715 break;
1717 case LIR_qasd:
1718 countlir_alu();
1719 ins->oprnd1()->setResultLive();
1720 if (ins->isExtant()) {
1721 asm_qasd(ins);
1723 break;
1724 #endif
1725 case LIR_sti2c:
1726 case LIR_sti2s:
1727 case LIR_sti:
1728 countlir_st();
1729 ins->oprnd1()->setResultLive();
1730 ins->oprnd2()->setResultLive();
1731 asm_store32(op, ins->oprnd1(), ins->disp(), ins->oprnd2());
1732 break;
1734 CASE64(LIR_stq:)
1735 case LIR_std:
1736 case LIR_std2f: {
1737 countlir_stq();
1738 ins->oprnd1()->setResultLive();
1739 ins->oprnd2()->setResultLive();
1740 LIns* value = ins->oprnd1();
1741 LIns* base = ins->oprnd2();
1742 int dr = ins->disp();
1743 #if NJ_SOFTFLOAT_SUPPORTED
1744 if (value->isop(LIR_ii2d) && op == LIR_std)
1746 // This is correct for little-endian only.
1747 asm_store32(LIR_sti, value->oprnd1(), dr, base);
1748 asm_store32(LIR_sti, value->oprnd2(), dr+4, base);
1750 else
1751 #endif
1753 asm_store64(op, value, dr, base);
1755 break;
1758 case LIR_j:
1759 asm_jmp(ins, pending_lives);
1760 break;
1762 case LIR_jt:
1763 case LIR_jf:
1764 ins->oprnd1()->setResultLive();
1765 asm_jcc(ins, pending_lives);
1766 break;
1768 #if NJ_JTBL_SUPPORTED
1769 case LIR_jtbl: {
1770 countlir_jtbl();
1771 ins->oprnd1()->setResultLive();
1772 // Multiway jump can contain both forward and backward jumps.
1773 // Out of range indices aren't allowed or checked.
1774 // Code after this jtbl instruction is unreachable.
1775 releaseRegisters();
1776 NanoAssert(_allocator.activeMask() == 0);
1778 uint32_t count = ins->getTableSize();
1779 bool has_back_edges = false;
1781 // Merge the regstates of labels we have already seen.
1782 for (uint32_t i = count; i-- > 0;) {
1783 LIns* to = ins->getTarget(i);
1784 LabelState *lstate = _labels.get(to);
1785 if (lstate) {
1786 unionRegisterState(lstate->regs);
1787 verbose_only( RefBuf b; )
1788 asm_output(" %u: [&%s]", i, _thisfrag->lirbuf->printer->formatRef(&b, to));
1789 } else {
1790 has_back_edges = true;
1793 asm_output("forward edges");
1795 // In a multi-way jump, the register allocator has no ability to deal
1796 // with two existing edges that have conflicting register assignments, unlike
1797 // a conditional branch where code can be inserted on the fall-through path
1798 // to reconcile registers. So, frontends *must* insert LIR_regfence at labels of
1799 // forward jtbl jumps. Check here to make sure no registers were picked up from
1800 // any forward edges.
1801 NanoAssert(_allocator.activeMask() == 0);
1803 if (has_back_edges) {
1804 handleLoopCarriedExprs(pending_lives);
1805 // save merged (empty) register state at target labels we haven't seen yet
1806 for (uint32_t i = count; i-- > 0;) {
1807 LIns* to = ins->getTarget(i);
1808 LabelState *lstate = _labels.get(to);
1809 if (!lstate) {
1810 _labels.add(to, 0, _allocator);
1811 verbose_only( RefBuf b; )
1812 asm_output(" %u: [&%s]", i, _thisfrag->lirbuf->printer->formatRef(&b, to));
1815 asm_output("backward edges");
1818 // Emit the jump instruction, which allocates 1 register for the jump index.
1819 NIns** native_table = new (_dataAlloc) NIns*[count];
1820 asm_output("[%p]:", (void*)native_table);
1821 _patches.put((NIns*)native_table, ins);
1822 asm_jtbl(ins, native_table);
1823 break;
1825 #endif
1827 case LIR_label: {
1828 countlir_label();
1829 LabelState *label = _labels.get(ins);
1830 // add profiling inc, if necessary.
1831 verbose_only( if (_logc->lcbits & LC_FragProfile) {
1832 if (ins == _thisfrag->loopLabel)
1833 asm_inc_m32(& _thisfrag->profCount);
1835 if (!label) {
1836 // label seen first, normal target of forward jump, save addr & allocator
1837 _labels.add(ins, _nIns, _allocator);
1839 else {
1840 // we're at the top of a loop
1841 NanoAssert(label->addr == 0);
1842 //evictAllActiveRegs();
1843 intersectRegisterState(label->regs);
1844 label->addr = _nIns;
1846 verbose_only(
1847 RefBuf b;
1848 if (_logc->lcbits & LC_Native) {
1849 asm_output("[%s]", _thisfrag->lirbuf->printer->formatRef(&b, ins));
1851 break;
1854 case LIR_xbarrier:
1855 break;
1857 case LIR_xtbl: {
1858 ins->oprnd1()->setResultLive();
1859 #ifdef NANOJIT_IA32
1860 NIns* exit = asm_exit(ins); // does intersectRegisterState()
1861 asm_switch(ins, exit);
1862 #else
1863 NanoAssertMsg(0, "Not supported for this architecture");
1864 #endif
1865 break;
1868 case LIR_xt:
1869 case LIR_xf:
1870 ins->oprnd1()->setResultLive();
1871 asm_xcc(ins);
1872 break;
1874 case LIR_x:
1875 asm_x(ins);
1876 break;
1878 case LIR_addxovi:
1879 case LIR_subxovi:
1880 case LIR_mulxovi:
1881 verbose_only( _thisfrag->nStaticExits++; )
1882 countlir_xcc();
1883 countlir_alu();
1884 ins->oprnd1()->setResultLive();
1885 ins->oprnd2()->setResultLive();
1886 if (ins->isExtant()) {
1887 NIns* exit = asm_exit(ins); // does intersectRegisterState()
1888 asm_branch_ov(op, exit);
1889 asm_arith(ins);
1891 break;
1893 case LIR_addjovi:
1894 case LIR_subjovi:
1895 case LIR_muljovi:
1896 countlir_jcc();
1897 countlir_alu();
1898 ins->oprnd1()->setResultLive();
1899 ins->oprnd2()->setResultLive();
1900 if (ins->isExtant()) {
1901 asm_jov(ins, pending_lives);
1902 asm_arith(ins);
1904 break;
1906 #ifdef NANOJIT_64BIT
1907 case LIR_addjovq:
1908 case LIR_subjovq:
1909 countlir_jcc();
1910 countlir_alu();
1911 ins->oprnd1()->setResultLive();
1912 ins->oprnd2()->setResultLive();
1913 if (ins->isExtant()) {
1914 asm_jov(ins, pending_lives);
1915 asm_qbinop(ins);
1917 break;
1918 #endif
1920 case LIR_eqd:
1921 case LIR_led:
1922 case LIR_ltd:
1923 case LIR_gtd:
1924 case LIR_ged:
1925 countlir_fpu();
1926 ins->oprnd1()->setResultLive();
1927 ins->oprnd2()->setResultLive();
1928 if (ins->isExtant()) {
1929 asm_condd(ins);
1931 break;
1933 case LIR_eqi:
1934 case LIR_lei:
1935 case LIR_lti:
1936 case LIR_gti:
1937 case LIR_gei:
1938 case LIR_ltui:
1939 case LIR_leui:
1940 case LIR_gtui:
1941 case LIR_geui:
1942 CASE64(LIR_eqq:)
1943 CASE64(LIR_leq:)
1944 CASE64(LIR_ltq:)
1945 CASE64(LIR_gtq:)
1946 CASE64(LIR_geq:)
1947 CASE64(LIR_ltuq:)
1948 CASE64(LIR_leuq:)
1949 CASE64(LIR_gtuq:)
1950 CASE64(LIR_geuq:)
1951 countlir_alu();
1952 ins->oprnd1()->setResultLive();
1953 ins->oprnd2()->setResultLive();
1954 if (ins->isExtant()) {
1955 asm_cond(ins);
1957 break;
1959 case LIR_calli:
1960 CASE64(LIR_callq:)
1961 case LIR_calld:
1962 countlir_call();
1963 for (int i = 0, argc = ins->argc(); i < argc; i++)
1964 ins->arg(i)->setResultLive();
1965 // It must be impure or pure-and-extant -- it couldn't be
1966 // pure-and-not-extant, because there's no way the codegen
1967 // for a call can be folded into the codegen of another
1968 // LIR instruction.
1969 NanoAssert(!ins->callInfo()->_isPure || ins->isExtant());
1970 asm_call(ins);
1971 break;
1973 #ifdef VMCFG_VTUNE
1974 case LIR_file: {
1975 // we traverse backwards so we are now hitting the file
1976 // that is associated with a bunch of LIR_lines we already have seen
1977 if (vtuneHandle) {
1978 void * currentFile = (void *) ins->oprnd1()->immI();
1979 vtuneFile(vtuneHandle, currentFile);
1981 break;
1983 case LIR_line: {
1984 // add a new table entry; we don't yet know which file it belongs
1985 // to, so we need to add it to the update table too
1986 // note the alloc, actual act is delayed; see above
1987 if (vtuneHandle) {
1988 uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
1989 vtuneLine(vtuneHandle, currentLine, _nIns);
1991 break;
1993 #endif // VMCFG_VTUNE
1997 #ifdef NJ_VERBOSE
1998 // We do final LIR printing inside this loop to avoid printing
1999 // dead LIR instructions. We print the LIns after generating the
2000 // code. This ensures that the LIns will appear in debug output
2001 // *before* the native code, because Assembler::outputf()
2002 // prints everything in reverse.
2004 if (_logc->lcbits & LC_AfterDCE) {
2005 InsBuf b;
2006 LInsPrinter* printer = _thisfrag->lirbuf->printer;
2007 outputf(" %s", printer->formatIns(&b, ins));
2009 #endif
2011 if (error())
2012 return;
2014 // check that all is well (don't check in exit paths since it's more complicated)
2015 debug_only( pageValidate(); )
2016 debug_only( resourceConsistencyCheck(); )
2021 * Write a jump table for the given SwitchInfo and store the table
2022 * address in the SwitchInfo. Every entry will initially point to
2023 * target.
2025 void Assembler::emitJumpTable(SwitchInfo* si, NIns* target)
2027 si->table = (NIns **) alloc.alloc(si->count * sizeof(NIns*));
2028 for (uint32_t i = 0; i < si->count; ++i)
2029 si->table[i] = target;
2032 void Assembler::assignSavedRegs()
2034 // Restore saved registers.
2035 LirBuffer *b = _thisfrag->lirbuf;
2036 for (int i=0, n = NumSavedRegs; i < n; i++) {
2037 LIns *p = b->savedRegs[i];
2038 if (p)
2039 findSpecificRegForUnallocated(p, savedRegs[p->paramArg()]);
2043 void Assembler::reserveSavedRegs()
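// Reserve a stack slot for each callee-saved register placeholder recorded
// in the LirBuffer, so those values have a home in the activation record.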
2045 LirBuffer *b = _thisfrag->lirbuf;
2046 for (int i = 0, n = NumSavedRegs; i < n; i++) {
2047 LIns *ins = b->savedRegs[i];
2048 if (ins)
2049 findMemFor(ins);
2053 void Assembler::assignParamRegs()
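// Pin the fragment's incoming 'state' and 'param1' instructions to the
// argument registers dictated by the calling convention (argRegs).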
2055 LIns* state = _thisfrag->lirbuf->state;
2056 if (state)
2057 findSpecificRegForUnallocated(state, argRegs[state->paramArg()]);
2058 LIns* param1 = _thisfrag->lirbuf->param1;
2059 if (param1)
2060 findSpecificRegForUnallocated(param1, argRegs[param1->paramArg()]);
2063 void Assembler::handleLoopCarriedExprs(InsList& pending_lives)
2065 // ensure that exprs spanning the loop are marked live at the end of the loop
2066 reserveSavedRegs();
2067 for (Seq<LIns*> *p = pending_lives.get(); p != NULL; p = p->tail) {
2068 LIns *ins = p->head;
2069 NanoAssert(isLiveOpcode(ins->opcode()));
2070 LIns *op1 = ins->oprnd1();
2071 // Must findMemFor even if we're going to findRegFor; loop-carried
2072 // operands may spill on another edge, and we need them to always
2073 // spill to the same place.
2074 #if NJ_USES_IMMD_POOL
2075 // Exception: if float constants are true constants, we should
2076 // never call findMemFor on those ops.
2077 if (!op1->isImmD())
2078 #endif
2080 findMemFor(op1);
2082 if (!op1->isImmAny())
2083 findRegFor(op1, ins->isop(LIR_lived) ? FpRegs : GpRegs);
2086 // Clear this list since we have now dealt with those lifetimes; extending
2087 // their lifetimes again later (earlier in the code) serves no purpose.
2088 pending_lives.clear();
2091 void AR::freeEntryAt(uint32_t idx)
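// All slots belonging to a multi-slot entry hold the same LIns pointer (see
// reserveEntry), so walking downwards while the pointer repeats releases the
// whole entry.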
2093 NanoAssert(idx > 0 && idx <= _highWaterMark);
2095 // NB: this loop relies on entry[0] being NULL,
2096 // so that we are guaranteed to terminate
2097 // without accessing negative entries.
2098 LIns* i = _entries[idx];
2099 NanoAssert(i != NULL);
2100 do {
2101 _entries[idx] = NULL;
2102 idx--;
2103 } while (_entries[idx] == i);
2106 #ifdef NJ_VERBOSE
2107 void Assembler::printRegState()
2109 char* s = &outline[0];
2110 VMPI_memset(s, ' ', 26); s[26] = '\0';
2111 s += VMPI_strlen(s);
2112 VMPI_sprintf(s, "RR");
2113 s += VMPI_strlen(s);
2115 RegisterMask active = _allocator.activeMask();
2116 for (Register r = lsReg(active); active != 0; r = nextLsReg(active, r)) {
2117 LIns *ins = _allocator.getActive(r);
2118 NanoAssertMsg(!_allocator.isFree(r),
2119 "Coding error; register is both free and active! " );
2120 RefBuf b;
2121 const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
2123 if (ins->isop(LIR_paramp) && ins->paramKind()==1 &&
2124 r == Assembler::savedRegs[ins->paramArg()])
2126 // don't print callee-saved regs that aren't used
2127 continue;
2130 VMPI_sprintf(s, " %s(%s)", gpn(r), n);
2131 s += VMPI_strlen(s);
2133 output();
2136 void Assembler::printActivationState()
2138 char* s = &outline[0];
2139 VMPI_memset(s, ' ', 26); s[26] = '\0';
2140 s += VMPI_strlen(s);
2141 VMPI_sprintf(s, "AR");
2142 s += VMPI_strlen(s);
2144 LIns* ins = 0;
2145 uint32_t nStackSlots = 0;
2146 int32_t arIndex = 0;
2147 for (AR::Iter iter(_activation); iter.next(ins, nStackSlots, arIndex); )
2149 RefBuf b;
2150 const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
2151 if (nStackSlots > 1) {
2152 VMPI_sprintf(s," %d-%d(%s)", 4*arIndex, 4*(arIndex+nStackSlots-1), n);
2154 else {
2155 VMPI_sprintf(s," %d(%s)", 4*arIndex, n);
2157 s += VMPI_strlen(s);
2159 output();
2161 #endif
2163 inline bool AR::isEmptyRange(uint32_t start, uint32_t nStackSlots) const
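// True if the nStackSlots entries ending at index 'start' (counting
// downwards) are all unused.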
2165 for (uint32_t i=0; i < nStackSlots; i++)
2167 if (_entries[start-i] != NULL)
2168 return false;
2170 return true;
2173 uint32_t AR::reserveEntry(LIns* ins)
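// Allocation strategy, roughly:
//  - single-slot entries: first-fit scan of the slots in use, otherwise grow
//    the high-water mark by one;
//  - multi-slot entries: scan even indices for a hole of nStackSlots free
//    slots (see the 8-byte boundary comment below), otherwise grow the
//    high-water mark, padding by one slot first if it is currently odd.
// Returns the highest index of the reserved block, or 0 if the frame is full.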
2175 uint32_t const nStackSlots = nStackSlotsFor(ins);
2177 if (nStackSlots == 1)
2179 for (uint32_t i = 1; i <= _highWaterMark; i++)
2181 if (_entries[i] == NULL)
2183 _entries[i] = ins;
2184 return i;
2187 if (_highWaterMark < NJ_MAX_STACK_ENTRY - 1)
2189 NanoAssert(_entries[_highWaterMark+1] == BAD_ENTRY);
2190 _highWaterMark++;
2191 _entries[_highWaterMark] = ins;
2192 return _highWaterMark;
2195 else
2197 // Alloc a larger block on an 8-byte boundary.
2198 uint32_t const start = nStackSlots + (nStackSlots & 1);
2199 for (uint32_t i = start; i <= _highWaterMark; i += 2)
2201 if (isEmptyRange(i, nStackSlots))
2203 // place the entry in the table and mark the instruction with it
2204 for (uint32_t j=0; j < nStackSlots; j++)
2206 NanoAssert(i-j <= _highWaterMark);
2207 NanoAssert(_entries[i-j] == NULL);
2208 _entries[i-j] = ins;
2210 return i;
2214 // Be sure to account for any 8-byte-round-up when calculating spaceNeeded.
2215 uint32_t const spaceLeft = NJ_MAX_STACK_ENTRY - _highWaterMark - 1;
2216 uint32_t const spaceNeeded = nStackSlots + (_highWaterMark & 1);
2217 if (spaceLeft >= spaceNeeded)
2219 if (_highWaterMark & 1)
2221 NanoAssert(_entries[_highWaterMark+1] == BAD_ENTRY);
2222 _entries[_highWaterMark+1] = NULL;
2224 _highWaterMark += spaceNeeded;
2225 for (uint32_t j = 0; j < nStackSlots; j++)
2227 NanoAssert(_highWaterMark-j < NJ_MAX_STACK_ENTRY);
2228 NanoAssert(_entries[_highWaterMark-j] == BAD_ENTRY);
2229 _entries[_highWaterMark-j] = ins;
2231 return _highWaterMark;
2234 // no space. oh well.
2235 return 0;
2238 #ifdef _DEBUG
2239 void AR::checkForResourceLeaks() const
2241 for (uint32_t i = 1; i <= _highWaterMark; i++) {
2242 NanoAssertMsgf(_entries[i] == NULL, "frame entry %d wasn't freed\n",4*i);
2245 #endif
2247 uint32_t Assembler::arReserve(LIns* ins)
2249 uint32_t i = _activation.reserveEntry(ins);
2250 if (!i)
2251 setError(StackFull);
2252 return i;
2255 void Assembler::arFree(LIns* ins)
2257 NanoAssert(ins->isInAr());
2258 uint32_t arIndex = ins->getArIndex();
2259 NanoAssert(arIndex);
2260 NanoAssert(_activation.isValidEntry(arIndex, ins));
2261 _activation.freeEntryAt(arIndex); // free any stack space associated with the entry
2265 * Move regs around so the SavedRegs contains the highest priority regs.
2267 void Assembler::evictScratchRegsExcept(RegisterMask ignore)
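// Roughly three phases: (1) walk the active GpRegs outside 'ignore',
// evicting anything that can be cheaply rematerialized and pushing the rest
// onto a priority heap; (2) give the highest-priority survivors a SavedReg
// via findRegFor; (3) evict whatever remains outside SavedRegs and 'ignore'.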
2269 // Find the top GpRegs that are candidates to put in SavedRegs.
2271 // 'tosave' is a binary heap stored in an array. The root is tosave[0],
2272 // left child is at i+1, right child is at i+2.
2274 Register tosave[LastReg-FirstReg+1];
2275 int len=0;
2276 RegAlloc *regs = &_allocator;
2277 RegisterMask evict_set = regs->activeMask() & GpRegs & ~ignore;
2278 for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r)) {
2279 LIns *ins = regs->getActive(r);
2280 if (canRemat(ins)) {
2281 NanoAssert(ins->getReg() == r);
2282 evict(ins);
2284 else {
2285 int32_t pri = regs->getPriority(r);
2286 // add to heap by adding to end and bubbling up
2287 int j = len++;
2288 while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
2289 tosave[j] = tosave[j/2];
2290 j /= 2;
2292 NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
2293 tosave[j] = r;
2297 // Now the 'tosave' heap has the live exprs in priority order.
2298 // Allocate each of the top priority exprs to a SavedReg.
2300 RegisterMask allow = SavedRegs;
2301 while (allow && len > 0) {
2302 // get the highest priority var
2303 Register hi = tosave[0];
2304 if (!(rmask(hi) & SavedRegs)) {
2305 LIns *ins = regs->getActive(hi);
2306 Register r = findRegFor(ins, allow);
2307 allow &= ~rmask(r);
2309 else {
2310 // hi is already in a saved reg, leave it alone.
2311 allow &= ~rmask(hi);
2314 // remove from heap by replacing root with end element and bubbling down.
2315 if (allow && --len > 0) {
2316 Register last = tosave[len];
2317 int j = 0;
2318 while (j+1 < len) {
2319 int child = j+1;
2320 if (j+2 < len && regs->getPriority(tosave[j+2]) > regs->getPriority(tosave[j+1]))
2321 child++;
2322 if (regs->getPriority(last) > regs->getPriority(tosave[child]))
2323 break;
2324 tosave[j] = tosave[child];
2325 j = child;
2327 tosave[j] = last;
2331 // now evict everything else.
2332 evictSomeActiveRegs(~(SavedRegs | ignore));
2335 // Generate code to restore any registers in 'regs' that are currently active, and evict them from the current register state.
2336 void Assembler::evictSomeActiveRegs(RegisterMask regs)
2338 RegisterMask evict_set = regs & _allocator.activeMask();
2339 for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r))
2340 evict(_allocator.getActive(r));
2344 * Merge the current regstate with a previously stored version.
2346 * Situation Change to _allocator
2347 * --------- --------------------
2348 * !current & !saved none
2349 * !current & saved add saved
2350 * current & !saved evict current (unionRegisterState does nothing)
2351 * current & saved & current==saved none
2352 * current & saved & current!=saved evict current, add saved
2354 void Assembler::intersectRegisterState(RegAlloc& saved)
2356 Register regsTodo[LastReg + 1];
2357 LIns* insTodo[LastReg + 1];
2358 int nTodo = 0;
2360 // Do evictions and pops first.
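// (All conflicting registers must be freed before the reassignment pass
// below, since a saved instruction may need a register that is still held
// by a different instruction in the current state.)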
2361 verbose_only(bool shouldMention=false; )
2362 // The obvious thing to do here is to iterate from FirstReg to LastReg.
2363 // However, on ARM that causes lower-numbered integer registers
2364 // to be saved at higher addresses, which inhibits the formation
2365 // of load/store multiple instructions. Hence iterate the loop the
2366 // other way.
2367 RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
2368 for (Register r = msReg(reg_set); reg_set; r = nextMsReg(reg_set, r))
2370 LIns* curins = _allocator.getActive(r);
2371 LIns* savedins = saved.getActive(r);
2372 if (curins != savedins)
2374 if (savedins) {
2375 regsTodo[nTodo] = r;
2376 insTodo[nTodo] = savedins;
2377 nTodo++;
2379 if (curins) {
2380 //_nvprof("intersect-evict",1);
2381 verbose_only( shouldMention=true; )
2382 NanoAssert(curins->getReg() == r);
2383 evict(curins);
2386 #ifdef NANOJIT_IA32
2387 if (savedins && r == FST0) {
2388 verbose_only( shouldMention=true; )
2389 FSTP(FST0);
2391 #endif
2394 // Now reassign mainline registers.
2395 for (int i = 0; i < nTodo; i++) {
2396 findSpecificRegFor(insTodo[i], regsTodo[i]);
2398 verbose_only(
2399 if (shouldMention)
2400 verbose_outputf("## merging registers (intersect) with existing edge");
2405 * Merge the current state of the registers with a previously stored version.
2407 * Situation Change to _allocator
2408 * --------- --------------------
2409 * !current & !saved none
2410 * !current & saved add saved
2411 * current & !saved none (intersectRegisterState evicts current)
2412 * current & saved & current==saved none
2413 * current & saved & current!=saved evict current, add saved
2415 void Assembler::unionRegisterState(RegAlloc& saved)
2417 Register regsTodo[LastReg + 1];
2418 LIns* insTodo[LastReg + 1];
2419 int nTodo = 0;
2421 // Do evictions and pops first.
2422 verbose_only(bool shouldMention=false; )
2423 RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
2424 for (Register r = lsReg(reg_set); reg_set; r = nextLsReg(reg_set, r))
2426 LIns* curins = _allocator.getActive(r);
2427 LIns* savedins = saved.getActive(r);
2428 if (curins != savedins)
2430 if (savedins) {
2431 regsTodo[nTodo] = r;
2432 insTodo[nTodo] = savedins;
2433 nTodo++;
2435 if (curins && savedins) {
2436 //_nvprof("union-evict",1);
2437 verbose_only( shouldMention=true; )
2438 NanoAssert(curins->getReg() == r);
2439 evict(curins);
2442 #ifdef NANOJIT_IA32
2443 if (r == FST0) {
2444 if (savedins) {
2445 // Discard top of x87 stack.
2446 FSTP(FST0);
2448 else if (curins) {
2449 // Saved state did not have fpu reg allocated,
2450 // so we must evict here to keep x87 stack balanced.
2451 evict(curins);
2453 verbose_only( shouldMention=true; )
2455 #endif
2458 // Now reassign mainline registers.
2459 for (int i = 0; i < nTodo; i++) {
2460 findSpecificRegFor(insTodo[i], regsTodo[i]);
2462 verbose_only(
2463 if (shouldMention)
2464 verbose_outputf("## merging registers (union) with existing edge");
2468 // Scan table for instruction with the lowest priority, meaning it is used
2469 // furthest in the future.
2470 LIns* Assembler::findVictim(RegisterMask allow)
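// Rematerializable instructions are treated as priority 0, making them the
// preferred victims: they can be recomputed rather than reloaded from a
// stack slot.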
2472 NanoAssert(allow);
2473 LIns *ins, *vic = 0;
2474 int allow_pri = 0x7fffffff;
2475 RegisterMask vic_set = allow & _allocator.activeMask();
2476 for (Register r = lsReg(vic_set); vic_set; r = nextLsReg(vic_set, r))
2478 ins = _allocator.getActive(r);
2479 int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
2480 if (!vic || pri < allow_pri) {
2481 vic = ins;
2482 allow_pri = pri;
2485 NanoAssert(vic != 0);
2486 return vic;
2489 #ifdef NJ_VERBOSE
2490 char Assembler::outline[8192];
2491 char Assembler::outlineEOL[512];
2493 void Assembler::output()
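// Flush the current line: append any pending end-of-line annotation, then
// either stash the line in _outputCache (when output is being captured) or
// print it immediately, and finally reset both line buffers.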
2495 // The +1 is for the terminating NUL char.
2496 VMPI_strncat(outline, outlineEOL, sizeof(outline)-(strlen(outline)+1));
2498 if (_outputCache) {
2499 char* str = new (alloc) char[VMPI_strlen(outline)+1];
2500 VMPI_strcpy(str, outline);
2501 _outputCache->insert(str);
2502 } else {
2503 _logc->printf("%s\n", outline);
2506 outline[0] = '\0';
2507 outlineEOL[0] = '\0';
2510 void Assembler::outputf(const char* format, ...)
2512 va_list args;
2513 va_start(args, format);
2515 outline[0] = '\0';
2516 vsprintf(outline, format, args);
2517 output();
2520 void Assembler::setOutputForEOL(const char* format, ...)
2522 va_list args;
2523 va_start(args, format);
2525 outlineEOL[0] = '\0';
2526 vsprintf(outlineEOL, format, args);
2528 #endif // NJ_VERBOSE
2530 void LabelStateMap::add(LIns *label, NIns *addr, RegAlloc &regs) {
2531 LabelState *st = new (alloc) LabelState(addr, regs);
2532 labels.put(label, st);
2535 LabelState* LabelStateMap::get(LIns *label) {
2536 return labels.get(label);
2539 #endif /* FEATURE_NANOJIT */