hphp/runtime/vm/jit/translator-x64.cpp
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
16 #include "hphp/runtime/vm/jit/translator-x64.h"
18 #include <cinttypes>
19 #include <stdint.h>
20 #include <assert.h>
21 #include <unistd.h>
22 #include <sys/mman.h>
23 #include <strstream>
24 #include <stdio.h>
25 #include <stdarg.h>
26 #include <string>
27 #include <queue>
28 #include <unwind.h>
29 #include <unordered_set>
30 #include <signal.h>
31 #ifdef __FreeBSD__
32 #include <sys/ucontext.h>
33 #endif
35 #ifdef __FreeBSD__
36 #define RIP_REGISTER(v) (v).mc_rip
37 #elif defined(__APPLE__)
38 #define RIP_REGISTER(v) (v)->__ss.__rip
39 #elif defined(__x86_64__)
40 #define RIP_REGISTER(v) (v).gregs[REG_RIP]
41 #elif defined(__AARCH64EL__)
42 #define RIP_REGISTER(v) (v).pc
43 #else
44 #error How is rip accessed on this architecture?
45 #endif
47 #include <boost/bind.hpp>
48 #include <boost/optional.hpp>
49 #include <boost/utility/typed_in_place_factory.hpp>
50 #include <boost/range/adaptors.hpp>
51 #include <boost/scoped_ptr.hpp>
53 #include "folly/Format.h"
55 #include "hphp/util/asm-x64.h"
56 #include "hphp/util/bitops.h"
57 #include "hphp/util/debug.h"
58 #include "hphp/util/disasm.h"
59 #include "hphp/util/maphuge.h"
60 #include "hphp/util/rank.h"
61 #include "hphp/util/ringbuffer.h"
62 #include "hphp/util/timer.h"
63 #include "hphp/util/trace.h"
64 #include "hphp/util/meta.h"
65 #include "hphp/util/process.h"
66 #include "hphp/util/util.h"
67 #include "hphp/util/repo_schema.h"
68 #include "hphp/util/cycles.h"
70 #include "hphp/runtime/vm/bytecode.h"
71 #include "hphp/runtime/vm/php-debug.h"
72 #include "hphp/runtime/vm/runtime.h"
73 #include "hphp/runtime/base/complex_types.h"
74 #include "hphp/runtime/base/execution_context.h"
75 #include "hphp/runtime/base/runtime_option.h"
76 #include "hphp/runtime/base/strings.h"
78 #include "hphp/runtime/server/source_root_info.h"
79 #include "hphp/runtime/base/zend-string.h"
80 #include "hphp/runtime/ext/ext_closure.h"
81 #include "hphp/runtime/ext/ext_continuation.h"
82 #include "hphp/runtime/ext/ext_function.h"
83 #include "hphp/runtime/vm/debug/debug.h"
84 #include "hphp/runtime/base/stats.h"
85 #include "hphp/runtime/vm/pendq.h"
86 #include "hphp/runtime/vm/treadmill.h"
87 #include "hphp/runtime/vm/repo.h"
88 #include "hphp/runtime/vm/type-profile.h"
89 #include "hphp/runtime/vm/member-operations.h"
90 #include "hphp/runtime/vm/jit/abi-x64.h"
91 #include "hphp/runtime/vm/jit/check.h"
92 #include "hphp/runtime/vm/jit/code-gen.h"
93 #include "hphp/runtime/vm/jit/hhbc-translator.h"
94 #include "hphp/runtime/vm/jit/ir-translator.h"
95 #include "hphp/runtime/vm/jit/normalized-instruction.h"
96 #include "hphp/runtime/vm/jit/opt.h"
97 #include "hphp/runtime/vm/jit/print.h"
98 #include "hphp/runtime/vm/jit/region-selection.h"
99 #include "hphp/runtime/vm/jit/srcdb.h"
100 #include "hphp/runtime/vm/jit/target-cache.h"
101 #include "hphp/runtime/vm/jit/tracelet.h"
102 #include "hphp/runtime/vm/jit/translator-inline.h"
103 #include "hphp/runtime/vm/jit/unwind-x64.h"
104 #include "hphp/runtime/vm/jit/x64-util.h"
106 #include "hphp/runtime/vm/jit/translator-x64-internal.h"
108 namespace HPHP {
109 namespace Transl {
111 using namespace reg;
112 using namespace Util;
113 using namespace Trace;
114 using std::max;
116 #define TRANS_PERF_COUNTERS \
117 TPC(translate) \
118 TPC(retranslate) \
119 TPC(interp_bb) \
120 TPC(interp_instr) \
121 TPC(interp_one) \
122 TPC(max_trans) \
123 TPC(enter_tc) \
124 TPC(service_req)
126 static const char* const kInstrCountTx64Name = "instr_tx64";
127 static const char* const kInstrCountIRName = "instr_hhir";
129 #define TPC(n) "trans_" #n,
130 static const char* const kPerfCounterNames[] = {
131 TRANS_PERF_COUNTERS
132 kInstrCountTx64Name,
133 kInstrCountIRName,
135 #undef TPC
137 #define TPC(n) tpc_ ## n,
138 enum TransPerfCounter {
139 TRANS_PERF_COUNTERS
140 tpc_num_counters
142 #undef TPC
143 static __thread int64_t s_perfCounters[tpc_num_counters];
144 #define INC_TPC(n) ++s_perfCounters[tpc_ ## n];
146 // nextTx64: Global shared state. The tx64 that should be used for
147 // new requests going forward.
148 TranslatorX64* volatile nextTx64;
149 // tx64: Thread-local state. The tx64 we're using for the current request.
150 __thread TranslatorX64* tx64;
152 // Register dirtiness: thread-private.
153 __thread VMRegState tl_regState = VMRegState::CLEAN;
155 static StaticString s___call(LITSTR_INIT("__call"));
156 static StaticString s___callStatic(LITSTR_INIT("__callStatic"));
158 // Initialize at most this many locals inline in function body prologue; more
159 // than this, and emitting a loop is more compact. To be precise, the actual
160 // crossover point in terms of code size is 6; 9 was determined by experiment to
161 // be the optimal point in certain benchmarks. #microoptimization
162 static const int kLocalsToInitializeInline = 9;
164 // An intentionally funny-looking-in-core-dumps constant for uninitialized
165 // instruction pointers.
166 static const uint64_t kUninitializedRIP = 0xba5eba11acc01ade;
168 // stubBlock --
169 // Used to emit a bunch of outlined code that is unconditionally jumped to.
170 template <typename L>
171 void stubBlock(X64Assembler& hot, X64Assembler& cold, const L& body) {
172 hot. jmp(cold.frontier());
173 guardDiamond(cold, body);
174 cold. jmp(hot.frontier());
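// Rough layout sketch (illustrative only): the hot assembler gets just the
// forward jmp; whatever guardDiamond(cold, body) emits lands in the cold
// assembler, which then jumps back to the instruction following the hot jmp:
//
//   hot:  jmp cold_start          cold_start:  <outlined code ...>
//         <hot code continues>                 jmp <hot code continues>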
177 static bool
178 typeCanBeStatic(DataType t) {
179 return t != KindOfObject && t != KindOfResource && t != KindOfRef;
182 // IfCountNotStatic --
183 // Emits if (%reg->_count != RefCountStaticValue) { ... }.
184 //   May omit the check entirely when the type is known to never
185 //   be static (objects, resources, and refs).
186 struct IfCountNotStatic {
187 typedef CondBlock<FAST_REFCOUNT_OFFSET,
188 RefCountStaticValue,
189 CC_Z,
190 field_type(RefData, m_count)> NonStaticCondBlock;
191 NonStaticCondBlock *m_cb; // might be null
192 IfCountNotStatic(X64Assembler& a,
193 PhysReg reg,
194 DataType t = KindOfInvalid) {
195 // Objects, resources, and refs (variants) cannot be static
196 if (typeCanBeStatic(t)) {
197 m_cb = new NonStaticCondBlock(a, reg);
198 } else {
199 m_cb = nullptr;
203 ~IfCountNotStatic() {
204 delete m_cb;
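// Typical use, as in emitIncRef() below -- the guarded operation runs only
// when the count is not the static sentinel (or unconditionally when the
// type can never be static and no check is emitted):
//
//   { IfCountNotStatic ins(a, base, dtype);
//     a.incl(base[FAST_REFCOUNT_OFFSET]);
//   } // destructor closes the conditional block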
208 // Segfault handler: figure out if it's an intentional segfault
209 // (timeout exception) and if so, act appropriately. Otherwise, pass
210 // the signal on.
211 void TranslatorX64::SEGVHandler(int signum, siginfo_t *info, void *ctx) {
212 TranslatorX64 *self = Get();
213 void *surprisePage =
214 ThreadInfo::s_threadInfo->m_reqInjectionData.surprisePage;
215 if (info->si_addr == surprisePage) {
216 ucontext_t *ucontext = (ucontext_t*)ctx;
217 TCA rip = (TCA)RIP_REGISTER(ucontext->uc_mcontext);
218 SignalStubMap::const_accessor a;
219 if (!self->m_segvStubs.find(a, rip)) {
220 NOT_REACHED();
222 TCA astubsCall = a->second;
224 // When this handler returns, "call" the astubs code for this
225 // surprise check.
226 RIP_REGISTER(ucontext->uc_mcontext) = (uintptr_t)astubsCall;
228 // We've processed this event; reset the page in case execution
229 // continues normally.
230 g_vmContext->m_stack.unprotect();
231 } else {
232 sig_t handler = (sig_t)self->m_segvChain;
233 if (handler == SIG_DFL || handler == SIG_IGN) {
234 signal(signum, handler);
235 raise(signum);
236 } else {
237 self->m_segvChain(signum, info, ctx);
242 // Logical register move: ensures the value in src will be in dest
243 // after execution, but might do so in strange ways. Do not count on
244 // being able to smash dest to a different register in the future, e.g.
245 void
246 emitMovRegReg(X64Assembler& a, PhysReg src, PhysReg dest) {
247 SpaceRecorder("_RegMove", a);
248 if (src != dest) {
249 a. movq (src, dest);
253 void
254 emitLea(X64Assembler& a, PhysReg base, int disp, PhysReg dest) {
255 if (!disp) {
256 emitMovRegReg(a, base, dest);
257 return;
259 a. lea (base[disp], dest);
262 static void UNUSED tc_debug_print(const char* message,
263 uintptr_t r1,
264 uintptr_t r2,
265 uintptr_t r3,
266 ActRec* fp) {
267 TRACE(1, "*********************** %s: %p %p %p (for : %s)\n",
268 message, (void*)r1, (void*)r2, (void*)r3,
269 fp->m_func ? fp->m_func->fullName()->data() : "[?]");
272 // Debugging utility for translations: prints a message, followed
273 // by the values of up to three registers.
274 void TranslatorX64::emitDebugPrint(Asm& a,
275 const char* message,
276 PhysReg r1,
277 PhysReg r2,
278 PhysReg r3) {
279 boost::optional<PhysRegSaver> aSaver;
280 boost::optional<PhysRegSaverStub> astubsSaver;
282 if (&a == &this->a) {
283 aSaver = boost::in_place<PhysRegSaver>(boost::ref(a), kAllX64Regs);
284 } else {
285 astubsSaver = boost::in_place<PhysRegSaverStub>(boost::ref(a),
286 kAllX64Regs);
289 a. mov_imm64_reg (uintptr_t(message), argNumToRegName[0]);
290 a. mov_reg64_reg64(r1, argNumToRegName[1]);
291 a. mov_reg64_reg64(r2, argNumToRegName[2]);
292 a. mov_reg64_reg64(r3, argNumToRegName[3]);
293 a. mov_reg64_reg64(rVmFp, argNumToRegName[4]);
294 a. call((TCA)tc_debug_print);
297 void
298 TranslatorX64::emitRB(X64Assembler& a,
299 Trace::RingBufferType t,
300 SrcKey sk, RegSet toSave) {
301 if (!Trace::moduleEnabledRelease(Trace::tx64, 3)) {
302 return;
304 PhysRegSaver rs(a, toSave | kSpecialCrossTraceRegs);
305 int arg = 0;
306 a. emitImmReg(t, argNumToRegName[arg++]);
307 a. emitImmReg(sk.getFuncId(), argNumToRegName[arg++]);
308 a. emitImmReg(sk.offset(), argNumToRegName[arg++]);
309 a. call((TCA)ringbufferEntry);
312 void
313 TranslatorX64::emitRB(X64Assembler& a,
314 Trace::RingBufferType t,
315 const char* msg,
316 RegSet toSave) {
317 if (!Trace::moduleEnabledRelease(Trace::tx64, 3)) {
318 return;
320 PhysRegSaver save(a, toSave | kSpecialCrossTraceRegs);
321 int arg = 0;
322 a. emitImmReg((uintptr_t)msg, argNumToRegName[arg++]);
323 a. emitImmReg(strlen(msg), argNumToRegName[arg++]);
324 a. emitImmReg(t, argNumToRegName[arg++]);
325 a. call((TCA)ringbufferMsg);
328 void
329 TranslatorX64::emitCall(X64Assembler& a, TCA dest) {
330 if (a.jmpDeltaFits(dest) && !Stats::enabled()) {
331 a. call(dest);
332 } else {
333 a. call(getNativeTrampoline(dest));
337 void
338 TranslatorX64::emitCall(X64Assembler& a, CppCall call) {
339 if (call.isDirect()) {
340 return emitCall(a, (TCA)call.getAddress());
342 // Virtual call.
343 // Load method's address from proper offset off of object in rdi,
344 // using rax as scratch.
345 a.loadq(*rdi, rax);
346 a.call(rax[call.getOffset()]);
349 static void emitGetGContext(X64Assembler& a, PhysReg dest) {
350 emitTLSLoad<ExecutionContext>(a, g_context, dest);
353 void
354 TranslatorX64::emitEagerSyncPoint(X64Assembler& a, const Opcode* pc,
355 const Offset spDiff) {
356 static COff spOff = offsetof(VMExecutionContext, m_stack) +
357 Stack::topOfStackOffset();
358 static COff fpOff = offsetof(VMExecutionContext, m_fp);
359 static COff pcOff = offsetof(VMExecutionContext, m_pc);
361 /* we can't use rAsm because the pc store uses it as a
362 temporary */
363 Reg64 rEC = reg::rdi;
365 a. push(rEC);
366 emitGetGContext(a, rEC);
367 a. storeq(rVmFp, rEC[fpOff]);
368 if (spDiff) {
369 a. lea(rVmSp[spDiff], rAsm);
370 a. storeq(rAsm, rEC[spOff]);
371 } else {
372 a. storeq(rVmSp, rEC[spOff]);
374 a. storeq(pc, rEC[pcOff]);
375 a. pop(rEC);
378 void
379 TranslatorX64::recordSyncPoint(X64Assembler& a, Offset pcOff, Offset spOff) {
380 m_pendingFixups.push_back(PendingFixup(a.frontier(), Fixup(pcOff, spOff)));
383 void
384 TranslatorX64::recordIndirectFixup(CTCA addr, int dwordsPushed) {
385 m_fixupMap.recordIndirectFixup(
386 a.frontier(), IndirectFixup((2 + dwordsPushed) * 8));
389 void
390 TranslatorX64::emitIncRef(PhysReg base, DataType dtype) {
391 emitIncRef(a, base, dtype);
394 void
395 TranslatorX64::emitIncRef(X64Assembler &a, PhysReg base, DataType dtype) {
396 if (!IS_REFCOUNTED_TYPE(dtype) && dtype != KindOfInvalid) {
397 return;
399 SpaceRecorder sr("_IncRef", a);
400 static_assert(sizeof(RefCount) == sizeof(int32_t), "");
401 { // if !static then
402 IfCountNotStatic ins(a, base, dtype);
404 * The optimization guide cautions against using inc; while it is
405 * compact, it does not update the carry flag, creating a partial
406 * flags-register dependency for any downstream flags-dependent code.
408 a. incl(base[FAST_REFCOUNT_OFFSET]);
409 } // endif
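// Sketch of the emitted sequence (the compare/branch comes from the
// CondBlock underlying IfCountNotStatic; exact encoding may differ):
//   cmpl  $RefCountStaticValue, FAST_REFCOUNT_OFFSET(%base)
//   je    skip
//   incl  FAST_REFCOUNT_OFFSET(%base)
// skip: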
412 void
413 TranslatorX64::emitIncRefGenericRegSafe(PhysReg base,
414 int disp,
415 PhysReg tmpReg) {
416 { // if RC
417 IfRefCounted irc(a, base, disp);
418 a. load_reg64_disp_reg64(base, disp + TVOFF(m_data),
419 tmpReg);
420 { // if !static
421 IfCountNotStatic ins(a, tmpReg);
422 a. incl(tmpReg[FAST_REFCOUNT_OFFSET]);
423 } // endif
424 } // endif
427 // emitEagerVMRegSave --
428 // Inline. Saves regs in-place in the TC. This is an unusual need;
429 // you probably want to lazily save these regs via recordCall and
430 // its ilk.
432 // SaveFP uses rVmFp, as usual. SavePC requires the caller to have
433 // placed the PC offset of the instruction about to be executed in
434 // rdi.
435 enum RegSaveFlags {
436 SaveFP = 1,
437 SavePC = 2
440 static TCA
441 emitEagerVMRegSave(X64Assembler& a,
442 int flags /* :: RegSaveFlags */) {
443 TCA start = a.frontier();
444 bool saveFP = bool(flags & SaveFP);
445 bool savePC = bool(flags & SavePC);
446 assert((flags & ~(SavePC | SaveFP)) == 0);
448 Reg64 pcReg = rdi;
449 PhysReg rEC = rAsm;
450 assert(!kSpecialCrossTraceRegs.contains(rdi));
452 emitGetGContext(a, rEC);
454 static COff spOff = offsetof(VMExecutionContext, m_stack) +
455 Stack::topOfStackOffset();
456 static COff fpOff = offsetof(VMExecutionContext, m_fp) - spOff;
457 static COff pcOff = offsetof(VMExecutionContext, m_pc) - spOff;
459 assert(spOff != 0);
460 a. addq (spOff, r64(rEC));
461 a. storeq (rVmSp, *rEC);
462 if (savePC) {
463 // We're going to temporarily abuse rVmSp to hold the current unit.
464 Reg64 rBC = rVmSp;
465 a. push (rBC);
466 // m_fp -> m_func -> m_unit -> m_bc + pcReg
467 a. loadq (rVmFp[AROFF(m_func)], rBC);
468 a. loadq (rBC[Func::unitOff()], rBC);
469 a. loadq (rBC[Unit::bcOff()], rBC);
470 a. addq (rBC, pcReg);
471 a. storeq (pcReg, rEC[pcOff]);
472 a. pop (rBC);
474 if (saveFP) {
475 a. storeq (rVmFp, rEC[fpOff]);
477 return start;
480 CppCall TranslatorX64::getDtorCall(DataType type) {
481 switch (type) {
482 case BitwiseKindOfString:
483 return CppCall(getMethodPtr(&StringData::release));
484 case KindOfArray:
485 return CppCall(getMethodPtr(&ArrayData::release));
486 case KindOfObject:
487 return CppCall(getMethodPtr(&ObjectData::release));
488 case KindOfResource:
489 return CppCall(getMethodPtr(&ResourceData::release));
490 case KindOfRef:
491 return CppCall(getMethodPtr(&RefData::release));
492 default:
493 assert(false);
494 NOT_REACHED();
499 * callDestructor/jumpDestructor --
501 * Emit a call or jump to the appropriate destructor for a dynamically
502 * typed value.
504 * No registers are saved; most translated code should be using
505 * emitDecRefGeneric{Reg,} instead of this.
507 * Inputs:
509 * - typeReg is destroyed and may not be argNumToRegName[0].
510 * - argNumToRegName[0] should contain the m_data for this value.
511 * - scratch is destroyed.
514 static IndexedMemoryRef lookupDestructor(X64Assembler& a,
515 PhysReg typeReg,
516 PhysReg scratch) {
517 assert(typeReg != r32(argNumToRegName[0]));
518 assert(scratch != argNumToRegName[0]);
520 static_assert((BitwiseKindOfString >> kShiftDataTypeToDestrIndex == 1) &&
521 (KindOfArray >> kShiftDataTypeToDestrIndex == 2) &&
522 (KindOfObject >> kShiftDataTypeToDestrIndex == 3) &&
523 (KindOfResource >> kShiftDataTypeToDestrIndex == 4) &&
524 (KindOfRef >> kShiftDataTypeToDestrIndex == 5),
525 "lookup of destructors depends on KindOf* values");
527 a. shrl (kShiftDataTypeToDestrIndex, r32(typeReg));
528 a. movq (&g_destructors, scratch);
529 return scratch[typeReg*8];
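// Worked example of the lookup: for a KindOfArray value the shrl turns the
// type tag into index 2 (per the static_assert above), so with scratch
// holding &g_destructors the returned operand addresses g_destructors[2]
// (the array destructor; compare getDtorCall() above).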
532 static void callDestructor(X64Assembler& a,
533 PhysReg typeReg,
534 PhysReg scratch) {
535 a. call (lookupDestructor(a, typeReg, scratch));
538 static void jumpDestructor(X64Assembler& a,
539 PhysReg typeReg,
540 PhysReg scratch) {
541 a. jmp (lookupDestructor(a, typeReg, scratch));
544 void TranslatorX64::emitGenericDecRefHelpers() {
545 Label release;
547 // m_dtorGenericStub just takes a pointer to the TypedValue in rdi.
548 moveToAlign(a, kNonFallthroughAlign);
549 m_irPopRHelper = a.frontier();
550 // popR: Move top-of-stack pointer to rdi
551 emitMovRegReg(a, rVmSp, rdi);
552 // fall through
553 m_dtorGenericStub = a.frontier();
554 emitLoadTVType(a, rdi[TVOFF(m_type)], r32(rAsm));
555 a. loadq (rdi[TVOFF(m_data)], rdi);
556 // Fall through to the regs stub.
559 * Custom calling convention: m_type goes in rAsm, m_data in
560 * rdi. We don't ever store program locations in rAsm, so the
561 * caller didn't need to spill anything. The assembler sometimes
562 * uses rAsm, but we know the stub won't need to and it makes it
563 * possible to share the code for both decref helpers.
565 m_dtorGenericStubRegs = a.frontier();
566 a. cmpl (RefCountStaticValue, rdi[FAST_REFCOUNT_OFFSET]);
567 jccBlock<CC_Z>(a, [&] {
568 a. decl (rdi[FAST_REFCOUNT_OFFSET]);
569 release.jcc8(a, CC_Z);
571 a. ret ();
573 asm_label(a, release);
575 PhysRegSaver prs(a, kGPCallerSaved - RegSet(rdi));
576 callDestructor(a, rAsm, rax);
577 recordIndirectFixup(a.frontier(), prs.rspTotalAdjustmentRegs());
579 a. ret ();
581 TRACE(1, "HOTSTUB: generic dtor start: %lx\n",
582 uintptr_t(m_irPopRHelper));
583 TRACE(1, "HOTSTUB: genericDtorStub: %lx\n", uintptr_t(m_dtorGenericStub));
584 TRACE(1, "HOTSTUB: genericDtorStubRegs: %lx\n",
585 uintptr_t(m_dtorGenericStubRegs));
586 TRACE(1, "HOTSTUB: total dtor generic stubs %zu bytes\n",
587 size_t(a.frontier() - m_dtorGenericStub));
590 bool TranslatorX64::profileSrcKey(const SrcKey& sk) const {
591 if (!RuntimeOption::EvalJitPGO) return false;
593 if (RuntimeOption::EvalJitPGOHotOnly && !(sk.func()->attrs() & AttrHot)) {
594 return false;
597 if (profData()->optimized(sk)) return false;
599 // The TCA of closure bodies is stored in the func's prologue
600 // tables. So, to support retranslating them, we need to reset the
601 // prologue tables and the prologue cache appropriately.
602 // (test/quick/floatcmp.php exposes this problem)
603 if (sk.func()->isClosureBody()) return false;
605 return true;
608 TCA TranslatorX64::retranslate(const TranslArgs& args) {
609 if (isDebuggerAttachedProcess() && isSrcKeyInBL(args.m_sk)) {
610 // We are about to translate something known to be blacklisted by
611 // the debugger; exit early.
612 SKTRACE(1, args.m_sk, "retranslate abort due to debugger\n");
613 return nullptr;
615 LeaseHolder writer(s_writeLease);
616 if (!writer) return nullptr;
617 SKTRACE(1, args.m_sk, "retranslate\n");
618 if (m_mode == TransInvalid) {
619 m_mode = profileSrcKey(args.m_sk) ? TransProfile : TransLive;
621 return translate(args);
624 // The only use comes from HHIR's cgExitTrace(), case TraceExitType::SlowNoProgress.
625 TCA TranslatorX64::retranslateAndPatchNoIR(SrcKey sk,
626 bool align,
627 TCA toSmash) {
628 if (isDebuggerAttachedProcess() && isSrcKeyInBL(sk)) {
629 // We are about to translate something known to be blacklisted by
630 // the debugger; exit early.
631 SKTRACE(1, sk, "retranslateAndPatchNoIR abort due to debugger\n");
632 return nullptr;
634 LeaseHolder writer(s_writeLease);
635 if (!writer) return nullptr;
636 SKTRACE(1, sk, "retranslateAndPatchNoIR\n");
637 SrcRec* srcRec = getSrcRec(sk);
638 if (srcRec->translations().size() ==
639 RuntimeOption::EvalJitMaxTranslations + 1) {
640 // we've gone over the translation limit and already have an anchor
641 // translation that will interpret, so just return NULL and force
642 // interpretation of this BB.
643 return nullptr;
645 m_mode = TransLive;
646 TCA start = translate(TranslArgs(sk, align).interp(true));
647 if (start != nullptr) {
648 smashJmp(getAsmFor(toSmash), toSmash, start);
650 return start;
653 TCA TranslatorX64::retranslateOpt(TransID transId, bool align) {
654 LeaseHolder writer(s_writeLease);
655 if (!writer) return nullptr;
657 TRACE(1, "retranslateOpt: transId = %u\n", transId);
659 Func* func = nullptr;
660 if (m_profData->transBlock(transId) == nullptr) {
661 // This can happen for profiling translations that have some
662 // feature not supported by translateRegion yet. For such translations,
663 // we don't have a Func* (since it's grabbed from the Block).
664 // Anyway, in this case, the region translator resorts to generating a
665 // TransLive translation, corresponding to the current live VM context.
666 func = const_cast<Func*>(liveFunc());
667 } else {
668 func = m_profData->transFunc(transId);
671 // We may get here multiple times because different translations of
672 // the same SrcKey hit the optimization threshold. Only the first
673 // time around do we want to invalidate the existing translations.
674 const SrcKey& sk = m_profData->transSrcKey(transId);
675 bool alreadyOptimized = m_profData->optimized(sk);
676 m_profData->setOptimized(sk);
678 bool setFuncBody = (!alreadyOptimized &&
679 func->base() == sk.offset() &&
680 func->getDVFunclets().size() == 0);
682 if (!alreadyOptimized) {
683 if (setFuncBody) func->setFuncBody((TCA)funcBodyHelperThunk);
684 invalidateSrcKey(sk);
685 } else {
686 // Bail if we already reached the maximum number of translations per SrcKey.
687 // Note that this can only happen with multi-threading.
688 SrcRec* srcRec = getSrcRec(sk);
689 assert(srcRec);
690 size_t nTrans = srcRec->translations().size();
691 if (nTrans >= RuntimeOption::EvalJitMaxTranslations + 1) return nullptr;
694 m_mode = TransOptimize;
695 auto translArgs = TranslArgs(sk, align).transId(transId);
696 if (setFuncBody) translArgs.setFuncBody();
698 return retranslate(translArgs);
702 * Satisfy an alignment constraint. If we're in a reachable section
703 * of code, bridge the gap with nops. Otherwise, int3's.
705 void
706 TranslatorX64::moveToAlign(X64Assembler &aa,
707 const size_t align /* =kJmpTargetAlign */,
708 bool unreachable /* =true */) {
709 using namespace HPHP::Util;
710 SpaceRecorder sr("_Align", aa);
711 assert(isPowerOfTwo(align));
712 size_t leftInBlock = align - ((align - 1) & uintptr_t(aa.frontier()));
713 if (leftInBlock == align) return;
714 if (unreachable) {
715 if (leftInBlock > 2) {
716 aa.ud2();
717 leftInBlock -= 2;
719 if (leftInBlock > 0) {
720 aa.emitInt3s(leftInBlock);
722 return;
724 aa.emitNop(leftInBlock);
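// Example, assuming align == 16: with the frontier at an address ending in
// 0x9, leftInBlock is 7. In unreachable code that becomes a 2-byte ud2
// followed by five int3 bytes; in reachable code it is emitNop(7).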
728 * Req machinery. We sometimes emit code that is unable to proceed
729 * without translator assistance; e.g., a basic block whose successor is
730 * unknown. We leave one of these request arg blobs in m_data, and point
731 * to it at callout-time.
734 // REQ_BIND_CALL
735 struct ReqBindCall {
736 SrcKey m_sourceInstr;
737 TCA m_toSmash;
738 int m_nArgs;
739 bool m_isImmutable; // call was to known func.
740 } m_bindCall;
742 // ID to name mapping for tracing.
743 static inline const char*
744 reqName(int req) {
745 static const char* reqNames[] = {
746 #define REQ(nm) #nm,
747 SERVICE_REQUESTS
748 #undef REQ
750 return reqNames[req];
754 * Find or create a translation for sk. Returns TCA of "best" current
755 * translation. May return NULL if it is currently impossible to create
756 * a translation.
759 TranslatorX64::getTranslation(const TranslArgs& args) {
760 auto sk = args.m_sk;
761 sk.func()->validate();
762 SKTRACE(2, sk,
763 "getTranslation: curUnit %s funcId %x offset %d\n",
764 sk.unit()->filepath()->data(),
765 sk.getFuncId(),
766 sk.offset());
767 SKTRACE(2, sk, " funcId: %x \n", sk.func()->getFuncId());
769 if (Translator::liveFrameIsPseudoMain()) {
770 SKTRACE(2, sk, "punting on pseudoMain\n");
771 return nullptr;
773 if (const SrcRec* sr = m_srcDB.find(sk)) {
774 TCA tca = sr->getTopTranslation();
775 if (tca) {
776 SKTRACE(2, sk, "getTranslation: found %p\n", tca);
777 return tca;
780 return createTranslation(args);
784 TranslatorX64::numTranslations(SrcKey sk) const {
785 if (const SrcRec* sr = m_srcDB.find(sk)) {
786 return sr->translations().size();
788 return 0;
791 static void populateLiveContext(JIT::RegionContext& ctx) {
792 typedef JIT::RegionDesc::Location L;
794 const ActRec* const fp {g_vmContext->getFP()};
795 const TypedValue* const sp {g_vmContext->getStack().top()};
797 for (uint32_t i = 0; i < fp->m_func->numLocals(); ++i) {
798 ctx.liveTypes.push_back(
799 { L::Local{i}, JIT::liveTVType(frame_local(fp, i)) }
803 uint32_t stackOff = 0;
804 visitStackElems(
805 fp, sp, ctx.bcOffset,
806 [&](const ActRec* ar) {
807 // TODO(#2466980): when it's a Cls, we should pass the Class* in
808 // the Type.
809 using JIT::Type;
810 auto const objOrCls =
811 ar->hasThis() ? Type::Obj.specialize(ar->getThis()->getVMClass()) :
812 ar->hasClass() ? Type::Cls
813 : Type::Nullptr;
815 ctx.preLiveARs.push_back(
816 { stackOff,
817 ar->m_func,
818 objOrCls
821 FTRACE(2, "added prelive ActRec {}\n", show(ctx.preLiveARs.back()));
823 stackOff += kNumActRecCells;
825 [&](const TypedValue* tv) {
826 ctx.liveTypes.push_back(
827 { L::Stack{stackOff++}, JIT::liveTVType(tv) }
829 FTRACE(2, "added live type {}\n", show(ctx.liveTypes.back()));
835 TranslatorX64::createTranslation(const TranslArgs& args) {
837 * Try to become the writer. We delay this until we *know* we will have
838 * a need to create new translations, instead of just trying to win the
839 * lottery at the dawn of time. Hopefully lots of requests won't require
840 * any new translation.
842 auto retransl = [&] {
843 return retranslate(args);
845 auto sk = args.m_sk;
846 LeaseHolder writer(s_writeLease);
847 if (!writer) return nullptr;
849 if (SrcRec* sr = m_srcDB.find(sk)) {
850 TCA tca = sr->getTopTranslation();
851 if (tca) {
852 // Handle extremely unlikely race; someone may have just already
853 // added the first instance of this SrcRec while we did a
854 // non-blocking wait on the write lease.
855 return tca;
856 } else {
857 // Since we are holding the write lease, we know that sk is properly
858 // initialized, except that it has no translations (due to
859 // replaceOldTranslations)
860 return retransl();
864 // We put retranslate requests at the end of our slab to more frequently
865 // allow conditional jump fall-throughs
866 TCA astart = a.frontier();
867 TCA stubstart = astubs.frontier();
868 TCA req = emitServiceReq(REQ_RETRANSLATE, sk.offset());
869 SKTRACE(1, sk, "inserting anchor translation for (%p,%d) at %p\n",
870 sk.unit(), sk.offset(), req);
871 SrcRec* sr = m_srcDB.insert(sk);
872 sr->setFuncInfo(sk.func());
873 sr->setAnchorTranslation(req);
875 size_t asize = a.frontier() - astart;
876 size_t stubsize = astubs.frontier() - stubstart;
877 assert(asize == 0);
878 if (stubsize && RuntimeOption::EvalDumpTCAnchors) {
879 addTranslation(TransRec(sk, sk.unit()->md5(), TransAnchor,
880 astart, asize, stubstart, stubsize));
881 if (m_profData) {
882 m_profData->addTransAnchor(sk);
884 assert(!isTransDBEnabled() || getTransRec(stubstart)->kind == TransAnchor);
887 return retransl();
891 TranslatorX64::lookupTranslation(SrcKey sk) const {
892 if (SrcRec* sr = m_srcDB.find(sk)) {
893 return sr->getTopTranslation();
895 return nullptr;
899 TranslatorX64::translate(const TranslArgs& args) {
900 INC_TPC(translate);
901 assert(((uintptr_t)vmsp() & (sizeof(Cell) - 1)) == 0);
902 assert(((uintptr_t)vmfp() & (sizeof(Cell) - 1)) == 0);
903 assert(m_mode != TransInvalid);
904 SCOPE_EXIT{ m_mode = TransInvalid; };
906 if (!args.m_interp) {
907 if (m_numHHIRTrans == RuntimeOption::EvalJitGlobalTranslationLimit) {
908 RuntimeOption::EvalJit = false;
909 ThreadInfo::s_threadInfo->m_reqInjectionData.updateJit();
910 return nullptr;
914 Func* func = const_cast<Func*>(args.m_sk.func());
915 AsmSelector asmSel(AsmSelector::Args(this).profile(m_mode == TransProfile)
916 .hot(func->attrs() & AttrHot));
918 if (args.m_align) {
919 moveToAlign(a, kNonFallthroughAlign);
922 TCA start = a.frontier();
924 translateWork(args);
926 if (args.m_setFuncBody) {
927 func->setFuncBody(start);
929 SKTRACE(1, args.m_sk, "translate moved head from %p to %p\n",
930 getTopTranslation(args.m_sk), start);
931 return start;
935 * Returns true if the given current frontier can have an nBytes-long
936 * instruction written without any risk of cache-tearing.
938 bool isSmashable(Address frontier, int nBytes, int offset /* = 0 */) {
939 assert(nBytes <= int(kX64CacheLineSize));
940 uintptr_t iFrontier = uintptr_t(frontier) + offset;
941 uintptr_t lastByte = uintptr_t(frontier) + nBytes - 1;
942 return (iFrontier & ~kX64CacheLineMask) == (lastByte & ~kX64CacheLineMask);
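// Worked example, assuming the usual 64-byte x64 cache line: a 5-byte jmp
// whose first byte sits at an address ending in 0x3e spans 0x..3e-0x..42,
// crossing into the next line, so this returns false and the caller should
// pad first (see prepareForSmash below).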
946 * Call before emitting a test-jcc sequence. Inserts a nop gap such that after
947 * writing a testBytes-long instruction, the frontier will be smashable.
949 void prepareForTestAndSmash(Asm& a, int testBytes, TestAndSmashFlags flags) {
950 switch (flags) {
951 case TestAndSmashFlags::kAlignJcc:
952 prepareForSmash(a, testBytes + kJmpccLen, testBytes);
953 assert(isSmashable(a.frontier() + testBytes, kJmpccLen));
954 break;
955 case TestAndSmashFlags::kAlignJccImmediate:
956 prepareForSmash(a,
957 testBytes + kJmpccLen,
958 testBytes + kJmpccLen - kJmpImmBytes);
959 assert(isSmashable(a.frontier() + testBytes, kJmpccLen,
960 kJmpccLen - kJmpImmBytes));
961 break;
962 case TestAndSmashFlags::kAlignJccAndJmp:
963 // Ensure that the entire jcc, and the entire jmp are smashable
964 // (but we don't need them both to be in the same cache line)
965 prepareForSmash(a, testBytes + kJmpccLen, testBytes);
966 prepareForSmash(a, testBytes + kJmpccLen + kJmpLen, testBytes + kJmpccLen);
967 assert(isSmashable(a.frontier() + testBytes, kJmpccLen));
968 assert(isSmashable(a.frontier() + testBytes + kJmpccLen, kJmpLen));
969 break;
973 void prepareForSmash(X64Assembler& a, int nBytes, int offset /* = 0 */) {
974 if (!isSmashable(a.frontier(), nBytes, offset)) {
975 int gapSize = (~(uintptr_t(a.frontier()) + offset) &
976 kX64CacheLineMask) + 1;
977 a.emitNop(gapSize);
978 assert(isSmashable(a.frontier(), nBytes, offset));
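// Continuing the example above: with the frontier ending in 0x3e and
// offset 0, gapSize is (~0x3e & 0x3f) + 1 == 2, so two bytes of nop move
// the frontier to the next cache-line boundary, where the 5-byte jmp fits.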
982 void
983 TranslatorX64::smash(X64Assembler &a, TCA src, TCA dest, bool isCall) {
984 assert(canWrite());
985 TRACE(2, "smash: %p -> %p\n", src, dest);
989 * We are about to smash reachable code in the translation cache. A
990 * hardware thread might be executing the very instruction we're
991 * modifying. This is safe because:
993 * 1. We align smashable instructions so that they reside on a single
994 * cache line;
996 * 2. We modify the instruction with a single processor store; and
998 * 3. The smashed region contains only a single instruction in the
999 * original instruction stream (see jmp() -> emitJ32() -> bytes() in
1000 * the assembler).
1002 CodeCursor cg(a, src);
1003 assert(isSmashable(a.frontier(), kJmpLen));
1004 if (dest > src && dest - src <= kJmpLen) {
1005 assert(!isCall);
1006 a. emitNop(dest - src);
1007 } else if (!isCall) {
1008 a. jmp(dest);
1009 } else {
1010 a. call(dest);
1014 void TranslatorX64::protectCode() {
1015 mprotect(tx64->ahot.base(),
1016 tx64->astubs.base() - tx64->ahot.base() +
1017 tx64->astubs.capacity(), PROT_READ | PROT_EXEC);
1021 void TranslatorX64::unprotectCode() {
1022 mprotect(tx64->ahot.base(),
1023 tx64->astubs.base() - tx64->ahot.base() +
1024 tx64->astubs.capacity(),
1025 PROT_READ | PROT_WRITE | PROT_EXEC);
1028 void
1029 TranslatorX64::emitStackCheck(int funcDepth, Offset pc) {
1030 funcDepth += kStackCheckPadding * sizeof(Cell);
1032 uint64_t stackMask = cellsToBytes(RuntimeOption::EvalVMStackElms) - 1;
1033 a. mov_reg64_reg64(rVmSp, rAsm); // copy to destroy
1034 a. and_imm64_reg64(stackMask, rAsm);
1035 a. sub_imm64_reg64(funcDepth + Stack::sSurprisePageSize, rAsm);
1036 assert(m_stackOverflowHelper);
1037 a. jl(m_stackOverflowHelper); // Unlikely branch to failure.
1038 // Success.
1041 // Tests the surprise flags for the current thread. Should be used
1042 // before a jnz to surprise handling code.
1043 void
1044 TranslatorX64::emitTestSurpriseFlags(Asm& a) {
1045 static_assert(RequestInjectionData::LastFlag < (1 << 8),
1046 "Translator assumes RequestInjectionFlags fit in one byte");
1047 a. testb((int8_t)0xff, rVmTl[TargetCache::kConditionFlagsOff]);
1050 void
1051 TranslatorX64::emitCheckSurpriseFlagsEnter(bool inTracelet, Fixup fixup) {
1052 emitTestSurpriseFlags(a);
1054 UnlikelyIfBlock ifTracer(CC_NZ, a, astubs);
1055 if (false) { // typecheck
1056 const ActRec* ar = nullptr;
1057 functionEnterHelper(ar);
1059 astubs.mov_reg64_reg64(rVmFp, argNumToRegName[0]);
1060 emitCall(astubs, (TCA)&functionEnterHelper);
1061 if (inTracelet) {
1062 recordSyncPoint(astubs, fixup.m_pcOffset, fixup.m_spOffset);
1063 } else {
1064 // If we're being called while generating a func prologue, we
1065 // have to record the fixup directly in the fixup map instead of
1066 // going through m_pendingFixups like normal.
1067 m_fixupMap.recordFixup(astubs.frontier(), fixup);
1072 void
1073 TranslatorX64::setArgInActRec(ActRec* ar, int argNum, uint64_t datum,
1074 DataType t) {
1075 TypedValue* tv =
1076 (TypedValue*)(uintptr_t(ar) - (argNum+1) * sizeof(TypedValue));
1077 tv->m_data.num = datum;
1078 tv->m_type = t;
1082 TranslatorX64::shuffleArgsForMagicCall(ActRec* ar) {
1083 if (!ar->hasInvName()) {
1084 return 0;
1086 const Func* f UNUSED = ar->m_func;
1087 f->validate();
1088 assert(f->name()->isame(s___call.get())
1089 || f->name()->isame(s___callStatic.get()));
1090 assert(f->numParams() == 2);
1091 TRACE(1, "shuffleArgsForMagicCall: ar %p\n", ar);
1092 assert(ar->hasInvName());
1093 StringData* invName = ar->getInvName();
1094 assert(invName);
1095 ar->setVarEnv(nullptr);
1096 int nargs = ar->numArgs();
1097 // We need to make an array containing all the arguments passed by the
1098 // caller and put it where the second argument is
1099 HphpArray* argArray = ArrayData::Make(nargs);
1100 argArray->incRefCount();
1101 for (int i = 0; i < nargs; ++i) {
1102 TypedValue* tv =
1103 (TypedValue*)(uintptr_t(ar) - (i+1) * sizeof(TypedValue));
1104 argArray->nvAppend(tv);
1105 tvRefcountedDecRef(tv);
1107 // Put invName in the slot for first argument
1108 setArgInActRec(ar, 0, uint64_t(invName), BitwiseKindOfString);
1109 // Put argArray in the slot for second argument
1110 setArgInActRec(ar, 1, uint64_t(argArray), KindOfArray);
1111 // Fix up ActRec's numArgs
1112 ar->initNumArgs(2);
1113 return 1;
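// Net effect on the ActRec's argument slots, sketched for a call that
// passed three args to an undefined method "foo":
//   before: [arg0][arg1][arg2]               numArgs == 3, invName == "foo"
//   after:  ["foo"][array(arg0, arg1, arg2)] numArgs == 2
// which matches the __call($name, $args) signature asserted above.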
1117 * The standard VMRegAnchor treatment won't work for some cases called
1118 * during function preludes.
1120 * The fp sync machinery is fundamentally based on the notion that
1121 * instruction pointers in the TC are uniquely associated with source
1122 * HHBC instructions, and that source HHBC instructions are in turn
1123 * uniquely associated with SP->FP deltas.
1125 * trimExtraArgs is called from the prologue of the callee.
1126 * The prologue is 1) still in the caller frame for now,
1127 * and 2) shared across multiple call sites. 1 means that we have the
1128 * fp from the caller's frame, and 2 means that this fp is not enough
1129 * to figure out sp.
1131 * However, the prologue passes us the callee actRec, whose predecessor
1132 * has to be the caller. So we can sync sp and fp by ourselves here.
1133 * Geronimo!
1135 static void sync_regstate_to_caller(ActRec* preLive) {
1136 assert(tl_regState == VMRegState::DIRTY);
1137 VMExecutionContext* ec = g_vmContext;
1138 ec->m_stack.top() = (TypedValue*)preLive - preLive->numArgs();
1139 ActRec* fp = preLive == ec->m_firstAR ?
1140 ec->m_nestedVMs.back().m_savedState.fp : (ActRec*)preLive->m_savedRbp;
1141 ec->m_fp = fp;
1142 ec->m_pc = fp->m_func->unit()->at(fp->m_func->base() + preLive->m_soff);
1143 tl_regState = VMRegState::CLEAN;
1146 void
1147 TranslatorX64::trimExtraArgs(ActRec* ar) {
1148 assert(!ar->hasInvName());
1150 sync_regstate_to_caller(ar);
1151 const Func* f = ar->m_func;
1152 int numParams = f->numParams();
1153 int numArgs = ar->numArgs();
1154 assert(numArgs > numParams);
1155 int numExtra = numArgs - numParams;
1157 TRACE(1, "trimExtraArgs: %d args, function %s takes only %d, ar %p\n",
1158 numArgs, f->name()->data(), numParams, ar);
1160 if (f->attrs() & AttrMayUseVV) {
1161 assert(!ar->hasExtraArgs());
1162 ar->setExtraArgs(ExtraArgs::allocateCopy(
1163 (TypedValue*)(uintptr_t(ar) - numArgs * sizeof(TypedValue)),
1164 numArgs - numParams));
1165 } else {
1166 // Function is not marked as "MayUseVV", so discard the extra arguments
1167 TypedValue* tv = (TypedValue*)(uintptr_t(ar) - numArgs*sizeof(TypedValue));
1168 for (int i = 0; i < numExtra; ++i) {
1169 tvRefcountedDecRef(tv);
1170 ++tv;
1172 ar->setNumArgs(numParams);
1175 // Only go back to dirty in a non-exception case. (Same reason as
1176 // above.)
1177 tl_regState = VMRegState::DIRTY;
1181 TranslatorX64::emitCallArrayPrologue(const Func* func,
1182 const DVFuncletsVec& dvs) {
1183 TCA start = a.frontier();
1184 if (dvs.size() == 1) {
1185 a. cmp_imm32_disp_reg32(dvs[0].first,
1186 AROFF(m_numArgsAndCtorFlag), rVmFp);
1187 emitBindJcc(a, CC_LE, SrcKey(func, dvs[0].second));
1188 emitBindJmp(a, SrcKey(func, func->base()));
1189 } else {
1190 a. load_reg64_disp_reg32(rVmFp, AROFF(m_numArgsAndCtorFlag), rax);
1191 for (unsigned i = 0; i < dvs.size(); i++) {
1192 a. cmp_imm32_reg32(dvs[i].first, rax);
1193 emitBindJcc(a, CC_LE, SrcKey(func, dvs[i].second));
1195 emitBindJmp(a, SrcKey(func, func->base()));
1197 return start;
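// Sketch of the emitted dispatch for dvs == {{1, off1}, {2, off2}} (the
// multi-entry case): load numArgs into eax, then for each entry
// "cmp $n, %eax; jle <bind-jcc to offN>", finally falling through to a
// bound jump at func->base().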
1201 TranslatorX64::getCallArrayPrologue(Func* func) {
1202 TCA tca = func->getFuncBody();
1203 if (tca != (TCA)funcBodyHelperThunk) return tca;
1205 DVFuncletsVec dvs = func->getDVFunclets();
1207 if (dvs.size()) {
1208 LeaseHolder writer(s_writeLease);
1209 if (!writer) return nullptr;
1210 tca = func->getFuncBody();
1211 if (tca != (TCA)funcBodyHelperThunk) return tca;
1212 tca = emitCallArrayPrologue(func, dvs);
1213 func->setFuncBody(tca);
1214 } else {
1215 SrcKey sk(func, func->base());
1216 tca = tx64->getTranslation(TranslArgs(sk, false).setFuncBody());
1219 return tca;
1223 TranslatorX64::emitPrologueRedispatch(X64Assembler& a) {
1224 TCA retval;
1225 moveToAlign(a);
1226 retval = a.frontier();
1227 TRACE(1, "HOTSTUB: emitPrologueRedispatch: %lx\n", uintptr_t(a.frontier()));
1229 // We're in the wrong func prologue.
1231 assert(kScratchCrossTraceRegs.contains(rax));
1232 assert(kScratchCrossTraceRegs.contains(rdx));
1233 assert(kScratchCrossTraceRegs.contains(rcx));
1235 // Get the called func in rax
1236 a. load_reg64_disp_reg64(rStashedAR, AROFF(m_func), rax);
1237 // Get the number of passed parameters in rdx
1238 a. load_reg64_disp_reg32(rStashedAR, AROFF(m_numArgsAndCtorFlag), rdx);
1239 a. and_imm32_reg32(0x7fffffff, rdx);
1240 // Get the number of declared parameters in rcx
1241 a. load_reg64_disp_reg32(rax, Func::numParamsOff(), rcx);
1243 // If we didn't pass too many args, directly dereference
1244 // func->m_prologues.
1245 a. cmp_reg32_reg32(rdx, rcx);
1246 TCA bToFixedProloguesCheck = a.frontier();
1247 a. jcc8(CC_L, bToFixedProloguesCheck);
1249 // cmp $kNumFixedPrologues, %rdx
1250 // jl numParamsCheck
1251 TCA actualDispatch = a.frontier();
1253 // rcx: prologueIdx
1254 // rax = func->prologues[numParams]
1255 // jmp rax
1256 a. loadq (rax[rdx*8 + Func::prologueTableOff()], rax);
1257 a. jmp (rax);
1258 a. ud2 ();
1260 // Hmm, more parameters passed than the function expected. Did we pass
1261 // kNumFixedPrologues or more? If not, %rdx is still a perfectly
1262 // legitimate index into the func prologue table.
1263 // numParamsCheck:
1264 // cmp $kNumFixedPrologues, %rcx
1265 // jl dispatch
1266 a.patchJcc8(bToFixedProloguesCheck, a.frontier()); // numParamsCheck:
1267 a. cmp_imm32_reg32(kNumFixedPrologues, rdx);
1268 a. jcc8(CC_L, actualDispatch);
1270 // Too many gosh-darned parameters passed. Go to numExpected + 1, which
1271 // is always a "too many params" entry point.
1273 // mov %rdx, %rcx
1274 // add $1, %rcx
1275 // jmp dispatch
1276 a. load_reg64_disp_index_reg64(rax,
1277 // %rcx + 1
1278 Func::prologueTableOff() + sizeof(TCA),
1279 rcx,
1280 rax);
1281 a. jmp(rax);
1282 a. ud2();
1283 return retval;
1286 // The funcGuard gets skipped and patched by other code, so we have some
1287 // magic offsets.
1288 static const int kFuncMovImm = 6; // Offset to the immediate for 8 byte Func*
1289 static const int kFuncCmpImm = 4; // Offset to the immediate for 4 byte Func*
1290 static const int kFuncGuardLen = 23;
1291 static const int kFuncGuardShortLen = 14;
1293 template<typename T>
1294 static T*
1295 funcPrologueToGuardImm(TCA prologue) {
1296 assert(sizeof(T) == 4 || sizeof(T) == 8);
1297 T* retval = (T*)(prologue - (sizeof(T) == 8 ?
1298 kFuncGuardLen - kFuncMovImm :
1299 kFuncGuardShortLen - kFuncCmpImm));
1300 // We padded these so the immediate would fit inside a cache line
1301 assert(((uintptr_t(retval) ^ (uintptr_t(retval + 1) - 1)) &
1302 ~(kX64CacheLineSize - 1)) == 0);
1304 return retval;
1307 static inline bool
1308 funcPrologueHasGuard(TCA prologue, const Func* func) {
1309 intptr_t iptr = uintptr_t(func);
1310 if (deltaFits(iptr, sz::dword)) {
1311 return *funcPrologueToGuardImm<int32_t>(prologue) == iptr;
1313 return *funcPrologueToGuardImm<int64_t>(prologue) == iptr;
1316 static TCA
1317 funcPrologueToGuard(TCA prologue, const Func* func) {
1318 if (!prologue || prologue == (TCA)fcallHelperThunk) return prologue;
1319 return prologue -
1320 (deltaFits(uintptr_t(func), sz::dword) ?
1321 kFuncGuardShortLen :
1322 kFuncGuardLen);
1325 static inline void
1326 funcPrologueSmashGuard(TCA prologue, const Func* func) {
1327 intptr_t iptr = uintptr_t(func);
1328 if (deltaFits(iptr, sz::dword)) {
1329 *funcPrologueToGuardImm<int32_t>(prologue) = 0;
1330 return;
1332 *funcPrologueToGuardImm<int64_t>(prologue) = 0;
1335 void
1336 TranslatorX64::smashPrologueGuards(TCA* prologues, int numPrologues,
1337 const Func* func) {
1338 DEBUG_ONLY std::unique_ptr<LeaseHolder> writer;
1339 for (int i = 0; i < numPrologues; i++) {
1340 if (prologues[i] != (TCA)fcallHelperThunk
1341 && funcPrologueHasGuard(prologues[i], func)) {
1342 if (debug) {
1344 * Units are sometimes created racily, in which case all
1345 * but the first are destroyed immediately. In that case,
1346 * the Funcs of the destroyed Units never need their
1347 * prologues smashing, and it would be a lock rank violation
1348 * to take the write lease here.
1349 * In all other cases, Funcs are destroyed via a delayed path
1350 * (treadmill) and the rank violation isn't an issue.
1352 * Also note that we only need the write lease because we
1353 * mprotect the translation cache in debug builds.
1355 if (!writer) writer.reset(new LeaseHolder(s_writeLease));
1357 funcPrologueSmashGuard(prologues[i], func);
1363 TranslatorX64::emitFuncGuard(X64Assembler& a, const Func* func) {
1364 assert(kScratchCrossTraceRegs.contains(rax));
1365 assert(kScratchCrossTraceRegs.contains(rdx));
1367 const int kAlign = kX64CacheLineSize;
1368 const int kAlignMask = kAlign - 1;
1369 int loBits = uintptr_t(a.frontier()) & kAlignMask;
1370 int delta, size;
1372 // Ensure the immediate is safely smashable
1373 // the immediate must not cross a qword boundary,
1374 if (!deltaFits((intptr_t)func, sz::dword)) {
1375 size = 8;
1376 delta = loBits + kFuncMovImm;
1377 } else {
1378 size = 4;
1379 delta = loBits + kFuncCmpImm;
1382 delta = (delta + size - 1) & kAlignMask;
1383 if (delta < size - 1) {
1384 a.emitNop(size - 1 - delta);
1387 TCA aStart DEBUG_ONLY = a.frontier();
1388 if (!deltaFits((intptr_t)func, sz::dword)) {
1389 a. load_reg64_disp_reg64(rStashedAR, AROFF(m_func), rax);
1391 Although func doesn't fit in a signed 32-bit immediate, it may still
1392 fit in an unsigned one. Rather than deal with yet another case
1393 (which only happens when we disable jemalloc) just force it to
1394 be an 8-byte immediate, and patch it up afterwards.
1396 a. mov_imm64_reg(0xdeadbeeffeedface, rdx);
1397 assert(((uint64_t*)a.frontier())[-1] == 0xdeadbeeffeedface);
1398 ((uint64_t*)a.frontier())[-1] = uintptr_t(func);
1399 a. cmp_reg64_reg64(rax, rdx);
1400 } else {
1401 a. cmp_imm32_disp_reg32(uint64_t(func), AROFF(m_func), rStashedAR);
1404 assert(m_funcPrologueRedispatch);
1406 a. jnz(m_funcPrologueRedispatch);
1407 assert(funcPrologueToGuard(a.frontier(), func) == aStart);
1408 assert(funcPrologueHasGuard(a.frontier(), func));
1409 return a.frontier();
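// The two guard shapes emitted above, matching the kFuncGuard* constants:
//   short, 14 bytes (Func* fits in 32 bits):
//     cmp  $func, AROFF(m_func)(%rStashedAR);  jnz funcPrologueRedispatch
//   long, 23 bytes (64-bit Func*):
//     mov  AROFF(m_func)(%rStashedAR), %rax
//     mov  $func, %rdx;  cmp %rax, %rdx;  jnz funcPrologueRedispatch
// funcPrologueToGuard()/funcPrologueHasGuard() depend on these exact sizes.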
1413 * funcPrologue --
1415 * Given a callee and a number of args, match up to the callee's
1416 * argument expectations and dispatch.
1418 * Call/return hand-shaking is a bit funny initially. At translation time,
1419 * we don't necessarily know what function we're calling. For instance,
1421 * f(g());
1423 * Will lead to a set of basic blocks like:
1425 * b1: pushfuncd "f"
1426 * pushfuncd "g"
1427 * fcall
1428 * b2: fcall
1430 * The fcall labelled "b2" above is not statically bindable in our
1431 * execution model.
1433 * We decouple the call work into a per-callsite portion, responsible
1434 * for recording the return address, and a per-(callee, numArgs) portion,
1435 * responsible for fixing up arguments and dispatching to remaining
1436 * code. We call the per-callee portion a "prologue."
1438 * Also, we are called from two distinct environments. From REQ_BIND_CALL,
1439 * we're running "between" basic blocks, with all VM registers sync'ed.
1440 * However, we're also called in the middle of basic blocks, when dropping
1441 * entries into func->m_prologues. So don't go around using the
1442 * translation-time values of vmfp()/vmsp(), since they have an
1443 * unpredictable relationship to the source.
1445 bool
1446 TranslatorX64::checkCachedPrologue(const Func* func, int paramIdx,
1447 TCA& prologue) const {
1448 prologue = (TCA)func->getPrologue(paramIdx);
1449 if (prologue != (TCA)fcallHelperThunk && !s_replaceInFlight) {
1450 TRACE(1, "cached prologue %s(%d) -> cached %p\n",
1451 func->fullName()->data(), paramIdx, prologue);
1452 assert(isValidCodeAddress(prologue));
1453 return true;
1455 return false;
1458 // pops the return address pushed by fcall and stores it into the actrec
1459 void
1460 TranslatorX64::emitPopRetIntoActRec(Asm& a) {
1461 a. pop (rStashedAR[AROFF(m_savedRip)]);
1464 static void interp_set_regs(ActRec* ar, Cell* sp, Offset pcOff) {
1465 assert(tl_regState == VMRegState::DIRTY);
1466 tl_regState = VMRegState::CLEAN;
1467 vmfp() = (Cell*)ar;
1468 vmsp() = sp;
1469 vmpc() = ar->unit()->at(pcOff);
1473 TranslatorX64::funcPrologue(Func* func, int nPassed, ActRec* ar) {
1474 func->validate();
1475 TRACE(1, "funcPrologue %s(%d)\n", func->fullName()->data(), nPassed);
1476 int numParams = func->numParams();
1477 int paramIndex = nPassed <= numParams ? nPassed : numParams + 1;
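// Example: a function declaring 3 params called with 5 args gets
// paramIndex == 4 (numParams + 1), the shared "too many args" prologue
// slot; calls passing 0..3 args each get their own slot.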
1479 bool funcIsMagic = func->isMagic();
1481 // Do a quick test before grabbing the write lease
1482 TCA prologue;
1483 if (checkCachedPrologue(func, paramIndex, prologue)) return prologue;
1484 if (func->isClonedClosure()) {
1485 assert(ar);
1486 const Func::ParamInfoVec& paramInfo = func->params();
1487 Offset entry = func->base();
1488 for (int i = nPassed; i < numParams; ++i) {
1489 const Func::ParamInfo& pi = paramInfo[i];
1490 if (pi.hasDefaultValue()) {
1491 entry = pi.funcletOff();
1492 break;
1495 interp_set_regs(ar, (Cell*)ar - func->numSlotsInFrame(), entry);
1496 SrcKey funcBody(func, entry);
1497 TCA tca = getTranslation(TranslArgs(funcBody, false));
1498 tl_regState = VMRegState::DIRTY;
1499 if (tca) {
1500 // racy, but ok...
1501 func->setPrologue(paramIndex, tca);
1503 return tca;
1506 // If the translator is getting replaced out from under us, refuse to
1507 // provide a prologue; we don't know whether this request is running on the
1508 // old or new context.
1509 LeaseHolder writer(s_writeLease);
1510 if (!writer || s_replaceInFlight) return nullptr;
1511 // Double check the prologue array now that we have the write lease
1512 // in case another thread snuck in and set the prologue already.
1513 if (checkCachedPrologue(func, paramIndex, prologue)) return prologue;
1515 AsmSelector asmSel(AsmSelector::Args(this).hot(func->attrs() & AttrHot));
1517 SpaceRecorder sr("_FuncPrologue", a);
1518 // If we're close to a cache line boundary, just burn some space to
1519 // try to keep the func and its body on fewer total lines.
1520 if (((uintptr_t)a.frontier() & kX64CacheLineMask) >= 32) {
1521 moveToAlign(a, kX64CacheLineSize);
1523 // Careful: this isn't necessarily the real entry point. For funcIsMagic
1524 // prologues, this is just a possible prologue.
1525 TCA aStart = a.frontier();
1526 TCA start = aStart;
1527 TCA stubStart = astubs.frontier();
1529 // Guard: we're in the right callee. This happens in magicStart for
1530 // magic callees.
1531 if (!funcIsMagic) {
1532 start = aStart = emitFuncGuard(a, func);
1535 emitRB(a, RBTypeFuncPrologueTry, func->fullName()->data());
1537 // NB: We have most of the register file to play with, since we know
1538 // we're between BB's. So, we hardcode some registers here rather
1539 // than using the scratch allocator.
1540 TRACE(2, "funcPrologue: user function: %s\n", func->name()->data());
1542 // Add a counter for the translation if requested
1543 if (RuntimeOption::EvalJitTransCounters) {
1544 emitTransCounterInc(a);
1547 if (!funcIsMagic) {
1548 emitPopRetIntoActRec(a);
1549 // entry point for magic methods comes later
1550 emitRB(a, RBTypeFuncEntry, func->fullName()->data());
1553 * Guard: we have enough stack space to complete this
1554 * function. We omit overflow checks if it is a leaf function
1555 * that can't use more than kStackCheckLeafPadding cells.
1557 auto const needStackCheck =
1558 !(func->attrs() & AttrPhpLeafFn) ||
1559 func->maxStackCells() >= kStackCheckLeafPadding;
1560 if (needStackCheck) {
1561 emitStackCheck(cellsToBytes(func->maxStackCells()), func->base());
1565 SrcKey skFuncBody = emitPrologue(func, nPassed);
1567 if (funcIsMagic) {
1568 // entry point for magic methods is here
1569 TCA magicStart = emitFuncGuard(a, func);
1570 emitPopRetIntoActRec(a);
1571 emitRB(a, RBTypeFuncEntry, func->fullName()->data());
1572 // Guard: we have enough stack space to complete this function.
1573 emitStackCheck(cellsToBytes(func->maxStackCells()), func->base());
1574 assert(numParams == 2);
1575 // Special __call prologue
1576 a. mov_reg64_reg64(rStashedAR, argNumToRegName[0]);
1577 emitCall(a, TCA(TranslatorX64::shuffleArgsForMagicCall));
1578 if (memory_profiling) {
1579 m_fixupMap.recordFixup(
1580 a.frontier(),
1581 Fixup(skFuncBody.offset() - func->base(), func->numSlotsInFrame())
1584 // if shuffleArgs returns 0, that means this was not a magic call
1585 // and we should proceed to a prologue specialized for nPassed;
1586 // otherwise, proceed to a prologue specialized for nPassed==numParams (2).
1587 if (nPassed == 2) {
1588 a.jmp(start);
1589 } else {
1590 a.test_reg64_reg64(rax, rax);
1591 // z ==> not a magic call, go to prologue for nPassed
1592 if (deltaFits(start - (a.frontier() + kJcc8Len), sz::byte)) {
1593 a.jcc8(CC_Z, start);
1594 } else {
1595 a.jcc(CC_Z, start);
1597 // this was a magic call
1598 // nPassed == 2
1599 // Fix up hardware stack pointer
1600 nPassed = 2;
1601 emitLea(a, rStashedAR, -cellsToBytes(nPassed), rVmSp);
1602 // Optimization TODO: Reuse the prologue for args == 2
1603 emitPrologue(func, nPassed);
1605 start = magicStart;
1607 assert(funcPrologueHasGuard(start, func));
1608 TRACE(2, "funcPrologue tx64 %p %s(%d) setting prologue %p\n",
1609 this, func->fullName()->data(), nPassed, start);
1610 assert(isValidCodeAddress(start));
1611 func->setPrologue(paramIndex, start);
1613 addTranslation(TransRec(skFuncBody, func->unit()->md5(),
1614 TransPrologue, aStart, a.frontier() - aStart,
1615 stubStart, astubs.frontier() - stubStart));
1617 if (m_profData) {
1618 m_profData->addTransPrologue(skFuncBody);
1621 recordGdbTranslation(skFuncBody, func,
1622 a, aStart,
1623 false, true);
1624 recordBCInstr(OpFuncPrologue, a, start);
1626 return start;
1629 static void raiseMissingArgument(const char* name, int expected, int got) {
1630 if (expected == 1) {
1631 raise_warning(Strings::MISSING_ARGUMENT, name, got);
1632 } else {
1633 raise_warning(Strings::MISSING_ARGUMENTS, name, expected, got);
1637 SrcKey
1638 TranslatorX64::emitPrologue(Func* func, int nPassed) {
1639 int numParams = func->numParams();
1640 const Func::ParamInfoVec& paramInfo = func->params();
1642 Offset dvInitializer = InvalidAbsoluteOffset;
1644 assert(IMPLIES(func->isGenerator(), nPassed == numParams));
1645 if (nPassed > numParams) {
1646 // Too many args; a weird case, so just call out. Stash ar
1647 // somewhere callee-saved.
1648 if (false) { // typecheck
1649 TranslatorX64::trimExtraArgs((ActRec*)nullptr);
1651 a. mov_reg64_reg64(rStashedAR, argNumToRegName[0]);
1652 emitCall(a, TCA(TranslatorX64::trimExtraArgs));
1653 // We'll fix rVmSp below.
1654 } else if (nPassed < numParams) {
1655 // Figure out which, if any, default value initializer to go to
1656 for (int i = nPassed; i < numParams; ++i) {
1657 const Func::ParamInfo& pi = paramInfo[i];
1658 if (pi.hasDefaultValue()) {
1659 dvInitializer = pi.funcletOff();
1660 break;
1663 TRACE(1, "Only have %d of %d args; getting dvFunclet\n",
1664 nPassed, numParams);
1665 a. emitImmReg(nPassed, rax);
1666 // do { *(--rVmSp) = NULL; nPassed++; } while (nPassed < numParams);
1667 // This should be an unusual case, so optimize for code density
1668 // rather than execution speed; i.e., don't unroll the loop.
1669 TCA loopTop = a.frontier();
1670 a. sub_imm32_reg64(sizeof(Cell), rVmSp);
1671 a. incl(eax);
1672 emitStoreUninitNull(a, 0, rVmSp);
1673 a. cmp_imm32_reg32(numParams, rax);
1674 a. jcc8(CC_L, loopTop);
1677 // Entry point for numParams == nPassed is here.
1678 // Args are kosher. Frame linkage: set fp = ar.
1679 a. mov_reg64_reg64(rStashedAR, rVmFp);
1681 int numLocals = numParams;
1682 if (func->isClosureBody()) {
1683 int numUseVars = func->cls()->numDeclProperties();
1685 emitLea(a, rVmFp, -cellsToBytes(numParams), rVmSp);
1687 PhysReg rClosure = rcx;
1688 a. loadq(rVmFp[AROFF(m_this)], rClosure);
1690 // Swap in the $this or late bound class
1691 a. loadq(rClosure[c_Closure::ctxOffset()], rAsm);
1692 a. storeq(rAsm, rVmFp[AROFF(m_this)]);
1694 if (!(func->attrs() & AttrStatic)) {
1695 a.shrq(1, rAsm);
1696 JccBlock<CC_BE> ifRealThis(a);
1697 a.shlq(1, rAsm);
1698 emitIncRef(rAsm, KindOfObject);
1701 // Put in the correct context
1702 a. loadq(rClosure[c_Closure::funcOffset()], rAsm);
1703 a. storeq(rAsm, rVmFp[AROFF(m_func)]);
1705 // Copy in all the use vars
1706 int baseUVOffset = sizeof(ObjectData) + func->cls()->builtinPropSize();
1707 for (int i = 0; i < numUseVars + 1; i++) {
1708 int spOffset = -cellsToBytes(i+1);
1710 if (i == 0) {
1711 // The closure is the first local.
1712 // We don't incref because it used to be $this
1713 // and now it is a local, so they cancel out
1714 emitStoreTypedValue(a, KindOfObject, rClosure, spOffset, rVmSp);
1715 continue;
1718 int uvOffset = baseUVOffset + cellsToBytes(i-1);
1720 emitCopyTo(a, rClosure, uvOffset, rVmSp, spOffset, rAsm);
1721 emitIncRefGenericRegSafe(rVmSp, spOffset, rAsm);
1724 numLocals += numUseVars + 1;
1727 // We're in the callee frame; initialize locals. Unroll the loop all
1728 // the way if there are a modest number of locals to update;
1729 // otherwise, do it in a compact loop. If we're in a generator body,
1730 // named locals will be initialized by UnpackCont so we can leave
1731 // them alone here.
1732 int numUninitLocals = func->numLocals() - numLocals;
1733 assert(numUninitLocals >= 0);
1734 if (numUninitLocals > 0 && !func->isGenerator()) {
1735 SpaceRecorder sr("_InitializeLocals", a);
1737 // If there are too many locals, then emitting a loop to initialize locals
1738 // is more compact than emitting a slew of movs inline.
1739 if (numUninitLocals > kLocalsToInitializeInline) {
1740 PhysReg loopReg = rcx;
1742 // rVmFp + rcx points to the count/type fields of the TypedValue we're
1743 // about to write to.
1744 int loopStart = -func->numLocals() * sizeof(TypedValue) + TVOFF(m_type);
1745 int loopEnd = -numLocals * sizeof(TypedValue) + TVOFF(m_type);
1747 a. emitImmReg(loopStart, loopReg);
1748 a. emitImmReg(KindOfUninit, rdx);
1750 TCA topOfLoop = a.frontier();
1751 // do {
1752 // rVmFp[loopReg].m_type = KindOfUninit;
1753 // } while(++loopReg != loopEnd);
1755 emitStoreTVType(a, edx, rVmFp[loopReg]);
1756 a. addq (sizeof(Cell), loopReg);
1757 a. cmpq (loopEnd, loopReg);
1758 a. jcc8 (CC_NE, topOfLoop);
1759 } else {
1760 PhysReg base;
1761 int disp, k;
1762 static_assert(KindOfUninit == 0, "");
1763 if (numParams < func->numLocals()) {
1764 a.xorl (eax, eax);
1766 for (k = numLocals; k < func->numLocals(); ++k) {
1767 locToRegDisp(Location(Location::Local, k), &base, &disp, func);
1768 emitStoreTVType(a, eax, base[disp + TVOFF(m_type)]);
1773 const Opcode* destPC = func->unit()->entry() + func->base();
1774 if (dvInitializer != InvalidAbsoluteOffset) {
1775 // dispatch to funclet.
1776 destPC = func->unit()->entry() + dvInitializer;
1778 SrcKey funcBody(func, destPC);
1780 // Move rVmSp to the right place: just past all locals
1781 int frameCells = func->numSlotsInFrame();
1782 if (func->isGenerator()) {
1783 frameCells = 1;
1784 } else {
1785 emitLea(a, rVmFp, -cellsToBytes(frameCells), rVmSp);
1788 Fixup fixup(funcBody.offset() - func->base(), frameCells);
1790 // Emit warnings for any missing arguments
1791 if (!func->info()) {
1792 for (int i = nPassed; i < numParams; ++i) {
1793 if (paramInfo[i].funcletOff() == InvalidAbsoluteOffset) {
1794 a. emitImmReg((intptr_t)func->name()->data(), argNumToRegName[0]);
1795 a. emitImmReg(numParams, argNumToRegName[1]);
1796 a. emitImmReg(i, argNumToRegName[2]);
1797 emitCall(a, (TCA)raiseMissingArgument);
1798 m_fixupMap.recordFixup(a.frontier(), fixup);
1803 // Check surprise flags in the same place as the interpreter: after
1804 // setting up the callee's frame but before executing any of its
1805 // code
1806 emitCheckSurpriseFlagsEnter(false, fixup);
1808 if (func->isClosureBody() && func->cls()) {
1809 int entry = nPassed <= numParams ? nPassed : numParams + 1;
1810 // Relying on rStashedAR == rVmFp here
1811 a. loadq (rStashedAR[AROFF(m_func)], rax);
1812 a. loadq (rax[Func::prologueTableOff() + sizeof(TCA)*entry], rax);
1813 a. jmp (rax);
1814 } else {
1815 emitBindJmp(funcBody);
1817 return funcBody;
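/*
 * Illustrative-only sketch of what the prologue emitted above does for the
 * nPassed < numParams case, written as the moral equivalent in C++ (nothing
 * here is generated or compiled; the comments map back to the emitted
 * instructions):
 *
 *   int n = nPassed;                  // emitImmReg(nPassed, rax)
 *   do {
 *     --vmsp;                         // sub_imm32_reg64(sizeof(Cell), rVmSp)
 *     ++n;                            // incl(eax)
 *     vmsp->m_type = KindOfUninit;    // emitStoreUninitNull
 *   } while (n < numParams);          // cmp_imm32_reg32 + jcc8(CC_L, loopTop)
 *
 * Control then joins the numParams == nPassed entry point, which sets up
 * frame linkage and finally binds a jump either to the function base or to
 * the selected default-value funclet.
 */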
1820 static bool
1821 isNativeImplCall(const Func* funcd, int numArgs) {
1822 return funcd && funcd->info() && numArgs == funcd->numParams();
1825 int32_t // returns the amount by which rVmSp should be adjusted
1826 TranslatorX64::emitBindCall(SrcKey srcKey, const Func* funcd, int numArgs) {
1827 // If this is a call to a builtin and we don't need any argument
1828 // munging, we can skip the prologue system and do it inline.
1829 if (isNativeImplCall(funcd, numArgs)) {
1830 StoreImmPatcher patchIP(a, (uint64_t)a.frontier(), reg::rax,
1831 cellsToBytes(numArgs) + AROFF(m_savedRip),
1832 rVmSp);
1833 assert(funcd->numLocals() == funcd->numParams());
1834 assert(funcd->numIterators() == 0);
1835 emitLea(a, rVmSp, cellsToBytes(numArgs), rVmFp);
1836 emitCheckSurpriseFlagsEnter(true, Fixup(0, numArgs));
1837 // rVmSp is already correctly adjusted, because there are no locals
1838 // other than the arguments passed.
1839 auto retval = emitNativeImpl(funcd, false /* don't jump to return */);
1840 patchIP.patch(uint64_t(a.frontier()));
1841 return retval;
1843 if (debug) {
1844 a. storeq (kUninitializedRIP,
1845 rVmSp[cellsToBytes(numArgs) + AROFF(m_savedRip)]);
1847 // Stash callee's rVmFp into rStashedAR for the callee's prologue
1848 emitLea(a, rVmSp, cellsToBytes(numArgs), rStashedAR);
1849 emitBindCallHelper(srcKey, funcd, numArgs);
1850 return 0;
1853 void
1854 TranslatorX64::emitBindCallHelper(SrcKey srcKey,
1855 const Func* funcd,
1856 int numArgs) {
1857 // Whatever prologue we're branching to will check at runtime that we
1858 // went to the right Func*, correcting if necessary. We treat the first
1859 // Func we encounter as a decent prediction. Make space to burn in a
1860 // TCA.
1861 ReqBindCall* req = m_globalData.alloc<ReqBindCall>();
1862 prepareForSmash(a, kCallLen);
1863 TCA toSmash = a.frontier();
1864 a. call(astubs.frontier());
1866 astubs. mov_reg64_reg64(rStashedAR, serviceReqArgRegs[1]);
1867 emitPopRetIntoActRec(astubs);
1868 emitServiceReq(REQ_BIND_CALL, req);
1870 TRACE(1, "will bind static call: tca %p, this %p, funcd %p, astubs %p\n",
1871 toSmash, this, funcd, astubs.frontier());
1872 req->m_toSmash = toSmash;
1873 req->m_nArgs = numArgs;
1874 req->m_sourceInstr = srcKey;
1875 req->m_isImmutable = (bool)funcd;
1877 return;
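/*
 * Rough picture of the code emitBindCallHelper lays down (illustrative only;
 * exact byte counts depend on kCallLen and the service-request encoding):
 *
 *   a:        call <stub in astubs>     ; smashable, padded by prepareForSmash
 *   astubs:   copy rStashedAR into serviceReqArgRegs[1]
 *             pop the return address into the ActRec's m_savedRip
 *             <service request REQ_BIND_CALL, req>
 *
 * The first execution falls into the stub and reaches handleServiceRequest's
 * REQ_BIND_CALL case, which finds or emits the callee prologue and then
 * smashes the call at toSmash to target it directly, so the stub is only
 * paid for once per call site.
 */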
1881 * NativeImpl is a special operation in the sense that it must be the
1882 * only opcode in a function body, and also functions as the return.
1884 * If emitSavedRIPReturn is false, it returns the amount by which
1885 * rVmSp should be adjusted; otherwise, it emits code to perform
1886 * the adjustment itself (this allows us to combine updates to rVmSp).
1888 int32_t TranslatorX64::emitNativeImpl(const Func* func,
1889 bool emitSavedRIPReturn) {
1890 BuiltinFunction builtinFuncPtr = func->builtinFuncPtr();
1891 if (false) { // typecheck
1892 ActRec* ar = nullptr;
1893 builtinFuncPtr(ar);
1896 TRACE(2, "calling builtin preClass %p func %p\n", func->preClass(),
1897 builtinFuncPtr);
1899 * Call the native implementation. This will free the locals for us in the
1900 * normal case. In the case where an exception is thrown, the VM unwinder
1901 * will handle it for us.
1903 a. mov_reg64_reg64(rVmFp, argNumToRegName[0]);
1904 if (eagerRecord(func)) {
1905 emitEagerSyncPoint(a, func->getEntry(), 0);
1907 emitCall(a, (TCA)builtinFuncPtr);
1910 * We're sometimes calling this while curFunc() isn't really the
1911 * builtin---make sure to properly record the sync point as if we
1912 * are inside the builtin.
1914 * The assumption here is that for builtins, the generated func
1915 * contains only a single opcode (NativeImpl), and there are no
1916 * non-argument locals.
1918 assert(func->numIterators() == 0 && func->isBuiltin());
1919 assert(func->numLocals() == func->numParams());
1920 assert(toOp(*func->getEntry()) == OpNativeImpl);
1921 assert(instrLen((Op*)func->getEntry()) == func->past() - func->base());
1922 Offset pcOffset = 0; // NativeImpl is the only instruction in the func
1923 Offset stackOff = func->numLocals(); // Builtin stubs have no
1924 // non-arg locals
1925 recordSyncPoint(a, pcOffset, stackOff);
1927 if (emitSavedRIPReturn) {
1928 // push the return address to get ready to ret.
1929 a. push (rVmFp[AROFF(m_savedRip)]);
1933 * The native implementation already put the return value on the
1934 * stack for us, and handled cleaning up the arguments. We have to
1935 * update the frame pointer and the stack pointer, and load the
1936 * return value into the return register so the trace we are
1937 * returning to has it where it expects.
1939 * TODO(#1273094): we should probably modify the actual builtins to
1940 * return values via registers (rax:edx) using the C ABI and do a
1941 * reg-to-reg move.
1943 int nLocalCells = func->numSlotsInFrame();
1944 if (emitSavedRIPReturn) {
1945 a. add_imm64_reg64(sizeof(ActRec) + cellsToBytes(nLocalCells-1), rVmSp);
1947 a. load_reg64_disp_reg64(rVmFp, AROFF(m_savedRbp), rVmFp);
1949 emitRB(a, RBTypeFuncExit, func->fullName()->data());
1950 if (emitSavedRIPReturn) {
1951 a. ret();
1952 translator_not_reached(a);
1953 return 0;
1955 return sizeof(ActRec) + cellsToBytes(nLocalCells-1);
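/*
 * Informal summary of emitNativeImpl's two modes, as emitted above:
 *
 *   emitSavedRIPReturn == true:  the stub restores rVmFp, adjusts rVmSp by
 *     sizeof(ActRec) + cellsToBytes(nLocalCells - 1), and rets through the
 *     saved RIP; the function then returns 0.
 *   emitSavedRIPReturn == false: rVmFp is still restored, but no rVmSp
 *     adjustment or ret is emitted; the byte delta is returned instead so
 *     the caller (emitBindCall) can fold it into its own stack-pointer
 *     update.
 */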
1959 * bindJmp --
1961 * Runtime service handler that patches a jmp to the translation of
1962 * u:dest from toSmash.
1965 TranslatorX64::bindJmp(TCA toSmash, SrcKey destSk,
1966 ServiceRequest req, bool& smashed) {
1967 TCA tDest = getTranslation(
1968 TranslArgs(destSk, false).interp(req == REQ_BIND_JMP_NO_IR)
1969 .src(toSmash));
1970 if (!tDest) return nullptr;
1971 LeaseHolder writer(s_writeLease);
1972 if (!writer) return tDest;
1973 smashed = true;
1974 SrcRec* sr = getSrcRec(destSk);
1975 if (req == REQ_BIND_ADDR) {
1976 sr->chainFrom(IncomingBranch::addr(reinterpret_cast<TCA*>(toSmash)));
1977 } else if (req == REQ_BIND_JCC) {
1978 sr->chainFrom(IncomingBranch::jccFrom(toSmash));
1979 } else {
1980 sr->chainFrom(IncomingBranch::jmpFrom(toSmash));
1982 return tDest;
1986 * When we end a tracelet with a conditional jump, emitCondJmp first emits:
1988 * 1: j<CC> stubJmpccFirst
1989 * jmp stubJmpccFirst
1991 * Our "taken" argument tells us whether the branch at 1: was taken or
1992 * not, and therefore which of offTaken and offNotTaken to continue executing.
1993 * If we did take the branch, we now rewrite the code so that the branch is
1994 * straightened. This predicts that subsequent executions will go the same way
1995 * as the first execution.
1997 * jn<CC> stubJmpccSecond:offNotTaken
1998 * nop5 ; fallthru, or jmp if there's already a translation.
1999 * offTaken:
2001 * If we did not take the branch, we leave the sense of the condition
2002 * intact, while patching it up to go to the unexplored code:
2004 * j<CC> stubJmpccSecond:offTaken
2005 * nop5
2006 * offNotTaken:
2009 TranslatorX64::bindJmpccFirst(TCA toSmash,
2010 Offset offTaken, Offset offNotTaken,
2011 bool taken,
2012 ConditionCode cc,
2013 bool& smashed) {
2014 const Func* f = liveFunc();
2015 LeaseHolder writer(s_writeLease);
2016 if (!writer) return nullptr;
2017 Offset offWillExplore = taken ? offTaken : offNotTaken;
2018 Offset offWillDefer = taken ? offNotTaken : offTaken;
2019 SrcKey dest(f, offWillExplore);
2020 TRACE(3, "bindJmpccFirst: explored %d, will defer %d; overwriting cc%02x "
2021 "taken %d\n",
2022 offWillExplore, offWillDefer, cc, taken);
2024 // We want the branch to point to whichever side has not been explored
2025 // yet.
2026 if (taken) cc = ccNegate(cc);
2027 TCA stub = emitServiceReq(REQ_BIND_JMPCC_SECOND, toSmash, offWillDefer, cc);
2029 Asm& as = getAsmFor(toSmash);
2030 // It's not clear where chainFrom should go if as is astubs.
2031 assert(&as != &astubs);
2033 // can we just directly fall through?
2034 // a jmp + jz takes 5 + 6 = 11 bytes
2035 bool fallThru = toSmash + kJmpccLen + kJmpLen == as.frontier() &&
2036 !m_srcDB.find(dest);
2038 TCA tDest;
2039 tDest = getTranslation(TranslArgs(dest, !fallThru).src(toSmash));
2040 if (!tDest) {
2041 return 0;
2043 smashed = true;
2044 assert(s_writeLease.amOwner());
2046 * Roll over the jcc and the jmp/fallthru. E.g., from:
2048 * toSmash: jcc <jmpccFirstStub>
2049 * toSmash+6: jmp <jmpccFirstStub>
2050 * toSmash+11: <probably the new translation == tdest>
2052 * to:
2054 * toSmash: j[n]z <jmpccSecondStub>
2055 * toSmash+6: nop5
2056 * toSmash+11: newHotness
2058 CodeCursor cg(as, toSmash);
2059 as.jcc(cc, stub);
2060 getSrcRec(dest)->chainFrom(IncomingBranch::jmpFrom(as.frontier()));
2061 TRACE(5, "bindJmpccFirst: overwrote with cc%02x taken %d\n", cc, taken);
2062 return tDest;
2065 // smashes a jcc to point to a new destination
2067 TranslatorX64::bindJmpccSecond(TCA toSmash, const Offset off,
2068 ConditionCode cc, bool& smashed) {
2069 const Func* f = liveFunc();
2070 SrcKey dest(f, off);
2071 TCA branch = getTranslation(TranslArgs(dest, true).src(toSmash));
2072 LeaseHolder writer(s_writeLease, LeaseAcquire::NO_ACQUIRE);
2073 if (branch && writer.acquire()) {
2074 smashed = true;
2075 SrcRec* destRec = getSrcRec(dest);
2076 destRec->chainFrom(IncomingBranch::jccFrom(toSmash));
2078 return branch;
2081 static void emitJmpOrJcc(X64Assembler& a, ConditionCode cc, TCA addr) {
2082 if (cc == CC_None) {
2083 a. jmp(addr);
2084 } else {
2085 a. jcc((ConditionCode)cc, addr);
2090 * emitBindJ --
2092 * Emit code to lazily branch (optionally on condition cc) to the
2093 * srckey in next.
2094 * Assumes current basic block is closed (outputs synced, etc.).
2096 void
2097 TranslatorX64::emitBindJ(X64Assembler& _a, ConditionCode cc,
2098 SrcKey dest, ServiceRequest req) {
2099 prepareForSmash(_a, cc == CC_None ? (int)kJmpLen : kJmpccLen);
2100 TCA toSmash = _a.frontier();
2101 if (&_a == &astubs) {
2102 emitJmpOrJcc(_a, cc, toSmash);
2105 setJmpTransID(toSmash);
2107 TCA sr = emitServiceReq(SRFlags::None, req,
2108 toSmash, dest.offset());
2110 if (&_a == &astubs) {
2111 CodeCursor cursor(_a, toSmash);
2112 emitJmpOrJcc(_a, cc, sr);
2113 } else {
2114 emitJmpOrJcc(_a, cc, sr);
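/*
 * Shape of what emitBindJ leaves behind (informal sketch):
 *
 *   toSmash:  jmp/jcc <service-request stub>   ; kJmpLen or kJmpccLen bytes,
 *                                              ; padded by prepareForSmash
 *
 * The stub raises req (REQ_BIND_JMP and friends) with toSmash and the
 * destination offset as arguments; bindJmp() later obtains a translation for
 * dest, chains the branch onto the destination SrcRec, and smashes the
 * jmp/jcc to point straight at the translated code.
 */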
2118 void
2119 TranslatorX64::emitBindJcc(X64Assembler& _a, ConditionCode cc,
2120 SrcKey dest,
2121 ServiceRequest req /* = REQ_BIND_JCC */) {
2122 emitBindJ(_a, cc, dest, req);
2125 void
2126 TranslatorX64::emitBindJmp(X64Assembler& _a,
2127 SrcKey dest,
2128 ServiceRequest req /* = REQ_BIND_JMP */) {
2129 emitBindJ(_a, CC_None, dest, req);
2132 void
2133 TranslatorX64::emitBindJmp(SrcKey dest) {
2134 emitBindJmp(a, dest);
2137 void TranslatorX64::emitResolvedDeps(const ChangeMap& resolvedDeps) {
2138 for (const auto dep : resolvedDeps) {
2139 m_irTrans->assertType(dep.first, dep.second->rtt);
2143 void
2144 TranslatorX64::emitFallbackJmp(SrcRec& dest, ConditionCode cc /* = CC_NZ */) {
2145 emitFallbackJmp(a, dest, cc);
2148 void
2149 TranslatorX64::emitFallbackJmp(Asm& as, SrcRec& dest,
2150 ConditionCode cc /* = CC_NZ */) {
2151 prepareForSmash(as, kJmpccLen);
2152 dest.emitFallbackJump(as.frontier(), cc);
2155 void
2156 TranslatorX64::emitFallbackUncondJmp(Asm& as, SrcRec& dest) {
2157 prepareForSmash(as, kJmpLen);
2158 dest.emitFallbackJump(as.frontier());
2161 void
2162 TranslatorX64::emitFallbackCondJmp(Asm& as, SrcRec& dest, ConditionCode cc) {
2163 prepareForSmash(as, kJmpccLen);
2164 dest.emitFallbackJump(as.frontier(), cc);
2167 void TranslatorX64::emitReqRetransNoIR(Asm& as, const SrcKey& sk) {
2168 prepareForSmash(as, kJmpLen);
2169 TCA toSmash = as.frontier();
2170 if (&as == &astubs) {
2171 as.jmp(toSmash);
2174 TCA sr = emitServiceReq(REQ_RETRANSLATE_NO_IR,
2175 toSmash, sk.offset());
2177 if (&as == &astubs) {
2178 CodeCursor cc(as, toSmash);
2179 as.jmp(sr);
2180 } else {
2181 as.jmp(sr);
2185 void TranslatorX64::emitReqRetransOpt(Asm& as, const SrcKey& sk,
2186 TransID transId) {
2187 emitServiceReq(REQ_RETRANSLATE_OPT,
2188 sk.getFuncId(), sk.offset(), transId);
2191 void
2192 TranslatorX64::checkRefs(X64Assembler& a,
2193 SrcKey sk,
2194 const RefDeps& refDeps,
2195 SrcRec& fail) {
2196 if (refDeps.size() == 0) {
2197 return;
2200 // Set up guards for each pushed ActRec that we've made reffiness
2201 // assumptions about
2202 for (RefDeps::ArMap::const_iterator it = refDeps.m_arMap.begin();
2203 it != refDeps.m_arMap.end(); ++it) {
2204 // Be careful! The actual Func might have fewer refs than the number
2205 // of args we're passing. To forestall this, we always prepare at
2206 // least 64 bits in the Func, and always fill out the refBitVec
2207 // to a multiple of 64 bits
2209 int entryArDelta = it->first;
2211 m_irTrans->hhbcTrans().guardRefs(entryArDelta,
2212 it->second.m_mask,
2213 it->second.m_vals);
2218 * emitRetFromInterpretedFrame --
2220 * When the interpreter pushes a call frame, there is necessarily no
2221 * machine RIP available to return to. This helper fishes out the
2222 * destination from the frame and redirects execution to it via enterTC.
2225 TranslatorX64::emitRetFromInterpretedFrame() {
2226 int32_t arBase = sizeof(ActRec) - sizeof(Cell);
2227 moveToAlign(astubs);
2228 TCA stub = astubs.frontier();
2229 // Marshall our own args by hand here.
2230 astubs. lea (rVmSp[-arBase], serviceReqArgRegs[0]);
2231 astubs. movq (rVmFp, serviceReqArgRegs[1]);
2232 emitServiceReq(SRFlags::JmpInsteadOfRet, REQ_POST_INTERP_RET);
2233 return stub;
2237 * Same as above, except has different logic for fetching the AR we are trying
2238 * to return from, because generators have ARs in different places.
2241 TranslatorX64::emitRetFromInterpretedGeneratorFrame() {
2242 // We have to get the Continuation object from the current AR's $this, then
2243 // find where its embedded AR is.
2244 moveToAlign(astubs);
2245 TCA stub = astubs.frontier();
2247 PhysReg rContAR = serviceReqArgRegs[0];
2248 astubs. loadq (rVmFp[AROFF(m_this)], rContAR);
2249 astubs. loadq (rContAR[CONTOFF(m_arPtr)], rContAR);
2250 astubs. movq (rVmFp, serviceReqArgRegs[1]);
2251 emitServiceReq(SRFlags::JmpInsteadOfRet, REQ_POST_INTERP_RET);
2252 return stub;
2255 class FreeRequestStubTrigger : public Treadmill::WorkItem {
2256 TCA m_stub;
2257 public:
2258 explicit FreeRequestStubTrigger(TCA stub) : m_stub(stub) {
2259 TRACE(3, "FreeStubTrigger @ %p, stub %p\n", this, m_stub);
2261 virtual void operator()() {
2262 TRACE(3, "FreeStubTrigger: Firing @ %p , stub %p\n", this, m_stub);
2263 if (TranslatorX64::Get()->freeRequestStub(m_stub) != true) {
2264 // If we can't free the stub, enqueue again to retry.
2265 TRACE(3, "FreeStubTrigger: write lease failed, requeueing %p\n", m_stub);
2266 enqueue(new FreeRequestStubTrigger(m_stub));
2271 #ifdef DEBUG
2273 struct DepthGuard {
2274 static __thread int m_depth;
2275 DepthGuard() { m_depth++; TRACE(2, "DepthGuard: %d {\n", m_depth); }
2276 ~DepthGuard() { TRACE(2, "DepthGuard: %d }\n", m_depth); m_depth--; }
2278 bool depthOne() const { return m_depth == 1; }
2280 __thread int DepthGuard::m_depth;
2282 #else
2284 struct DepthGuard { bool depthOne() const { return false; } };
2286 #endif
2289 * enterTCHelper does not save callee-saved registers except %rbp. This means
2290 * when we call it from C++, we have to tell gcc to clobber all the other
2291 * callee-saved registers.
2293 #if defined(__x86_64__)
2294 # define CALLEE_SAVED_BARRIER() \
2295 asm volatile("" : : : "rbx", "r12", "r13", "r14", "r15")
2296 #elif defined(__AARCH64EL__)
2297 # define CALLEE_SAVED_BARRIER() \
2298 asm volatile("" : : : "x19", "x20", "x21", "x22", "x23", "x24", "x25", \
2299 "x26", "x27", "x28")
2300 #else
2301 # error What are the callee-saved registers on your system?
2302 #endif
2305 * enterTCHelper is a handwritten assembly function that transfers control in
2306 * and out of the TC.
2308 static_assert(rVmSp == rbx &&
2309 rVmFp == rbp &&
2310 rVmTl == r12 &&
2311 rStashedAR == r15,
2312 "__enterTCHelper needs to be modified to use the correct ABI");
2313 static_assert(kReservedRSPScratchSpace == 0x280,
2314 "enterTCHelper needs to be updated for changes to "
2315 "kReservedRSPScratchSpace");
2316 static_assert(REQ_BIND_CALL == 0x1,
2317 "Update assembly test for REQ_BIND_CALL in __enterTCHelper");
2318 extern "C" void enterTCHelper(Cell* vm_sp,
2319 Cell* vm_fp,
2320 TCA start,
2321 TReqInfo* infoPtr,
2322 ActRec* firstAR,
2323 void* targetCacheBase);
2326 struct TReqInfo {
2327 uintptr_t requestNum;
2328 uintptr_t args[5];
2330 // Some TC registers need to be preserved across service requests.
2331 uintptr_t saved_rStashedAr;
2333 // Stub addresses are passed back to allow us to recycle used stubs.
2334 TCA stubAddr;
2338 void
2339 TranslatorX64::enterTC(TCA start, void* data) {
2340 using namespace TargetCache;
2342 if (debug) {
2343 fflush(stdout);
2344 fflush(stderr);
2346 DepthGuard d;
2347 TReqInfo info;
2348 SrcKey sk;
2350 if (LIKELY(start != nullptr)) {
2351 info.requestNum = data ? REQ_BIND_CALL : -1;
2352 info.saved_rStashedAr = (uintptr_t)data;
2353 } else {
2354 info.requestNum = -1;
2355 info.saved_rStashedAr = 0;
2356 sk = *(SrcKey*)data;
2357 start = getTranslation(TranslArgs(sk, true));
2359 for (;;) {
2360 assert(sizeof(Cell) == 16);
2361 assert(((uintptr_t)vmsp() & (sizeof(Cell) - 1)) == 0);
2362 assert(((uintptr_t)vmfp() & (sizeof(Cell) - 1)) == 0);
2364 s_writeLease.gremlinUnlock();
2365 // Keep dispatching until we end up somewhere the translator
2366 // recognizes, or we luck out and the leaseholder exits.
2367 while (!start) {
2368 TRACE(2, "enterTC forwarding BB to interpreter\n");
2369 g_vmContext->m_pc = sk.unit()->at(sk.offset());
2370 INC_TPC(interp_bb);
2371 g_vmContext->dispatchBB();
2372 PC newPc = g_vmContext->getPC();
2373 if (!newPc) { g_vmContext->m_fp = 0; return; }
2374 sk = SrcKey(liveFunc(), newPc);
2375 start = getTranslation(TranslArgs(sk, true));
2377 assert(start == (TCA)HPHP::Transl::funcBodyHelperThunk ||
2378 isValidCodeAddress(start) ||
2379 (start == (TCA)HPHP::Transl::fcallHelperThunk &&
2380 info.saved_rStashedAr == (uintptr_t)data));
2381 assert(!s_writeLease.amOwner());
2382 const Func* func = (vmfp() ? (ActRec*)vmfp() : (ActRec*)data)->m_func;
2383 func->validate();
2384 INC_TPC(enter_tc);
2386 TRACE(1, "enterTC: %p fp%p(%s) sp%p enter {\n", start,
2387 vmfp(), func->name()->data(), vmsp());
2388 tl_regState = VMRegState::DIRTY;
2390 // We have to force C++ to spill anything that might be in a callee-saved
2391 // register (aside from rbp). enterTCHelper does not save them.
2392 CALLEE_SAVED_BARRIER();
2393 enterTCHelper(vmsp(), vmfp(), start, &info, vmFirstAR(),
2394 tl_targetCaches);
2395 CALLEE_SAVED_BARRIER();
2396 assert(g_vmContext->m_stack.isValidAddress((uintptr_t)vmsp()));
2398 tl_regState = VMRegState::CLEAN; // Careful: pc isn't sync'ed yet.
2399 TRACE(1, "enterTC: %p fp%p sp%p } return\n", start,
2400 vmfp(), vmsp());
2402 if (debug) {
2403 // Debugging code: cede the write lease half the time.
2404 if (RuntimeOption::EvalJitStressLease) {
2405 if (d.depthOne() == 1 && (rand() % 2) == 0) {
2406 s_writeLease.gremlinLock();
2409 // Ensure that each case either returns, or drives start to a valid
2410 // value.
2411 start = TCA(0xbee5face);
2414 TRACE(2, "enterTC: request(%s) args: %" PRIxPTR " %" PRIxPTR " %"
2415 PRIxPTR " %" PRIxPTR " %" PRIxPTR "\n",
2416 reqName(info.requestNum),
2417 info.args[0], info.args[1], info.args[2], info.args[3],
2418 info.args[4]);
2420 if (LIKELY(info.requestNum == REQ_EXIT)) {
2421 vmfp() = nullptr;
2422 return;
2424 if (!handleServiceRequest(info, start, sk)) return;
2429 * The contract is that each case will set sk to the place where
2430 * execution should resume, and optionally set start to the hardware
2431 * translation of the resumption point (or otherwise set it to null).
2432 * Returns false if we need to halt this nesting of the VM.
2434 * start and sk might be subtly different; i.e., there are cases where
2435 * start != NULL && start != getTranslation(sk). For instance,
2436 * REQ_BIND_CALL has not finished executing the OpCall when it gets
2437 * here, and has even done some work on its behalf. sk == OpFCall,
2438 * while start == the point in the TC that's "half-way through" the
2439 * Call instruction. If we punt to the interpreter, the interpreter
2440 * will redo some of the work that the translator has already done.
2442 bool TranslatorX64::handleServiceRequest(TReqInfo& info,
2443 TCA& start,
2444 SrcKey& sk) {
2445 const uintptr_t& requestNum = info.requestNum;
2446 auto* const args = info.args;
2447 assert(requestNum != REQ_EXIT);
2448 INC_TPC(service_req);
2450 bool smashed = false;
2451 switch (requestNum) {
2452 case REQ_BIND_CALL: {
2453 ReqBindCall* req = (ReqBindCall*)args[0];
2454 ActRec* calleeFrame = (ActRec*)args[1];
2455 TCA toSmash = req->m_toSmash;
2456 Func *func = const_cast<Func*>(calleeFrame->m_func);
2457 int nArgs = req->m_nArgs;
2458 bool isImmutable = req->m_isImmutable;
2459 TCA dest = tx64->funcPrologue(func, nArgs);
2460 TRACE(2, "enterTC: bindCall %s -> %p\n", func->name()->data(), dest);
2461 if (!isImmutable) {
2462 // We don't know we're calling the right function, so adjust
2463 // dest to point to the dynamic check of ar->m_func.
2464 dest = funcPrologueToGuard(dest, func);
2465 } else {
2466 TRACE(2, "enterTC: bindCall immutably %s -> %p\n",
2467 func->fullName()->data(), dest);
2469 LeaseHolder writer(s_writeLease, LeaseAcquire::NO_ACQUIRE);
2470 if (dest && writer.acquire()) {
2471 TRACE(2, "enterTC: bindCall smash %p -> %p\n", toSmash, dest);
2472 smashCall(tx64->getAsmFor(toSmash), toSmash, dest);
2473 smashed = true;
2474 // sk: stale, but doesn't matter since we have a valid dest TCA.
2475 } else {
2476 // We need translator help; we're not at the callee yet, so
2477 // roll back. The prelude has done some work already, but it
2478 // should be safe to redo.
2479 TRACE(2, "enterTC: bindCall rollback smash %p -> %p\n",
2480 toSmash, dest);
2481 sk = req->m_sourceInstr;
2483 start = dest;
2484 if (!start) {
2485 // EnterTCHelper pushes the return ip onto the stack when the
2486 // requestNum is REQ_BIND_CALL, but if start is NULL, it will
2487 // interpret in doFCall, so we clear out the requestNum in this
2488 // case to prevent enterTCHelper from pushing the return ip
2489 // onto the stack.
2490 info.requestNum = ~REQ_BIND_CALL;
2492 } break;
2494 case REQ_BIND_SIDE_EXIT:
2495 case REQ_BIND_JMP:
2496 case REQ_BIND_JCC:
2497 case REQ_BIND_JMP_NO_IR:
2498 case REQ_BIND_ADDR:
2500 TCA toSmash = (TCA)args[0];
2501 Offset off = args[1];
2502 sk = SrcKey(liveFunc(), off);
2503 if (requestNum == REQ_BIND_SIDE_EXIT) {
2504 SKTRACE(3, sk, "side exit taken!\n");
2506 start = bindJmp(toSmash, sk, (ServiceRequest)requestNum, smashed);
2507 } break;
2509 case REQ_BIND_JMPCC_FIRST: {
2510 TCA toSmash = (TCA)args[0];
2511 Offset offTaken = (Offset)args[1];
2512 Offset offNotTaken = (Offset)args[2];
2513 ConditionCode cc = ConditionCode(args[3]);
2514 bool taken = int64_t(args[4]) & 1;
2515 start = bindJmpccFirst(toSmash, offTaken, offNotTaken,
2516 taken, cc, smashed);
2517 // SrcKey: we basically need to emulate the fail
2518 sk = SrcKey(liveFunc(), taken ? offTaken : offNotTaken);
2519 } break;
2521 case REQ_BIND_JMPCC_SECOND: {
2522 TCA toSmash = (TCA)args[0];
2523 Offset off = (Offset)args[1];
2524 ConditionCode cc = ConditionCode(args[2]);
2525 start = bindJmpccSecond(toSmash, off, cc, smashed);
2526 sk = SrcKey(liveFunc(), off);
2527 } break;
2529 case REQ_RETRANSLATE_NO_IR: {
2530 TCA toSmash = (TCA)args[0];
2531 sk = SrcKey(liveFunc(), (Offset)args[1]);
2532 start = retranslateAndPatchNoIR(sk, true, toSmash);
2533 SKTRACE(1, sk, "retranslated (without IR) @%p\n", start);
2534 } break;
2536 case REQ_RETRANSLATE_OPT: {
2537 FuncId funcId = (FuncId) args[0];
2538 Offset offset = (Offset) args[1];
2539 TransID transId = (TransID)args[2];
2540 sk = SrcKey(funcId, offset);
2541 start = retranslateOpt(transId, false);
2542 SKTRACE(2, sk, "retranslated-OPT: transId = %d start: @%p\n", transId,
2543 start);
2544 break;
2547 case REQ_RETRANSLATE: {
2548 INC_TPC(retranslate);
2549 sk = SrcKey(liveFunc(), (Offset)args[0]);
2550 start = retranslate(TranslArgs(sk, true));
2551 SKTRACE(2, sk, "retranslated @%p\n", start);
2552 } break;
2554 case REQ_INTERPRET: {
2555 Offset off = args[0];
2556 int numInstrs = args[1];
2557 g_vmContext->m_pc = liveUnit()->at(off);
2559 * We know the compilation unit has not changed; basic blocks do
2560 * not span files. I claim even exceptions do not violate this
2561 * axiom.
2563 assert(numInstrs >= 0);
2564 SKTRACE(5, SrcKey(liveFunc(), off), "interp: enter\n");
2565 if (numInstrs) {
2566 s_perfCounters[tpc_interp_instr] += numInstrs;
2567 g_vmContext->dispatchN(numInstrs);
2568 } else {
2569 // numInstrs == 0 means it wants to dispatch until BB ends
2570 INC_TPC(interp_bb);
2571 g_vmContext->dispatchBB();
2573 PC newPc = g_vmContext->getPC();
2574 if (!newPc) { g_vmContext->m_fp = 0; return false; }
2575 SrcKey newSk(liveFunc(), newPc);
2576 SKTRACE(5, newSk, "interp: exit\n");
2577 sk = newSk;
2578 start = getTranslation(TranslArgs(newSk, true));
2579 } break;
2581 case REQ_POST_INTERP_RET: {
2582 // This is only responsible for the control-flow aspect of the Ret:
2583 // getting to the destination's translation, if any.
2584 ActRec* ar = (ActRec*)args[0];
2585 ActRec* caller = (ActRec*)args[1];
2586 assert((Cell*) caller == vmfp());
2587 Unit* destUnit = caller->m_func->unit();
2588 // Set PC so logging code in getTranslation doesn't get confused.
2589 vmpc() = destUnit->at(caller->m_func->base() + ar->m_soff);
2590 SrcKey dest(caller->m_func, vmpc());
2591 sk = dest;
2592 start = getTranslation(TranslArgs(dest, true));
2593 TRACE(3, "REQ_POST_INTERP_RET: from %s to %s\n",
2594 ar->m_func->fullName()->data(),
2595 caller->m_func->fullName()->data());
2596 } break;
2598 case REQ_RESUME: {
2599 if (UNLIKELY(vmpc() == 0)) {
2600 g_vmContext->m_fp = 0;
2601 return false;
2603 SrcKey dest(liveFunc(), vmpc());
2604 sk = dest;
2605 start = getTranslation(TranslArgs(dest, true));
2606 } break;
2608 case REQ_STACK_OVERFLOW: {
2610 * we need to construct the pc of the fcall from the return
2611 * address (which will be after the fcall). Because fcall is
2612 * a variable length instruction, and because we sometimes
2613 * delete instructions from the instruction stream, we
2614 * need to use fpi regions to find the fcall.
2616 const FPIEnt* fe = liveFunc()->findPrecedingFPI(
2617 liveUnit()->offsetOf(vmpc()));
2618 vmpc() = liveUnit()->at(fe->m_fcallOff);
2619 assert(isFCallStar(toOp(*vmpc())));
2620 raise_error("Stack overflow");
2621 NOT_REACHED();
2625 if (smashed && info.stubAddr) {
2626 Treadmill::WorkItem::enqueue(new FreeRequestStubTrigger(info.stubAddr));
2629 return true;
2633 * Support for the stub freelist.
2635 TCA FreeStubList::maybePop() {
2636 StubNode* ret = m_list;
2637 if (ret) {
2638 TRACE(1, "alloc stub %p\n", ret);
2639 m_list = ret->m_next;
2640 ret->m_freed = ~kStubFree;
2642 return (TCA)ret;
2645 void FreeStubList::push(TCA stub) {
2647 * A freed stub may be released by Treadmill more than once if multiple
2648 * threads execute the service request before it is freed. We detect
2649 * duplicates by marking freed stubs
2651 StubNode* n = (StubNode *)stub;
2652 if (n->m_freed == kStubFree) {
2653 TRACE(1, "already freed stub %p\n", stub);
2654 return;
2656 n->m_freed = kStubFree;
2657 n->m_next = m_list;
2658 TRACE(1, "free stub %p (-> %p)\n", stub, m_list);
2659 m_list = n;
2662 bool
2663 TranslatorX64::freeRequestStub(TCA stub) {
2664 LeaseHolder writer(s_writeLease);
2666 * If we can't acquire the write lock, the caller
2667 * (FreeRequestStubTrigger) retries
2669 if (!writer) return false;
2670 assert(astubs.contains(stub));
2671 m_freeStubs.push(stub);
2672 return true;
2675 TCA TranslatorX64::getFreeStub() {
2676 TCA ret = m_freeStubs.maybePop();
2677 if (ret) {
2678 Stats::inc(Stats::Astubs_Reused);
2679 assert(m_freeStubs.m_list == nullptr ||
2680 astubs.contains(TCA(m_freeStubs.m_list)));
2681 TRACE(1, "recycle stub %p\n", ret);
2682 } else {
2683 ret = astubs.frontier();
2684 Stats::inc(Stats::Astubs_New);
2685 TRACE(1, "alloc new stub %p\n", ret);
2687 return ret;
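/*
 * Putting the pieces above together, the life cycle of a recyclable
 * service-request stub looks roughly like this (informal summary):
 *
 *   getFreeStub()           -> reuse a node from m_freeStubs, or hand out a
 *                              fresh kMaxStubSpace-sized region of astubs
 *   <stub runs, request handled, the branch that used it gets smashed>
 *   FreeRequestStubTrigger  -> enqueued on the Treadmill by enterTC after the
 *                              smash; the treadmill delays firing until
 *                              in-flight requests have drained
 *   freeRequestStub()       -> under the write lease, push the stub back onto
 *                              m_freeStubs for the next getFreeStub()
 *
 * The kStubFree marker in FreeStubList::push() is what makes a duplicate
 * free of the same stub harmless.
 */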
2691 * emitServiceReqWork --
2693 * Call a translator service co-routine. The code emitted here
2694 * reenters the enterTC loop, invoking the requested service. Control
2695 * will be returned non-locally to the next logical instruction in
2696 * the TC.
2698 * Return value is a destination; we emit the bulky service
2699 * request code into astubs.
2701 * Returns a continuation that will run after the arguments have been
2702 * emitted. This is gross, but is a partial workaround for the inability
2703 * to capture argument packs in the version of gcc we're using.
2706 TranslatorX64::emitServiceReqWork(SRFlags flags, ServiceRequest req,
2707 const TranslatorX64::ServiceReqArgVec& argv) {
2709 * Some requests can be recycled after they've fired once. Since this is
2710 * a special situation, we enumerate them here rather than forcing every
2711 * call site to choose.
2713 auto requestIsEphemeral = [](ServiceRequest req) {
2714 switch (req) {
2715 case REQ_BIND_JMPCC_SECOND:
2716 case REQ_BIND_JMPCC_FIRST:
2717 case REQ_BIND_JMP:
2718 return true;
2719 default:
2720 return false;
2723 static const std::unordered_set<ServiceRequest> ephemeralReqs {
2724 REQ_BIND_JMPCC_SECOND,
2725 REQ_BIND_JMPCC_FIRST,
2726 REQ_BIND_JMP
2729 const bool emitInA = flags & SRFlags::EmitInA;
2730 const bool align = (flags & SRFlags::Align) && !emitInA;
2731 const bool persist = !requestIsEphemeral(req);
2732 Asm& as = emitInA ? a : astubs;
2733 TCA start = emitInA ? a.frontier() :
2734 persist ? astubs.frontier() :
2735 getFreeStub();
2737 * Remember previous state of the code cache.
2739 boost::optional<CodeCursor> maybeCc = boost::none;
2740 if (start != as.frontier()) {
2741 maybeCc = boost::in_place<CodeCursor>(boost::ref(as), start);
2744 /* max space for moving to align, saving VM regs plus emitting args */
2745 static const int
2746 kVMRegSpace = 0x14,
2747 kMovSize = 0xa,
2748 kNumServiceRegs = sizeof(serviceReqArgRegs) / sizeof(PhysReg),
2749 kMaxStubSpace = kJmpTargetAlign - 1 + kVMRegSpace +
2750 kNumServiceRegs * kMovSize;
2751 if (align) {
2752 moveToAlign(as);
2754 TCA retval = as.frontier();
2755 TRACE(3, "Emit Service Req @%p %s(", start, reqName(req));
2757 * Move args into appropriate regs. Eager VMReg save may bash flags,
2758 * so set the CondCode arguments first.
2760 for (int i = 0; i < argv.size(); ++i) {
2761 assert(i < kNumServiceReqArgRegs);
2762 auto reg = serviceReqArgRegs[i];
2763 const auto& argInfo = argv[i];
2764 switch(argv[i].m_kind) {
2765 case ServiceReqArgInfo::Immediate: {
2766 TRACE(3, "%" PRIx64 ", ", argInfo.m_imm);
2767 as. emitImmReg(argInfo.m_imm, reg);
2768 } break;
2769 case ServiceReqArgInfo::CondCode: {
2770 // Already set before VM reg save.
2771 DEBUG_ONLY TCA start = as.frontier();
2772 as. setcc(argInfo.m_cc, rbyte(reg));
2773 assert(start - as.frontier() <= kMovSize);
2774 TRACE(3, "cc(%x), ", argInfo.m_cc);
2775 } break;
2776 default: not_reached();
2779 emitEagerVMRegSave(as, SaveFP);
2780 if (persist) {
2781 as. emitImmReg(0, rAsm);
2782 } else {
2783 as. emitImmReg((uint64_t)start, rAsm);
2785 TRACE(3, ")\n");
2786 as. emitImmReg(req, rdi);
2789 * Weird hand-shaking with enterTC: reverse-call a service routine.
2791 * In the case of some special stubs (m_callToExit, m_retHelper), we
2792 * have already unbalanced the return stack by doing a ret to
2793 * something other than enterTCHelper. In that case
2794 * SRJmpInsteadOfRet indicates to fake the return.
2796 if (flags & SRFlags::JmpInsteadOfRet) {
2797 as. pop(rax);
2798 as. jmp(rax);
2799 } else {
2800 as. ret();
2802 recordBCInstr(OpServiceRequest, as, retval);
2803 translator_not_reached(as);
2804 if (!persist) {
2806 * Recycled stubs need to be uniformly sized. Make space for the
2807 * largest possible service request.
2809 assert(as.frontier() - start <= kMaxStubSpace);
2810 as.emitNop(start + kMaxStubSpace - as.frontier());
2811 assert(as.frontier() - start == kMaxStubSpace);
2813 return retval;
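/*
 * For reference, a stub produced by emitServiceReqWork ends up looking
 * roughly like this (illustrative; the exact encodings and padding vary):
 *
 *   <optional alignment nops>
 *   serviceReqArgRegs[i] <- arg i        ; immediates, or setcc for CondCode
 *   <eager VM reg save (SaveFP)>
 *   rAsm <- stub start, or 0 if the stub is persistent (not recyclable)
 *   rdi  <- req
 *   ret                                  ; or pop+jmp for JmpInsteadOfRet
 *   <nop padding up to kMaxStubSpace for recyclable stubs>
 *
 * The ret lands back in enterTCHelper, which packages the request number and
 * argument registers into the TReqInfo that enterTC/handleServiceRequest read.
 */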
2817 TranslatorX64::emitTransCounterInc(X64Assembler& a) {
2818 TCA start = a.frontier();
2819 if (!isTransDBEnabled()) return start;
2821 a. movq (getTransCounterAddr(), rAsm);
2822 a. lock ();
2823 a. incq (*rAsm);
2825 return start;
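// Descriptive note: the lock-prefixed incq emitted above is an atomic
// increment of the 64-bit counter at getTransCounterAddr(), so concurrent
// threads executing the same translation cannot lose counts.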
2828 void
2829 TranslatorX64::getInputsIntoXMMRegs(const NormalizedInstruction& ni,
2830 PhysReg lr, PhysReg rr,
2831 RegXMM lxmm,
2832 RegXMM rxmm) {
2833 const DynLocation& l = *ni.inputs[0];
2834 const DynLocation& r = *ni.inputs[1];
2835 // Get the values into their appropriate xmm locations
2836 auto intoXmm = [&](const DynLocation& l, PhysReg src, RegXMM xmm) {
2837 if (l.isInt()) {
2838 // cvtsi2sd doesn't modify the high bits of its target, which can
2839 // cause false dependencies to prevent register renaming from kicking
2840 // in. Break the dependency chain by zeroing out the destination reg.
2841 a. pxor_xmm_xmm(xmm, xmm);
2842 a. cvtsi2sd_reg64_xmm(src, xmm);
2843 } else {
2844 a. mov_reg64_xmm(src, xmm);
2847 intoXmm(l, lr, lxmm);
2848 intoXmm(r, rr, rxmm);
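/*
 * The pxor-before-cvtsi2sd idiom above is the usual way to break the false
 * dependency cvtsi2sd has on the old contents of its destination register.
 * In intrinsics form the same pattern would read roughly as follows
 * (hypothetical illustration only, not code used by this file):
 *
 *   int64_t i = ...;                 // integer input
 *   __m128d x = _mm_setzero_pd();    // pxor xmm, xmm: kill the old value
 *   x = _mm_cvtsi64_sd(x, i);        // cvtsi2sd: int64 -> double in low lane
 *
 * Without the zeroing, out-of-order execution must wait for whatever last
 * wrote the xmm register even though its high lane is never read here.
 */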
2851 #define O(opcode, imm, push, pop, flags) \
2853 * Each interpOne##opcode method saves m_pc, m_fp, and m_sp in the
2854 * ExecutionContext, calls into the interpreter, and then returns a
2855 * pointer to the current ExecutionContext.
2856 */ \
2857 VMExecutionContext* \
2858 interpOne##opcode(ActRec* ar, Cell* sp, Offset pcOff) { \
2859 interp_set_regs(ar, sp, pcOff); \
2860 SKTRACE(5, SrcKey(liveFunc(), vmpc()), "%40s %p %p\n", \
2861 "interpOne" #opcode " before (fp,sp)", \
2862 vmfp(), vmsp()); \
2863 assert(toOp(*vmpc()) == Op::opcode); \
2864 VMExecutionContext* ec = g_vmContext; \
2865 Stats::inc(Stats::Instr_InterpOne ## opcode); \
2866 if (Trace::moduleEnabled(Trace::interpOne, 1)) { \
2867 static const StringData* cat = StringData::GetStaticString("interpOne"); \
2868 static const StringData* name = StringData::GetStaticString(#opcode); \
2869 Stats::incStatGrouped(cat, name, 1); \
2871 INC_TPC(interp_one) \
2872 /* Correct for over-counting in TC-stats. */ \
2873 Stats::inc(Stats::Instr_TC, -1); \
2874 ec->op##opcode(); \
2876 * Only set regstate back to dirty if an exception is not
2877 * propagating. If an exception is throwing, regstate for this call
2878 * is actually still correct, and we don't have information in the
2879 * fixup map for interpOne calls anyway.
2880 */ \
2881 tl_regState = VMRegState::DIRTY; \
2882 return ec; \
2885 OPCODES
2886 #undef O
2888 void* interpOneEntryPoints[] = {
2889 #define O(opcode, imm, push, pop, flags) \
2890 (void*)(interpOne ## opcode),
2891 OPCODES
2892 #undef O
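/*
 * interpOneEntryPoints is laid out in bytecode-opcode order, so the intended
 * use is an indexed indirect call; a minimal sketch (assuming the Op
 * enumerators are dense and zero-based, as the table construction implies):
 *
 *   typedef VMExecutionContext* (*InterpOneFn)(ActRec*, Cell*, Offset);
 *   auto f = reinterpret_cast<InterpOneFn>(interpOneEntryPoints[size_t(op)]);
 *   f(ar, sp, pcOff);
 */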
2895 void TranslatorX64::fixupWork(VMExecutionContext* ec,
2896 ActRec* rbp) const {
2897 assert(RuntimeOption::EvalJit);
2899 TRACE_SET_MOD(fixup);
2900 TRACE(1, "fixup(begin):\n");
2902 auto isVMFrame = [] (ActRec* ar) {
2903 assert(ar);
2904 bool ret = uintptr_t(ar) - Util::s_stackLimit >= Util::s_stackSize;
2905 assert(!ret ||
2906 (ar >= g_vmContext->m_stack.getStackLowAddress() &&
2907 ar < g_vmContext->m_stack.getStackHighAddress()) ||
2908 ar->m_func->isGenerator());
2909 return ret;
2912 auto* nextRbp = rbp;
2913 rbp = 0;
2914 do {
2915 auto* prevRbp = rbp;
2916 rbp = nextRbp;
2917 assert(rbp && "Missing fixup for native call");
2918 nextRbp = reinterpret_cast<ActRec*>(rbp->m_savedRbp);
2919 TRACE(2, "considering frame %p, %p\n", rbp, (void*)rbp->m_savedRip);
2921 if (isVMFrame(nextRbp)) {
2922 TRACE(2, "fixup checking vm frame %s\n",
2923 nextRbp->m_func->name()->data());
2924 FixupMap::VMRegs regs;
2925 if (m_fixupMap.getFrameRegs(rbp, prevRbp, &regs)) {
2926 TRACE(2, "fixup(end): func %s fp %p sp %p pc %p\n",
2927 regs.m_fp->m_func->name()->data(),
2928 regs.m_fp, regs.m_sp, regs.m_pc);
2929 ec->m_fp = const_cast<ActRec*>(regs.m_fp);
2930 ec->m_pc = regs.m_pc;
2931 vmsp() = regs.m_sp;
2932 return;
2935 } while (rbp && rbp != nextRbp);
2937 // OK, we've exhausted the entire actRec chain. We are only
2938 // invoking ::fixup() from contexts that were known to be called out
2939 // of the TC, so this cannot happen.
2940 NOT_REACHED();
2943 void TranslatorX64::fixup(VMExecutionContext* ec) const {
2944 // Start looking for fixup entries at the current (C++) frame. This
2945 // will walk the frames upward until we find a TC frame.
2946 DECLARE_FRAME_POINTER(framePtr);
2947 fixupWork(ec, framePtr);
2950 TCA TranslatorX64::getTranslatedCaller() const {
2951 DECLARE_FRAME_POINTER(fp);
2952 ActRec* framePtr = fp; // can't directly mutate the register-mapped one
2953 for (; framePtr; framePtr = (ActRec*)framePtr->m_savedRbp) {
2954 TCA rip = (TCA)framePtr->m_savedRip;
2955 if (isValidCodeAddress(rip)) {
2956 return rip;
2959 return nullptr;
2962 void
2963 TranslatorX64::syncWork() {
2964 assert(tl_regState == VMRegState::DIRTY);
2965 fixup(g_vmContext);
2966 tl_regState = VMRegState::CLEAN;
2967 Stats::inc(Stats::TC_Sync);
2970 // could be static but used in hopt/codegen.cpp
2971 void raiseUndefVariable(StringData* nm) {
2972 raise_notice(Strings::UNDEFINED_VARIABLE, nm->data());
2973 // FIXME: do we need to decref the string if an exception is propagating?
2974 decRefStr(nm);
2977 // This intentionally excludes Int/Int, which is handled separately
2978 // from cases involving the FPU.
2979 bool
2980 mathEquivTypes(RuntimeType lt, RuntimeType rt) {
2981 return (lt.isDouble() && rt.isDouble()) ||
2982 (lt.isInt() && rt.isDouble()) ||
2983 (lt.isDouble() && rt.isInt());
2986 /* This is somewhat hacky. It decides which helpers/builtins should
2987 * use eager vmreganchor based on profile information. Using eager
2988 * vmreganchor for all helper calls is a perf regression. */
2989 bool TranslatorX64::eagerRecord(const Func* func) {
2990 const char* list[] = {
2991 "func_get_args",
2992 "get_called_class",
2993 "func_num_args",
2994 "array_filter",
2995 "array_map",
2998 for (int i = 0; i < sizeof(list)/sizeof(list[0]); i++) {
2999 if (!strcmp(func->name()->data(), list[i])) {
3000 return true;
3003 if (func->cls() && !strcmp(func->cls()->name()->data(), "WaitHandle")
3004 && !strcmp(func->name()->data(), "join")) {
3005 return true;
3007 return false;
3010 ObjectData*
3011 HOT_FUNC_VM
3012 newInstanceHelper(Class* cls, int numArgs, ActRec* ar, ActRec* prevAr) {
3013 const Func* f = cls->getCtor();
3014 ObjectData* ret = nullptr;
3015 if (UNLIKELY(!(f->attrs() & AttrPublic))) {
3016 VMRegAnchor _;
3017 UNUSED MethodLookup::LookupResult res =
3018 g_vmContext->lookupCtorMethod(f, cls, true /*raise*/);
3019 assert(res == MethodLookup::LookupResult::MethodFoundWithThis);
3021 // Don't start pushing the AR until newInstance returns; it may reenter.
3022 ret = newInstance(cls);
3023 f->validate();
3024 ar->m_func = f;
3025 ar->initNumArgs(numArgs, true /*fromCtor*/);
3026 // Count stack and this.
3027 ret->incRefCount();
3028 ret->incRefCount();
3029 ar->setThis(ret);
3030 ar->setVarEnv(nullptr);
3031 arSetSfp(ar, prevAr);
3032 TRACE(2, "newInstanceHelper: AR %p: f %p, savedRbp %#" PRIx64
3033 ", savedRip %#" PRIx64 ", this %p\n",
3034 ar, ar->m_func, ar->m_savedRbp, ar->m_savedRip, ar->m_this);
3035 return ret;
3039 TranslatorX64::emitNativeTrampoline(TCA helperAddr) {
3040 auto& a = atrampolines;
3042 if (!a.canEmit(m_trampolineSize)) {
3043 // not enough space to emit a trampoline, so just return the
3044 // helper address and emitCall will emit the right sequence
3045 // to call it indirectly.
3046 TRACE(1, "Ran out of space to emit a trampoline for %p\n", helperAddr);
3047 always_assert(false);
3048 return helperAddr;
3050 uint32_t index = m_numNativeTrampolines++;
3051 TCA trampAddr = a.frontier();
3052 if (Stats::enabled()) {
3053 Stats::emitInc(a, &Stats::tl_helper_counters[0], index);
3054 char* name = Util::getNativeFunctionName(helperAddr);
3055 const size_t limit = 50;
3056 if (strlen(name) > limit) {
3057 name[limit] = '\0';
3059 Stats::helperNames[index] = name;
3063 * For stubs that take arguments in rAsm, we need to make sure
3064 * we're not damaging its contents here. (If !jmpDeltaFits, the jmp
3065 * opcode will need to movabs the address into rAsm before
3066 * jumping.)
3068 auto DEBUG_ONLY stubUsingRScratch = [&](TCA tca) {
3069 return tca == m_dtorGenericStubRegs;
3072 assert(IMPLIES(stubUsingRScratch(helperAddr), a.jmpDeltaFits(helperAddr)));
3073 a. jmp (helperAddr);
3074 a. ud2 ();
3076 trampolineMap[helperAddr] = trampAddr;
3077 if (m_trampolineSize == 0) {
3078 m_trampolineSize = a.frontier() - trampAddr;
3079 assert(m_trampolineSize >= kMinPerTrampolineSize);
3081 recordBCInstr(OpNativeTrampoline, a, trampAddr);
3082 return trampAddr;
3086 TranslatorX64::getNativeTrampoline(TCA helperAddr) {
3087 if (!RuntimeOption::EvalJitTrampolines && !Stats::enabled()) {
3088 return helperAddr;
3090 TCA trampAddr = (TCA)mapGet<PointerMap>(trampolineMap, helperAddr);
3091 if (trampAddr) {
3092 return trampAddr;
3094 return emitNativeTrampoline(helperAddr);
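/*
 * Informal note on the trampolines above: atrampolines sits adjacent to the
 * translation cache, so a trampoline (one near jmp to the real helper,
 * followed by ud2) gives emitCall a target that is reachable with a rel32
 * call even when the helper itself lives far from the TC. When Stats are
 * enabled, each trampoline also bumps a per-helper counter on every call.
 */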
3097 static void defClsHelper(PreClass *preClass) {
3098 assert(tl_regState == VMRegState::DIRTY);
3099 tl_regState = VMRegState::CLEAN;
3100 Unit::defClass(preClass);
3103 * m_defClsHelper sync'd the registers for us already. This means
3104 * if an exception propagates we want to leave things as
3105 * VMRegState::CLEAN, since we're still in sync. Only set it to dirty
3106 * if we are actually returning to run in the TC again.
3108 tl_regState = VMRegState::DIRTY;
3111 template <typename T>
3112 static int64_t switchBoundsCheck(T v, int64_t base, int64_t nTargets) {
3113 // I'm relying on gcc to be smart enough to optimize away the next
3114 // two lines when T is int64.
3115 if (int64_t(v) == v) {
3116 int64_t ival = v;
3117 if (ival >= base && ival < (base + nTargets)) {
3118 return ival - base;
3121 return nTargets + 1;
3124 int64_t switchDoubleHelper(int64_t val, int64_t base, int64_t nTargets) {
3125 union {
3126 int64_t intbits;
3127 double dblval;
3128 } u;
3129 u.intbits = val;
3130 return switchBoundsCheck(u.dblval, base, nTargets);
3133 int64_t switchStringHelper(StringData* s, int64_t base, int64_t nTargets) {
3134 int64_t ival;
3135 double dval;
3136 switch (s->isNumericWithVal(ival, dval, 1)) {
3137 case KindOfNull:
3138 ival = switchBoundsCheck(0, base, nTargets);
3139 break;
3141 case KindOfDouble:
3142 ival = switchBoundsCheck(dval, base, nTargets);
3143 break;
3145 case KindOfInt64:
3146 ival = switchBoundsCheck(ival, base, nTargets);
3147 break;
3149 default:
3150 not_reached();
3152 decRefStr(s);
3153 return ival;
3156 int64_t switchObjHelper(ObjectData* o, int64_t base, int64_t nTargets) {
3157 int64_t ival = o->o_toInt64();
3158 decRefObj(o);
3159 return switchBoundsCheck(ival, base, nTargets);
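/*
 * Worked examples for the switch helpers above, taking base = 0 and
 * nTargets = 10 (informal, just tracing the code):
 *
 *   switchDoubleHelper(bits of 3.0, 0, 10) -> 3    3.0 survives the int64_t
 *                                                  round-trip, and 3 is in range
 *   switchDoubleHelper(bits of 3.5, 0, 10) -> 11   nTargets + 1, the
 *                                                  out-of-range result
 *   switchStringHelper("7", 0, 10)         -> 7    numeric string
 *   switchStringHelper("foo", 0, 10)       -> 0    non-numeric strings are
 *                                                  bounds-checked as 0
 *   switchObjHelper(obj, 0, 10)            -> bounds check on obj->o_toInt64()
 */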
3162 bool
3163 TranslatorX64::reachedTranslationLimit(SrcKey sk,
3164 const SrcRec& srcRec) const {
3165 if (srcRec.translations().size() == RuntimeOption::EvalJitMaxTranslations) {
3166 INC_TPC(max_trans);
3167 if (debug && Trace::moduleEnabled(Trace::tx64, 2)) {
3168 const vector<TCA>& tns = srcRec.translations();
3169 TRACE(1, "Too many (%zd) translations: %s, BC offset %d\n",
3170 tns.size(), sk.unit()->filepath()->data(),
3171 sk.offset());
3172 SKTRACE(2, sk, "{\n");
3173 TCA topTrans = srcRec.getTopTranslation();
3174 for (size_t i = 0; i < tns.size(); ++i) {
3175 const TransRec* rec = getTransRec(tns[i]);
3176 assert(rec);
3177 SKTRACE(2, sk, "%zd %p\n", i, tns[i]);
3178 if (tns[i] == topTrans) {
3179 SKTRACE(2, sk, "%zd: *Top*\n", i);
3181 if (rec->kind == TransAnchor) {
3182 SKTRACE(2, sk, "%zd: Anchor\n", i);
3183 } else {
3184 SKTRACE(2, sk, "%zd: guards {\n", i);
3185 for (unsigned j = 0; j < rec->dependencies.size(); ++j) {
3186 TRACE(2, rec->dependencies[j]);
3188 SKTRACE(2, sk, "%zd } guards\n", i);
3191 SKTRACE(2, sk, "} /* Too many translations */\n");
3193 return true;
3196 return false;
3199 void
3200 TranslatorX64::emitGuardChecks(X64Assembler& a,
3201 SrcKey sk,
3202 const ChangeMap& dependencies,
3203 const RefDeps& refDeps,
3204 SrcRec& fail) {
3205 if (Trace::moduleEnabled(Trace::stats, 2)) {
3206 Stats::emitInc(a, Stats::TraceletGuard_enter);
3209 emitRB(a, RBTypeTraceletGuards, sk);
3210 for (auto const& dep : dependencies) {
3211 m_irTrans->checkType(dep.first, dep.second->rtt);
3214 checkRefs(a, sk, refDeps, fail);
3216 if (Trace::moduleEnabled(Trace::stats, 2)) {
3217 Stats::emitInc(a, Stats::TraceletGuard_execute);
3222 void dumpTranslationInfo(const Tracelet& t, TCA postGuards) {
3223 if (!debug) return;
3225 SrcKey sk = t.m_sk;
3226 DEBUG_ONLY auto unit = sk.unit();
3228 TRACE(3, "----------------------------------------------\n");
3229 TRACE(3, " Translating from file %s:%d %s at %p:\n",
3230 unit->filepath()->data(),
3231 unit->getLineNumber(sk.offset()),
3232 sk.func()->name()->data(),
3233 postGuards);
3234 TRACE(3, " preconds:\n");
3235 TRACE(3, " types:\n");
3236 for (DepMap::const_iterator i = t.m_dependencies.begin();
3237 i != t.m_dependencies.end(); ++i) {
3238 TRACE(3, " %-5s\n", i->second->pretty().c_str());
3240 if (t.m_refDeps.size() != 0) {
3241 TRACE(3, " refs:\n");
3242 for (RefDeps::ArMap::const_iterator i = t.m_refDeps.m_arMap.begin();
3243 i != t.m_refDeps.m_arMap.end();
3244 ++i) {
3245 TRACE(3, " (ActRec %" PRId64 " : %-5s)\n", i->first,
3246 i->second.pretty().c_str());
3249 TRACE(3, " postconds:\n");
3250 for (ChangeMap::const_iterator i = t.m_changes.begin();
3251 i != t.m_changes.end(); ++i) {
3252 TRACE(3, " %-5s\n", i->second->pretty().c_str());
3254 for (auto ni = t.m_instrStream.first; ni; ni = ni->next) {
3255 TRACE(3, " %6d: %s\n", ni->source.offset(),
3256 instrToString((Op*)ni->pc()).c_str());
3257 if (ni->breaksTracelet) break;
3259 TRACE(3, "----------------------------------------------\n");
3260 if (Trace::moduleEnabled(Trace::tx64, 5)) {
3261 // prettyStack() expects to use vmpc(). Leave it in the state we
3262 // found it since this code is debug-only, and we don't want behavior
3263 // to vary across the optimized/debug builds.
3264 PC oldPC = vmpc();
3265 vmpc() = unit->at(sk.offset());
3266 TRACE(3, g_vmContext->prettyStack(string(" tx64 ")));
3267 vmpc() = oldPC;
3268 TRACE(3, "----------------------------------------------\n");
3272 void
3273 TranslatorX64::translateWork(const TranslArgs& args) {
3274 auto sk = args.m_sk;
3275 std::unique_ptr<Tracelet> tp = analyze(sk);
3276 Tracelet& t = *tp;
3278 SKTRACE(1, sk, "translateWork\n");
3279 assert(m_srcDB.find(sk));
3281 TCA start = a.frontier();
3282 TCA stubStart = astubs.frontier();
3283 TCA counterStart = 0;
3284 uint8_t counterLen = 0;
3285 SrcRec& srcRec = *getSrcRec(sk);
3286 TransKind transKind = TransInterp;
3287 UndoMarker undoA(a);
3288 UndoMarker undoAstubs(astubs);
3290 auto resetState = [&] {
3291 undoA.undo();
3292 undoAstubs.undo();
3293 m_pendingFixups.clear();
3294 m_bcMap.clear();
3295 srcRec.clearInProgressTailJumps();
3298 auto assertCleanState = [&] {
3299 assert(a.frontier() == start);
3300 assert(astubs.frontier() == stubStart);
3301 assert(m_pendingFixups.empty());
3302 assert(m_bcMap.empty());
3303 assert(srcRec.inProgressTailJumps().empty());
3306 if (!args.m_interp && !reachedTranslationLimit(sk, srcRec)) {
3307 // Attempt to create a region at this SrcKey
3308 JIT::RegionDescPtr region;
3309 if (RuntimeOption::EvalJitPGO) {
3310 if (m_mode == TransOptimize) {
3311 TransID transId = args.m_transId;
3312 assert(transId != InvalidID);
3313 region = JIT::selectHotRegion(transId, this);
3314 if (region && region->blocks.size() == 0) region = nullptr;
3315 } else {
3316 // We always go through the tracelet translator in this case
3318 } else {
3319 JIT::RegionContext rContext { sk.func(), sk.offset(), liveSpOff() };
3320 FTRACE(2, "populating live context for region\n");
3321 populateLiveContext(rContext);
3322 region = JIT::selectRegion(rContext, &t);
3325 TranslateResult result = Retry;
3326 RegionBlacklist regionInterps;
3327 while (result == Retry) {
3328 traceStart(sk.offset());
3330 // Try translating a region if we have one, then fall back to using the
3331 // Tracelet.
3332 if (region) {
3333 try {
3334 assertCleanState();
3335 result = translateRegion(*region, regionInterps);
3336 FTRACE(2, "translateRegion finished with result {}\n",
3337 translateResultName(result));
3338 } catch (const std::exception& e) {
3339 FTRACE(1, "translateRegion failed with '{}'\n", e.what());
3340 result = Failure;
3342 if (result == Failure) {
3343 traceFree();
3344 traceStart(sk.offset());
3345 resetState();
3348 if (!region || result == Failure) {
3349 FTRACE(1, "trying irTranslateTracelet\n");
3350 assertCleanState();
3351 if (m_mode == TransOptimize) {
3352 m_mode = TransLive;
3354 result = translateTracelet(t);
3357 if (result != Success) {
3358 // Translation failed. Free resources for this trace, rollback the
3359 // translation cache frontiers, and discard any pending fixups.
3360 resetState();
3362 traceFree();
3365 if (result == Success) {
3366 assert(m_mode == TransLive ||
3367 m_mode == TransProfile ||
3368 m_mode == TransOptimize);
3369 transKind = m_mode;
3373 if (transKind == TransInterp) {
3374 assertCleanState();
3375 TRACE(1,
3376 "emitting %d-instr interp request for failed translation\n",
3377 int(t.m_numOpcodes));
3378 // Add a counter for the translation if requested
3379 if (RuntimeOption::EvalJitTransCounters) {
3380 emitTransCounterInc(a);
3382 a. jmp(emitServiceReq(REQ_INTERPRET,
3383 t.m_sk.offset(), t.m_numOpcodes));
3384 // Fall through.
3387 for (uint i = 0; i < m_pendingFixups.size(); i++) {
3388 TCA tca = m_pendingFixups[i].m_tca;
3389 assert(isValidCodeAddress(tca));
3390 m_fixupMap.recordFixup(tca, m_pendingFixups[i].m_fixup);
3392 m_pendingFixups.clear();
3394 addTranslation(TransRec(sk, sk.unit()->md5(), transKind, t, start,
3395 a.frontier() - start, stubStart,
3396 astubs.frontier() - stubStart,
3397 counterStart, counterLen,
3398 m_bcMap));
3399 m_bcMap.clear();
3401 recordGdbTranslation(sk, sk.func(), a, start,
3402 false, false);
3403 recordGdbTranslation(sk, sk.func(), astubs, stubStart,
3404 false, false);
3405 if (RuntimeOption::EvalJitPGO) {
3406 m_profData->addTrans(t, transKind);
3408 // SrcRec::newTranslation() makes this code reachable. Do this last;
3409 // otherwise there's some chance of reader threads hitting code whose
3410 // metadata is not yet visible.
3411 TRACE(1, "newTranslation: %p sk: (func %d, bcOff %d)\n",
3412 start, sk.getFuncId(), sk.offset());
3413 srcRec.newTranslation(start);
3414 TRACE(1, "tx64: %zd-byte tracelet\n", a.frontier() - start);
3415 if (Trace::moduleEnabledRelease(Trace::tcspace, 1)) {
3416 Trace::traceRelease("%s", getUsage().c_str());
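/*
 * Control-flow recap of translateWork above (informal):
 *
 *   - try a region translation (a PGO hot region under TransOptimize, or the
 *     selectRegion result otherwise); on Failure, reset state and fall back
 *     to translateTracelet(t); on Retry, loop and try again
 *   - if nothing succeeded, transKind stays TransInterp and we only emit an
 *     optional counter plus a jump to a REQ_INTERPRET service request
 *   - finally record fixups, the TransRec/bcMap, and GDB info, and only then
 *     publish the code via srcRec.newTranslation(start)
 */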
3420 TranslatorX64::TranslateResult
3421 TranslatorX64::translateTracelet(Tracelet& t) {
3422 FTRACE(2, "attempting to translate tracelet:\n{}\n", t.toString());
3423 assert(!Translator::liveFrameIsPseudoMain());
3424 const SrcKey &sk = t.m_sk;
3425 SrcRec& srcRec = *getSrcRec(sk);
3426 HhbcTranslator& ht = m_irTrans->hhbcTrans();
3428 assert(srcRec.inProgressTailJumps().size() == 0);
3429 try {
3430 emitResolvedDeps(t.m_resolvedDeps);
3431 emitGuardChecks(a, sk, t.m_dependencies, t.m_refDeps, srcRec);
3433 dumpTranslationInfo(t, a.frontier());
3435 // after guards, add a counter for the translation if requested
3436 if (RuntimeOption::EvalJitTransCounters) {
3437 ht.emitIncTransCounter();
3440 if (m_mode == TransProfile) {
3441 ht.emitCheckCold(m_profData->curTransID());
3444 emitRB(a, RBTypeTraceletBody, t.m_sk);
3445 Stats::emitInc(a, Stats::Instr_TC, t.m_numOpcodes);
3447 // Profiling on function entry.
3448 if (t.m_sk.offset() == t.func()->base()) {
3449 ht.profileFunctionEntry("Normal");
3453 * Profiling on the shapes of tracelets that are whole functions.
3454 * (These are the things we might consider trying to support
3455 * inlining.)
3457 [&]{
3458 static const bool enabled = Stats::enabledAny() &&
3459 getenv("HHVM_STATS_FUNCSHAPE");
3460 if (!enabled) return;
3461 if (t.m_sk.offset() != t.func()->base()) return;
3462 if (auto last = t.m_instrStream.last) {
3463 if (last->op() != OpRetC && last->op() != OpRetV) {
3464 return;
3467 ht.profileSmallFunctionShape(traceletShape(t));
3468 }();
3470 Unit::MetaHandle metaHand;
3471 // Translate each instruction in the tracelet
3472 for (auto* ni = t.m_instrStream.first; ni && !ht.hasExit();
3473 ni = ni->next) {
3474 readMetaData(metaHand, *ni, m_irTrans->hhbcTrans(), MetaMode::Legacy);
3476 try {
3477 SKTRACE(1, ni->source, "HHIR: translateInstr\n");
3478 assert(!(m_mode == TransProfile && ni->outputPredicted && ni->next));
3479 m_irTrans->translateInstr(*ni);
3480 } catch (JIT::FailedIRGen& fcg) {
3481 always_assert(!ni->interp);
3482 ni->interp = true;
3483 FTRACE(1, "HHIR: RETRY Translation {}: will interpOne BC instr {} "
3484 "after failing to generate ir: {} \n\n",
3485 getCurrentTransID(), ni->toString(), fcg.what());
3486 return Retry;
3488 assert(ni->source.offset() >= t.func()->base());
3489 // We sometimes leave the tail of a truncated tracelet in place to aid
3490 // analysis, but breaksTracelet is authoritative.
3491 if (ni->breaksTracelet || m_irTrans->hhbcTrans().hasExit()) break;
3493 traceEnd();
3495 try {
3496 traceCodeGen();
3497 TRACE(1, "HHIR: SUCCEEDED to generate code for Translation %d\n\n\n",
3498 getCurrentTransID());
3499 return Success;
3500 } catch (JIT::FailedCodeGen& fcg) {
3501 // Code-gen failed. Search for the bytecode instruction that caused the
3502 // problem, flag it to be interpreted, and retranslate the tracelet.
3503 for (auto ni = t.m_instrStream.first; ni; ni = ni->next) {
3504 if (ni->source.offset() == fcg.bcOff) {
3505 always_assert(!ni->interp);
3506 ni->interp = true;
3507 FTRACE(1, "HHIR: RETRY Translation {}: will interpOne BC instr {} "
3508 "after failing to code-gen \n\n",
3509 getCurrentTransID(), ni->toString(), fcg.what());
3510 return Retry;
3513 throw fcg;
3515 } catch (JIT::FailedCodeGen& fcg) {
3516 TRACE(1, "HHIR: FAILED to generate code for Translation %d "
3517 "@ %s:%d (%s)\n", getCurrentTransID(),
3518 fcg.file, fcg.line, fcg.func);
3519 // HHIR:TODO Remove extra TRACE and adjust tools
3520 TRACE(1, "HHIR: FAILED to translate @ %s:%d (%s)\n",
3521 fcg.file, fcg.line, fcg.func);
3522 } catch (JIT::FailedIRGen& x) {
3523 TRACE(1, "HHIR: FAILED to translate @ %s:%d (%s)\n",
3524 x.file, x.line, x.func);
3525 } catch (const FailedAssertion& fa) {
3526 fa.print();
3527 StackTraceNoHeap::AddExtraLogging(
3528 "Assertion failure",
3529 folly::format("{}\n\nActive Trace:\n{}\n",
3530 fa.summary, ht.trace()->toString()).str());
3531 abort();
3532 } catch (const std::exception& e) {
3533 FTRACE(1, "HHIR: FAILED with exception: {}\n", e.what());
3534 assert(0);
3536 return Failure;
3539 void TranslatorX64::traceCodeGen() {
3540 using namespace JIT;
3542 HhbcTranslator& ht = m_irTrans->hhbcTrans();
3543 HPHP::JIT::IRTrace* trace = ht.trace();
3544 auto finishPass = [&](const char* msg, int level,
3545 const RegAllocInfo* regs,
3546 const LifetimeInfo* lifetime) {
3547 dumpTrace(level, trace, msg, regs, lifetime);
3548 assert(checkCfg(trace, ht.irFactory()));
3549 };
3551 finishPass(" after initial translation ", kIRLevel, nullptr, nullptr);
3552 optimizeTrace(trace, ht.traceBuilder());
3553 finishPass(" after optimizing ", kOptLevel, nullptr, nullptr);
3555 auto* factory = &ht.irFactory();
3556 recordBCInstr(OpTraceletGuard, a, a.frontier());
3557 if (dumpIREnabled() || RuntimeOption::EvalJitCompareHHIR) {
3558 LifetimeInfo lifetime(factory);
3559 RegAllocInfo regs = allocRegsForTrace(trace, factory, &lifetime);
3560 finishPass(" after reg alloc ", kRegAllocLevel, &regs, &lifetime);
3561 assert(checkRegisters(trace, *factory, regs));
3562 AsmInfo ai(factory);
3563 genCodeForTrace(trace, a, astubs, factory, &m_bcMap, this, regs,
3564 &lifetime, &ai);
3565 if (RuntimeOption::EvalJitCompareHHIR) {
3566 std::ostringstream out;
3567 dumpTraceImpl(trace, out, &regs, &lifetime, &ai);
3568 } else {
3569 dumpTrace(kCodeGenLevel, trace, " after code gen ", &regs,
3570 &lifetime, &ai);
3571 }
3572 } else {
3573 RegAllocInfo regs = allocRegsForTrace(trace, factory);
3574 finishPass(" after reg alloc ", kRegAllocLevel, nullptr, nullptr);
3575 assert(checkRegisters(trace, *factory, regs));
3576 genCodeForTrace(trace, a, astubs, factory, &m_bcMap, this, regs);
3577 }
3579 m_numHHIRTrans++;
3580 }
3582 /*
3583 * Defines functions called by emitGenericReturn, and
3584 * cgGenericRetDecRefs.
3585 */
3586 void TranslatorX64::emitFreeLocalsHelpers() {
3587 Label doRelease;
3588 Label release;
3589 Label loopHead;
3591 /*
3592 * Note: the IR currently requires that we preserve r13/r14 across
3593 * calls to these free locals helpers.
3594 */
3595 static_assert(rVmSp == rbx, "");
3596 auto const rIter = rbx;
3597 auto const rFinished = r15;
3598 auto const rType = esi;
3599 auto const rData = rdi;
3601 moveToAlign(a, kNonFallthroughAlign);
3603 TRACE(1, "HOTSTUB: freeLocalsHelpers starts %lx\n", uintptr_t(a.frontier()));
3605 asm_label(a, release);
3606 a. loadq (rIter[TVOFF(m_data)], rData);
3607 a. cmpl (RefCountStaticValue, rData[FAST_REFCOUNT_OFFSET]);
3608 jccBlock<CC_Z>(a, [&] {
3609 a. decl (rData[FAST_REFCOUNT_OFFSET]);
3610 a. jz8 (doRelease);
3611 });
3612 a. ret ();
3613 asm_label(a, doRelease);
3614 jumpDestructor(a, PhysReg(rType), rax);
3616 moveToAlign(a, kJmpTargetAlign);
3617 m_freeManyLocalsHelper = a.frontier();
3618 a. lea (rVmFp[-cellsToBytes(kNumFreeLocalsHelpers)], rFinished);
3620 auto emitDecLocal = [&] {
3621 Label skipDecRef;
3623 emitLoadTVType(a, rIter[TVOFF(m_type)], rType);
3624 emitCmpTVType(a, KindOfRefCountThreshold, rType);
3625 a. jle8 (skipDecRef);
3626 a. call (release);
3627 recordIndirectFixup(a.frontier(), 0);
3628 asm_label(a, skipDecRef);
3629 };
3631 // Loop for the first few locals, but unroll the final
3632 // kNumFreeLocalsHelpers.
3633 asm_label(a, loopHead);
3634 emitDecLocal();
3635 a. addq (sizeof(TypedValue), rIter);
3636 a. cmpq (rIter, rFinished);
3637 a. jnz8 (loopHead);
3639 for (int i = 0; i < kNumFreeLocalsHelpers; ++i) {
3640 m_freeLocalsHelpers[kNumFreeLocalsHelpers - i - 1] = a.frontier();
3641 TRACE(1, "HOTSTUB: m_freeLocalsHelpers[%d] = %p\n",
3642 kNumFreeLocalsHelpers - i - 1, a.frontier());
3643 emitDecLocal();
3644 if (i != kNumFreeLocalsHelpers - 1) {
3645 a.addq (sizeof(TypedValue), rIter);
3646 }
3647 }
3649 a. addq (AROFF(m_r) + sizeof(TypedValue), rVmSp);
3650 a. ret (8);
3652 TRACE(1, "STUB freeLocals helpers: %zu bytes\n",
3653 size_t(a.frontier() - m_freeManyLocalsHelper));
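// Rough C++ equivalent of the stub emitted above (illustrative only; the
// hypothetical decRefAndRelease() stands in for the release/jumpDestructor
// path, and the real code keeps the iterator and end pointer in registers):
//
//   void freeManyLocals(TypedValue* iter, TypedValue* finished) {
//     do {                                                     // loopHead
//       if (iter->m_type > KindOfRefCountThreshold) decRefAndRelease(iter);
//       ++iter;
//     } while (iter != finished);
//     // ...followed by kNumFreeLocalsHelpers unrolled copies of the body,
//     // one entry point per m_freeLocalsHelpers[] slot.
//   }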
3656 TranslatorX64::TranslatorX64()
3657 : m_numNativeTrampolines(0),
3658 m_trampolineSize(0),
3659 m_defClsHelper(0),
3660 m_funcPrologueRedispatch(0),
3661 m_numHHIRTrans(0),
3662 m_catchTraceMap(128)
3663 {
3664 static const size_t kRoundUp = 2 << 20;
3665 const size_t kAHotSize = RuntimeOption::VMTranslAHotSize;
3666 const size_t kAProfSize = RuntimeOption::EvalJitPGO ?
3667 RuntimeOption::VMTranslAProfSize : 0;
3668 const size_t kASize = RuntimeOption::VMTranslASize;
3669 const size_t kAStubsSize = RuntimeOption::VMTranslAStubsSize;
3670 const size_t kGDataSize = RuntimeOption::VMTranslGDataSize;
3671 m_totalSize = kAHotSize + kASize + kAStubsSize + kAProfSize +
3672 kTrampolinesBlockSize + kGDataSize;
3674 TRACE(1, "TranslatorX64@%p startup\n", this);
3675 tx64 = this;
3677 if ((kAHotSize < (2 << 20)) ||
3678 (kASize < (10 << 20)) ||
3679 (kAStubsSize < (10 << 20)) ||
3680 (kGDataSize < (2 << 20))) {
3681 fprintf(stderr, "Allocation sizes ASize, AStubsSize, and GlobalDataSize "
3682 "are too small.\n");
3683 exit(1);
3684 }
3686 if (m_totalSize > (2ul << 30)) {
3687 fprintf(stderr,"Combined size of ASize, AStubSize, and GlobalDataSize "
3688 "must be < 2GiB to support 32-bit relative addresses\n");
3689 exit(1);
3690 }
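// Illustrative reasoning for the 2GiB cap above: an x86-64 rel32 jump/call
// reaches +/- 2^31 bytes, so any two addresses inside a contiguous region
// smaller than 2GiB are always mutually reachable with a 4-byte
// displacement.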
3692 static bool profileUp = false;
3693 if (!profileUp) {
3694 profileInit();
3695 profileUp = true;
3696 }
3698 auto enhugen = [&](void* base, int numMB) {
3699 if (RuntimeOption::EvalMapTCHuge) {
3700 assert((uintptr_t(base) & (kRoundUp - 1)) == 0);
3701 hintHuge(base, numMB << 20);
3702 }
3703 };
3705 // We want to ensure that the block for "a", "astubs",
3706 // "atrampolines", and "m_globalData" are nearby so that we can
3707 // short jump/point between them. Thus we allocate one slab and
3708 // divide it between "a", "astubs", and "atrampolines".
3710 // Using sbrk to ensure it's in the bottom 2G, so we avoid
3711 // the need for trampolines, and get to use shorter
3712 // instructions for tc addresses.
3713 const size_t allocationSize = m_totalSize + kRoundUp - 1;
3714 uint8_t *base = (uint8_t*)sbrk(allocationSize);
3715 if (base == (uint8_t*)-1) {
3716 base = (uint8_t*)low_malloc(allocationSize);
3717 if (!base) {
3718 base = (uint8_t*)malloc(allocationSize);
3719 }
3720 if (!base) {
3721 fprintf(stderr, "could not allocate %zd bytes for translation cache\n",
3722 allocationSize);
3723 exit(1);
3724 }
3725 }
3726 assert(base);
3727 tcStart = base;
3728 base += -(uint64_t)base & (kRoundUp - 1);
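// Worked example of the round-up above (illustrative): with
// kRoundUp == 2 << 20 == 0x200000 (2MB), a base of 0x7a1230 gains
// (-0x7a1230 & 0x1fffff) == 0x05edd0 and lands on 0x800000, the next
// 2MB boundary that enhugen's alignment assert expects.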
3729 enhugen(base, RuntimeOption::EvalTCNumHugeHotMB);
3730 TRACE(1, "init atrampolines @%p\n", base);
3731 atrampolines.init(base, kTrampolinesBlockSize);
3732 base += kTrampolinesBlockSize;
3734 m_unwindRegistrar = register_unwind_region(base, m_totalSize);
3735 TRACE(1, "init ahot @%p\n", base);
3736 ahot.init(base, kAHotSize);
3737 base += kAHotSize;
3738 TRACE(1, "init a @%p\n", base);
3739 a.init(base, kASize);
3740 aStart = base;
3741 base += kASize;
3742 TRACE(1, "init aprof @%p\n", base);
3743 aprof.init(base, kAProfSize);
3744 base += kAProfSize;
3745 base += -(uint64_t)base & (kRoundUp - 1);
3746 TRACE(1, "init astubs @%p\n", base);
3747 astubs.init(base, kAStubsSize);
3748 enhugen(base, RuntimeOption::EvalTCNumHugeColdMB);
3749 base += kAStubsSize;
3750 TRACE(1, "init gdata @%p\n", base);
3751 m_globalData.init(base, kGDataSize);
3753 // put the stubs into ahot, rather than a
3754 AsmSelector asmSel(AsmSelector::Args(this).hot(true));
3756 // Emit some special helpers that are shared across translations.
3758 // Emit a byte of padding. This is a kind of hacky way to
3759 // avoid hitting an assert in recordGdbStub when we call
3760 // it with m_callToExit - 1 as the start address.
3761 astubs.emitNop(1);
3763 // Call to exit with whatever value the program leaves on
3764 // the return stack.
3765 m_callToExit = emitServiceReq(SRFlags::Align | SRFlags::JmpInsteadOfRet,
3766 REQ_EXIT);
3768 /*
3769 * Helpers for returning from a function where the ActRec was pushed
3770 * by the interpreter.
3771 */
3772 m_retHelper = emitRetFromInterpretedFrame();
3773 m_genRetHelper = emitRetFromInterpretedGeneratorFrame();
3775 /*
3776 * Returning from a function where the ActRec was pushed by an
3777 * inlined call. This is separate from m_retHelper just for
3778 * debuggability---it does the same thing.
3779 */
3780 m_retInlHelper = emitRetFromInterpretedFrame();
3781 FTRACE(1, "retInlHelper: {}\n", (void*)m_retInlHelper);
3783 moveToAlign(astubs);
3784 m_resumeHelperRet = astubs.frontier();
3785 emitPopRetIntoActRec(astubs);
3786 m_resumeHelper = astubs.frontier();
3787 emitGetGContext(astubs, rax);
3788 astubs. load_reg64_disp_reg64(rax, offsetof(VMExecutionContext, m_fp),
3789 rVmFp);
3790 astubs. load_reg64_disp_reg64(rax, offsetof(VMExecutionContext, m_stack) +
3791 Stack::topOfStackOffset(), rVmSp);
3792 emitServiceReq(REQ_RESUME);
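// Roughly, the resume helper above does the following before raising
// REQ_RESUME (illustrative C++ only; the field accesses are spelled via
// offsetof/topOfStackOffset in the real stub, and top() is a hypothetical
// accessor standing in for the stack-top field):
//
//   rVmFp = g_vmContext->m_fp;
//   rVmSp = g_vmContext->m_stack.top();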
3794 // Helper for DefCls, in astubs.
3795 {
3796 auto& a = astubs;
3797 if (false) {
3798 PreClass *preClass = 0;
3799 defClsHelper(preClass);
3800 }
3801 m_defClsHelper = TCA(a.frontier());
3802 PhysReg rEC = argNumToRegName[2];
3803 emitGetGContext(a, rEC);
3804 a. storeq (rVmFp, rEC[offsetof(VMExecutionContext, m_fp)]);
3805 a. storeq (argNumToRegName[1],
3806 rEC[offsetof(VMExecutionContext, m_pc)]);
3807 a. storeq (rax, rEC[offsetof(VMExecutionContext, m_stack) +
3808 Stack::topOfStackOffset()]);
3809 a. jmp (TCA(defClsHelper));
3810 }
3812 // The decRef helpers for when we bring the count down to zero. The caller
3813 // must bring the value into rdi. These can be burned in for all time, and
3814 // for all translations.
3815 typedef void* vp;
3817 TCA strDtor, arrDtor, objDtor, resDtor, refDtor;
3818 strDtor = emitUnaryStub(astubs, CppCall(getMethodPtr(&StringData::release)));
3819 arrDtor = emitUnaryStub(astubs,
3820 CppCall(getVTableOffset(&HphpArray::release)));
3821 objDtor = emitUnaryStub(astubs, CppCall(getMethodPtr(&ObjectData::release)));
3822 resDtor = emitUnaryStub(astubs,
3823 CppCall(getMethodPtr(&ResourceData::release)));
3824 refDtor = emitUnaryStub(astubs, CppCall(vp(getMethodPtr(&RefData::release))));
3826 m_dtorStubs[0] = nullptr;
3827 m_dtorStubs[typeToDestrIndex(BitwiseKindOfString)] = strDtor;
3828 m_dtorStubs[typeToDestrIndex(KindOfArray)] = arrDtor;
3829 m_dtorStubs[typeToDestrIndex(KindOfObject)] = objDtor;
3830 m_dtorStubs[typeToDestrIndex(KindOfResource)] = resDtor;
3831 m_dtorStubs[typeToDestrIndex(KindOfRef)] = refDtor;
3833 // Hot helper stubs in A:
3834 emitGenericDecRefHelpers();
3835 emitFreeLocalsHelpers();
3836 m_funcPrologueRedispatch = emitPrologueRedispatch(a);
3837 TRACE(1, "HOTSTUB: all stubs finished: %lx\n",
3838 uintptr_t(a.frontier()));
3840 if (trustSigSegv) {
3841 // Install SIGSEGV handler for timeout exceptions
3842 struct sigaction sa;
3843 struct sigaction old_sa;
3844 sa.sa_sigaction = &TranslatorX64::SEGVHandler;
3845 sa.sa_flags = SA_SIGINFO;
3846 sigemptyset(&sa.sa_mask);
3847 if (sigaction(SIGSEGV, &sa, &old_sa) != 0) {
3848 throw std::runtime_error(
3849 std::string("Failed to install SIGSEGV handler: ") +
3850 strerror(errno));
3851 }
3852 m_segvChain = old_sa.sa_flags & SA_SIGINFO ?
3853 old_sa.sa_sigaction : (sigaction_t)old_sa.sa_handler;
3854 }
3856 moveToAlign(astubs);
3857 m_stackOverflowHelper = astubs.frontier();
3858 // We are called from emitStackCheck, with the new stack frame in
3859 // rStashedAR. Get the caller's PC into rdi and save it off.
3860 astubs. load_reg64_disp_reg64(rVmFp, AROFF(m_func), rax);
3861 astubs. load_reg64_disp_reg32(rStashedAR, AROFF(m_soff), rdi);
3862 astubs. load_reg64_disp_reg64(rax, Func::sharedOffset(), rax);
3863 astubs. load_reg64_disp_reg32(rax, Func::sharedBaseOffset(), rax);
3864 astubs. add_reg32_reg32(rax, rdi);
3865 emitEagerVMRegSave(astubs, SaveFP | SavePC);
3866 emitServiceReq(REQ_STACK_OVERFLOW);
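// Net effect of the loads above (illustrative summary, not in the original):
//
//   rdi == callerFunc->base() + stashedAR->m_soff   // caller's bytecode PC
//
// which, given the SavePC flag, is presumably what emitEagerVMRegSave
// records along with the frame pointer before REQ_STACK_OVERFLOW is raised.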
3867 }
3869 // Do gdb-specific initialization. This has to happen after
3870 // the TranslatorX64 constructor is called, because gdb initialization
3871 // calls back into TranslatorX64::Get().
3872 void TranslatorX64::initGdb() {
3873 // On a backtrace, gdb tries to locate the calling frame at address
3874 // returnRIP-1. However, for the first VM frame, there is no code at
3875 // returnRIP-1, since the AR was set up manually. For this frame,
3876 // record the tracelet address as starting from callToExit-1, so gdb
3877 // does not barf
3878 recordGdbStub(astubs, m_callToExit - 1, "HHVM::callToExit");
3880 recordBCInstr(OpRetFromInterp, astubs, m_retHelper);
3881 recordGdbStub(astubs, m_retHelper - 1, "HHVM::retHelper");
3882 recordBCInstr(OpResumeHelper, astubs, m_resumeHelper);
3883 recordBCInstr(OpDefClsHelper, astubs, m_defClsHelper);
3884 recordBCInstr(OpDtorStub, astubs,
3885 m_dtorStubs[typeToDestrIndex(BitwiseKindOfString)]);
3886 recordGdbStub(astubs, m_dtorStubs[typeToDestrIndex(BitwiseKindOfString)],
3887 "HHVM::destructorStub");
3890 TranslatorX64*
3891 TranslatorX64::Get() {
3893 * Called from outrageously early, pre-main code, and will
3894 * allocate the first translator space.
3896 if (!nextTx64) {
3897 nextTx64 = new TranslatorX64();
3898 nextTx64->initGdb();
3900 if (!tx64) {
3901 tx64 = nextTx64;
3903 assert(tx64);
3904 return tx64;
3907 template<int Arity>
3908 TCA TranslatorX64::emitNAryStub(X64Assembler& a, CppCall c) {
3909 static_assert(Arity < kNumRegisterArgs, "");
3911 // The callNAryStub has already saved these regs on a.
3912 RegSet alreadySaved;
3913 for (size_t i = 0; i < Arity; ++i) {
3914 alreadySaved |= RegSet(argNumToRegName[i]);
3915 }
3917 /*
3918 * We've made a call instruction, and pushed Arity args on the
3919 * stack. So the stack address will be odd coming into the stub if
3920 * Arity + 1 (for the call) is odd. We need to correct for this
3921 * when saving other registers below to keep SSE-friendly alignment
3922 * of the stack.
3923 */
3924 const int Parity = (Arity + 1) % 2;
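// Worked instances of the parity above (illustrative):
//   Arity == 1 (emitUnaryStub):  Arity + 1 == 2 slots, even -> Parity == 0
//   Arity == 2 (hypothetical):   Arity + 1 == 3 slots, odd  -> Parity == 1
// so PhysRegSaverParity below can keep the 16-byte (SSE-friendly) stack
// alignment across the emitCall.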
3926 // These dtor stubs are meant to be called with the call
3927 // instruction, unlike most translator code.
3928 moveToAlign(a);
3929 TCA start = a.frontier();
3930 /*
3931 * Preserve most caller-saved regs. The calling code has already
3932 * preserved regs in `alreadySaved'; we push the rest of the caller
3933 * saved regs and rbp. It should take 9 qwords in total, and the
3934 * incoming call instruction made it 10. This is an even number of
3935 * pushes, so we preserve the SSE-friendliness of our execution
3936 * environment (without real intervention from PhysRegSaverParity).
3938 * Note that we don't need to clean all registers because the only
3939 * reason we could need those locations written back is if stack
3940 * unwinding were to happen. These stubs can re-enter due to user
3941 * destructors, but exceptions are not allowed to propagate out of
3942 * those, so it's not a problem.
3943 */
3944 a. push (rbp); // {
3945 a. movq (rsp, rbp);
3946 {
3947 RegSet s = kGPCallerSaved - alreadySaved;
3948 PhysRegSaverParity rs(Parity, a, s);
3949 emitCall(a, c);
3950 }
3951 a. pop (rbp); // }
3952 a. ret ();
3953 return start;
3954 }
3956 TCA TranslatorX64::emitUnaryStub(X64Assembler& a, CppCall c) {
3957 return emitNAryStub<1>(a, c);
3958 }
3960 void TranslatorX64::registerCatchTrace(CTCA ip, TCA trace) {
3961 FTRACE(1, "registerCatchTrace: afterCall: {} trace: {}\n", ip, trace);
3962 m_catchTraceMap.insert(ip, trace);
3963 }
3965 TCA TranslatorX64::getCatchTrace(CTCA ip) const {
3966 TCA* found = m_catchTraceMap.find(ip);
3967 return found ? *found : nullptr;
3968 }
3970 void
3971 TranslatorX64::requestInit() {
3972 TRACE(1, "in requestInit(%" PRId64 ")\n", g_vmContext->m_currentThreadIdx);
3973 tl_regState = VMRegState::CLEAN;
3974 PendQ::drain();
3975 requestResetHighLevelTranslator();
3976 Treadmill::startRequest(g_vmContext->m_currentThreadIdx);
3977 memset(&s_perfCounters, 0, sizeof(s_perfCounters));
3978 Stats::init();
3979 }
3981 void
3982 TranslatorX64::requestExit() {
3983 if (s_writeLease.amOwner()) {
3984 s_writeLease.drop();
3985 }
3986 TRACE_MOD(txlease, 2, "%" PRIx64 " write lease stats: %15" PRId64
3987 " kept, %15" PRId64 " grabbed\n",
3988 Process::GetThreadIdForTrace(), s_writeLease.m_hintKept,
3989 s_writeLease.m_hintGrabbed);
3990 PendQ::drain();
3991 Treadmill::finishRequest(g_vmContext->m_currentThreadIdx);
3992 TRACE(1, "done requestExit(%" PRId64 ")\n", g_vmContext->m_currentThreadIdx);
3993 Stats::dump();
3994 Stats::clear();
3996 if (Trace::moduleEnabledRelease(Trace::tx64stats, 1)) {
3997 Trace::traceRelease("TranslatorX64 perf counters for %s:\n",
3998 g_context->getRequestUrl(50).c_str());
3999 for (int i = 0; i < tpc_num_counters; i++) {
4000 Trace::traceRelease("%-20s %10" PRId64 "\n",
4001 kPerfCounterNames[i], s_perfCounters[i]);
4002 }
4003 Trace::traceRelease("\n");
4004 }
4005 }
4007 bool
4008 TranslatorX64::isPseudoEvent(const char* event) {
4009 for (auto name : kPerfCounterNames) {
4010 if (!strcmp(event, name)) {
4011 return true;
4012 }
4013 }
4014 return false;
4015 }
4017 void
4018 TranslatorX64::getPerfCounters(Array& ret) {
4019 for (int i = 0; i < tpc_num_counters; i++) {
4020 // Until Perflab can automatically scale the values we give it to
4021 // an appropriate range, we have to fudge these numbers so they
4022 // look more like reasonable hardware counter values.
4023 ret.set(String::FromCStr(kPerfCounterNames[i]),
4024 s_perfCounters[i] * 1000);
4025 }
4027 if (RuntimeOption::EnableInstructionCounts) {
4028 auto doCounts = [&](unsigned begin, const char* const name) {
4029 int64_t count = 0;
4030 for (; begin < Stats::Instr_InterpOneHighInvalid;
4031 begin += STATS_PER_OPCODE) {
4032 count += Stats::tl_counters[Stats::StatCounter(begin)];
4033 }
4034 ret.set(String::FromCStr(name), count);
4035 };
4037 doCounts(Stats::Instr_TranslLowInvalid + STATS_PER_OPCODE,
4038 kInstrCountTx64Name);
4039 doCounts(Stats::Instr_TranslIRPostLowInvalid + STATS_PER_OPCODE,
4040 kInstrCountIRName);
4041 }
4042 }
4044 TranslatorX64::~TranslatorX64() {
4045 freeSlab(atrampolines.base(), m_totalSize);
4046 }
4048 static Debug::TCRange rangeFrom(const X64Assembler& a, const TCA addr,
4049 bool isAstubs) {
4050 assert(a.contains(addr));
4051 return Debug::TCRange(addr, a.frontier(), isAstubs);
4052 }
4054 void TranslatorX64::recordBCInstr(uint32_t op,
4055 const X64Assembler& a,
4056 const TCA addr) {
4057 if (addr != a.frontier()) {
4058 m_debugInfo.recordBCInstr(Debug::TCRange(addr, a.frontier(),
4059 &a == &astubs ? true : false), op);
4060 }
4061 }
4063 void TranslatorX64::recordGdbTranslation(SrcKey sk,
4064 const Func* srcFunc,
4065 const X64Assembler& a,
4066 const TCA start,
4067 bool exit,
4068 bool inPrologue) {
4069 if (start != a.frontier()) {
4070 assert(s_writeLease.amOwner());
4071 if (!RuntimeOption::EvalJitNoGdb) {
4072 m_debugInfo.recordTracelet(rangeFrom(a, start,
4073 &a == &astubs ? true : false),
4074 srcFunc,
4075 srcFunc->unit() ?
4076 srcFunc->unit()->at(sk.offset()) : nullptr,
4077 exit, inPrologue);
4078 }
4079 if (RuntimeOption::EvalPerfPidMap) {
4080 m_debugInfo.recordPerfMap(rangeFrom(a, start,
4081 &a == &astubs ? true : false),
4082 srcFunc, exit, inPrologue);
4083 }
4084 }
4085 }
4087 void TranslatorX64::recordGdbStub(const X64Assembler& a,
4088 const TCA start, const char* name) {
4089 if (!RuntimeOption::EvalJitNoGdb) {
4090 m_debugInfo.recordStub(rangeFrom(a, start, &a == &astubs ? true : false),
4091 name);
4092 }
4093 }
4095 size_t TranslatorX64::getCodeSize() {
4096 return a.used();
4097 }
4099 size_t TranslatorX64::getStubSize() {
4100 return astubs.used();
4101 }
4103 size_t TranslatorX64::getTargetCacheSize() {
4104 return TargetCache::s_frontier;
4105 }
4107 std::string TranslatorX64::getUsage() {
4108 std::string usage;
4109 size_t aHotUsage = ahot.used();
4110 size_t aProfUsage = aprof.used();
4111 size_t aUsage = a.used();
4112 size_t stubsUsage = astubs.used();
4113 size_t dataUsage = m_globalData.frontier - m_globalData.base;
4114 size_t tcUsage = TargetCache::s_frontier;
4115 size_t persistentUsage =
4116 TargetCache::s_persistent_frontier - TargetCache::s_persistent_start;
4117 Util::string_printf(
4118 usage,
4119 "tx64: %9zd bytes (%zd%%) in ahot.code\n"
4120 "tx64: %9zd bytes (%zd%%) in a.code\n"
4121 "tx64: %9zd bytes (%zd%%) in aprof.code\n"
4122 "tx64: %9zd bytes (%zd%%) in astubs.code\n"
4123 "tx64: %9zd bytes (%zd%%) in m_globalData\n"
4124 "tx64: %9zd bytes (%zd%%) in targetCache\n"
4125 "tx64: %9zd bytes (%zd%%) in persistentCache\n",
4126 aHotUsage, 100 * aHotUsage / ahot.capacity(),
4127 aUsage, 100 * aUsage / a.capacity(),
4128 aProfUsage, 100 * aProfUsage / aprof.capacity(),
4129 stubsUsage, 100 * stubsUsage / astubs.capacity(),
4130 dataUsage, 100 * dataUsage / m_globalData.size,
4131 tcUsage,
4132 400 * tcUsage / RuntimeOption::EvalJitTargetCacheSize / 3,
4133 persistentUsage,
4134 400 * persistentUsage / RuntimeOption::EvalJitTargetCacheSize);
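// Note on the last two percentages (illustrative derivation, not in the
// original): they appear to assume the non-persistent target cache gets
// 3/4 of EvalJitTargetCacheSize and the persistent cache the remaining
// 1/4, since
//   100 * tcUsage / (3/4 * size)         == 400 * tcUsage / size / 3
//   100 * persistentUsage / (1/4 * size) == 400 * persistentUsage / size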
4135 return usage;
4136 }
4138 bool TranslatorX64::addDbgGuards(const Unit* unit) {
4139 // TODO refactor
4140 // It grabs the write lease and iterates through the whole SrcDB...
4141 bool locked = s_writeLease.acquire(true);
4142 if (!locked) {
4143 return false;
4144 }
4145 struct timespec tsBegin, tsEnd;
4146 Timer::GetMonotonicTime(tsBegin);
4147 // Doc says even find() _could_ invalidate the iterator; in practice it should
4148 // be very rare, so go with it for now.
4149 for (SrcDB::iterator it = m_srcDB.begin(); it != m_srcDB.end(); ++it) {
4150 SrcKey const sk = SrcKey::fromAtomicInt(it->first);
4151 SrcRec& sr = *it->second;
4152 if (sr.unitMd5() == unit->md5() &&
4153 !sr.hasDebuggerGuard() &&
4154 isSrcKeyInBL(sk)) {
4155 addDbgGuardImpl(sk, sr);
4156 }
4157 }
4158 s_writeLease.drop();
4159 Timer::GetMonotonicTime(tsEnd);
4160 int64_t elapsed = gettime_diff_us(tsBegin, tsEnd);
4161 if (Trace::moduleEnabledRelease(Trace::tx64, 5)) {
4162 Trace::traceRelease("addDbgGuards got lease for %" PRId64 " us\n", elapsed);
4163 }
4164 return true;
4165 }
4167 bool TranslatorX64::addDbgGuard(const Func* func, Offset offset) {
4168 SrcKey sk(func, offset);
4170 if (SrcRec* sr = m_srcDB.find(sk)) {
4171 if (sr->hasDebuggerGuard()) {
4172 return true;
4173 }
4174 } else {
4175 // no translation yet
4176 return true;
4177 }
4179 if (debug) {
4180 if (!isSrcKeyInBL(sk)) {
4181 TRACE(5, "calling addDbgGuard on PC that is not in blacklist");
4182 return false;
4183 }
4184 }
4185 bool locked = s_writeLease.acquire(true);
4186 if (!locked) {
4187 return false;
4188 }
4190 if (SrcRec* sr = m_srcDB.find(sk)) {
4191 addDbgGuardImpl(sk, *sr);
4192 }
4194 s_writeLease.drop();
4195 return true;
4196 }
4198 void TranslatorX64::addDbgGuardImpl(SrcKey sk, SrcRec& srcRec) {
4199 TCA dbgGuard = a.frontier();
4200 // Emit the checks for debugger attach
4201 emitTLSLoad<ThreadInfo>(a, ThreadInfo::s_threadInfo, rAsm);
4202 static COff dbgOff = offsetof(ThreadInfo, m_reqInjectionData) +
4203 RequestInjectionData::debuggerReadOnlyOffset();
4204 a. load_reg64_disp_reg32(rAsm, dbgOff, rAsm);
4205 a. testb((int8_t)0xff, rbyte(rAsm));
4206 // Branch to a special REQ_INTERPRET if attached
4208 TCA fallback = emitServiceReq(REQ_INTERPRET, sk.offset(), 0);
4209 a. jnz(fallback);
4211 // Emit a jump to the actual code
4212 TCA realCode = srcRec.getTopTranslation();
4213 prepareForSmash(a, kJmpLen);
4214 TCA dbgBranchGuardSrc = a.frontier();
4215 a. jmp(realCode);
4216 // Add it to srcRec
4217 srcRec.addDebuggerGuard(dbgGuard, dbgBranchGuardSrc);
4218 }
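// Shape of the guard emitted above (illustrative pseudo-assembly; the
// REQ_INTERPRET spelling and the attach-flag name are not literal):
//
//   dbgGuard:
//     load   ThreadInfo::s_threadInfo->m_reqInjectionData.<debugger flag>
//     testb  flag, flag
//     jnz    <service request: REQ_INTERPRET at sk.offset()>
//   dbgBranchGuardSrc:
//     jmp    <srcRec.getTopTranslation()>        // smashable jump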
4220 bool TranslatorX64::dumpTCCode(const char* filename) {
4221 #define OPEN_FILE(F, SUFFIX) \
4222 string F ## name = string(filename).append(SUFFIX); \
4223 FILE* F = fopen(F ## name .c_str(),"wb"); \
4224 if (F == nullptr) return false; \
4225 SCOPE_EXIT{ fclose(F); };
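// For reference (illustrative expansion, not in the original):
// OPEN_FILE(aFile, "_a") produces roughly
//
//   string aFilename = string(filename).append("_a");
//   FILE* aFile = fopen(aFilename.c_str(), "wb");
//   if (aFile == nullptr) return false;
//   SCOPE_EXIT{ fclose(aFile); };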
4227 OPEN_FILE(aFile, "_a");
4228 OPEN_FILE(aprofFile, "_aprof");
4229 OPEN_FILE(astubFile, "_astub");
4230 OPEN_FILE(helperAddrFile, "_helpers_addrs.txt");
4232 #undef OPEN_FILE
4234 // dump starting from the trampolines; this assumes processInit() places
4235 // trampolines before the translation cache
4236 size_t count = a.frontier() - atrampolines.base();
4237 bool result = (fwrite(atrampolines.base(), 1, count, aFile) == count);
4238 if (result) {
4239 count = aprof.used();
4240 result = (fwrite(aprof.base(), 1, count, aprofFile) == count);
4242 if (result) {
4243 count = astubs.used();
4244 result = (fwrite(astubs.base(), 1, count, astubFile) == count);
4246 if (result) {
4247 for(PointerMap::iterator iter = trampolineMap.begin();
4248 iter != trampolineMap.end();
4249 iter++) {
4250 void* helperAddr = iter->first;
4251 void* trampAddr = iter->second;
4252 char* functionName = Util::getNativeFunctionName(helperAddr);
4253 fprintf(helperAddrFile,"%10p %10p %s\n",
4254 trampAddr, helperAddr,
4255 functionName);
4256 free(functionName);
4257 }
4258 }
4259 return result;
4260 }
4262 // Returns true on success
4263 bool TranslatorX64::dumpTC(bool ignoreLease) {
4264 if (!ignoreLease && !s_writeLease.acquire(true)) return false;
4265 bool success = dumpTCData();
4266 if (success) {
4267 success = dumpTCCode("/tmp/tc_dump");
4268 }
4269 if (!ignoreLease) s_writeLease.drop();
4270 return success;
4271 }
4273 // Returns true on success
4274 bool tc_dump(void) {
4275 return TranslatorX64::Get() && TranslatorX64::Get()->dumpTC();
4276 }
4278 // Returns true on success
4279 bool TranslatorX64::dumpTCData() {
4280 gzFile tcDataFile = gzopen("/tmp/tc_data.txt.gz", "w");
4281 if (!tcDataFile) return false;
4283 if (!gzprintf(tcDataFile,
4284 "repo_schema = %s\n"
4285 "a.base = %p\n"
4286 "a.frontier = %p\n"
4287 "aprof.base = %p\n"
4288 "aprof.frontier = %p\n"
4289 "astubs.base = %p\n"
4290 "astubs.frontier = %p\n\n",
4291 kRepoSchemaId,
4292 atrampolines.base(), a.frontier(),
4293 aprof.base(), aprof.frontier(),
4294 astubs.base(), astubs.frontier())) {
4295 return false;
4296 }
4298 if (!gzprintf(tcDataFile, "total_translations = %zu\n\n",
4299 m_translations.size())) {
4300 return false;
4301 }
4303 for (size_t t = 0; t < m_translations.size(); t++) {
4304 if (gzputs(tcDataFile,
4305 m_translations[t].print(getTransCounter(t)).c_str()) == -1) {
4306 return false;
4307 }
4308 }
4310 gzclose(tcDataFile);
4311 return true;
4312 }
4314 void TranslatorX64::invalidateSrcKey(SrcKey sk) {
4315 assert(!RuntimeOption::RepoAuthoritative || RuntimeOption::EvalJitPGO);
4316 assert(s_writeLease.amOwner());
4317 /*
4318 * Reroute existing translations for SrcKey to an as-yet indeterminate
4319 * new one.
4320 */
4321 SrcRec* sr = m_srcDB.find(sk);
4322 assert(sr);
4323 /*
4324 * Since previous translations aren't reachable from here, we know we
4325 * just created some garbage in the TC. We currently have no mechanism
4326 * to reclaim this.
4327 */
4328 sr->replaceOldTranslations();
4329 }
4331 void TranslatorX64::setJmpTransID(TCA jmp) {
4332 if (m_mode != TransProfile) return;
4334 TransID transId = m_profData->curTransID();
4335 FTRACE(5, "setJmpTransID: adding {} => {}\n", jmp, transId);
4336 m_jmpToTransID[jmp] = transId;
4337 }
4339 TranslatorX64::AsmSelector::AsmSelector(const Args& args)
4340 : m_tx(args.getTranslator())
4341 , m_select(args.getSelection()) {
4343 // If an assembler other than 'a' has already been selected, then just
4344 // keep that selection.
4345 if (m_tx->a.base() != m_tx->aStart) {
4346 m_select = AsmSelection::Default;
4347 }
4349 swap();
4350 }
4352 /*
4353 * Swap 'a' with 'ahot' or 'aprof'.
4354 * Note that, although we don't write to either tx->ahot or tx->aprof directly,
4355 * we still need to make sure that all assembler code areas are available
4356 * in a, astubs, aprof, and ahot, for example when we call asmChoose(addr, ...).
4357 */
4358 void TranslatorX64::AsmSelector::swap() {
4359 switch (m_select) {
4360 case AsmSelection::Profile: std::swap(m_tx->a, m_tx->aprof); break;
4361 case AsmSelection::Hot : std::swap(m_tx->a, m_tx->ahot) ; break;
4362 case AsmSelection::Default: break; // nothing to do
4363 }
4364 }
4366 TranslatorX64::AsmSelector::~AsmSelector() {
4367 swap();
4368 }
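// Usage sketch for the RAII pair above (illustrative; mirrors the call in
// the TranslatorX64 constructor):
//
//   {
//     AsmSelector asmSel(AsmSelector::Args(this).hot(true));
//     // code emitted through `a` now lands in ahot (if room is left)...
//   }  // ~AsmSelector swaps the assemblers back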
4370 TranslatorX64::AsmSelector::Args::Args(TranslatorX64* tx)
4371 : m_tx(tx)
4372 , m_select(AsmSelection::Default) {
4373 assert(m_tx != nullptr);
4374 }
4376 static const int kMaxTranslationBytes = 8192;
4378 TranslatorX64::AsmSelector::Args&
4379 TranslatorX64::AsmSelector::Args::hot(bool isHot) {
4380 // Profile has precedence over Hot.
4381 if (m_select == AsmSelection::Profile) return *this;
4383 // Make sure there's enough room left in ahot.
4384 if (isHot && m_tx->ahot.available() > kMaxTranslationBytes) {
4385 m_select = AsmSelection::Hot;
4386 } else {
4387 m_select = AsmSelection::Default;
4388 }
4389 return *this;
4390 }
4392 TranslatorX64::AsmSelector::Args&
4393 TranslatorX64::AsmSelector::Args::profile(bool isProf) {
4394 if (isProf) {
4395 m_select = AsmSelection::Profile;
4396 } else if (m_select == AsmSelection::Profile) {
4397 m_select = AsmSelection::Default;
4398 }
4399 return *this;
4400 }
4402 TranslatorX64::AsmSelection
4403 TranslatorX64::AsmSelector::Args::getSelection() const {
4404 return m_select;
4405 }
4407 TranslatorX64*
4408 TranslatorX64::AsmSelector::Args::getTranslator() const {
4409 return m_tx;
4410 }
4412 } // HPHP::Transl
4414 } // HPHP