hphp/runtime/vm/jit/vasm-x64.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #include "hphp/runtime/vm/jit/vasm-emit.h"
  18
  19 #include "hphp/runtime/base/runtime-option.h"
  20
  21 #include "hphp/runtime/vm/jit/abi-x64.h"
  22 #include "hphp/runtime/vm/jit/block.h"
  23 #include "hphp/runtime/vm/jit/code-gen-helpers.h"
  24 #include "hphp/runtime/vm/jit/print.h"
  25 #include "hphp/runtime/vm/jit/prof-data.h"
  26 #include "hphp/runtime/vm/jit/service-requests.h"
  27 #include "hphp/runtime/vm/jit/smashable-instr-x64.h"
  28 #include "hphp/runtime/vm/jit/target-cache.h"
  29 #include "hphp/runtime/vm/jit/timer.h"
  30 #include "hphp/runtime/vm/jit/vasm.h"
  31 #include "hphp/runtime/vm/jit/vasm-instr.h"
  32 #include "hphp/runtime/vm/jit/vasm-internal.h"
  33 #include "hphp/runtime/vm/jit/vasm-lower.h"
  34 #include "hphp/runtime/vm/jit/vasm-print.h"
  35 #include "hphp/runtime/vm/jit/vasm-prof.h"
  36 #include "hphp/runtime/vm/jit/vasm-unit.h"
  37 #include "hphp/runtime/vm/jit/vasm-util.h"
  38 #include "hphp/runtime/vm/jit/vasm-visit.h"
  39
  40 #include <algorithm>
  41 #include <tuple>
  42
  43 TRACE_SET_MOD(vasm);
  44
  45 namespace HPHP { namespace jit {
  46 ///////////////////////////////////////////////////////////////////////////////
  47
  48 using namespace reg;
  49 using namespace x64;
  50
  51 namespace x64 { struct ImmFolder; }
  52
  53 namespace {
  54 ///////////////////////////////////////////////////////////////////////////////
  55
  56 static_assert(folly::kIsLittleEndian,
  57   "Code contains little-endian specific optimizations.");
  58
  59 template<class X64Asm>
  60 struct Vgen {
  61   explicit Vgen(Venv& env)
  62     : env(env)
  63     , a(*env.cb)
  64     , current(env.current)
  65     , next(env.next)
  66     , jmps(env.jmps)
  67     , jccs(env.jccs)
  68     , catches(env.catches)
  69   {}
  70
  71   static void emitVeneers(Venv& env) {}
  72   static void handleLiterals(Venv& env) {}
  73   static void patch(Venv& env);
  74   static void pad(CodeBlock& cb);
  75
  76   /////////////////////////////////////////////////////////////////////////////
  77
  78   template<class Inst> void emit(const Inst& i) {
  79     always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
  80                        vinst_names[Vinstr(i).op], size_t(current));
  81   }
  82
  83   // intrinsics
  84   void emit(const copy& i);
  85   void emit(const copy2& i);
  86   void emit(const debugtrap& /*i*/) { a.int3(); }
  87   void emit(const fallthru& /*i*/) {}
  88   void emit(const ldimmb& i);
  89   void emit(const ldimml& i);
  90   void emit(const ldimmq& i);
  91   void emit(const load& i);
  92   void emit(const store& i);
  93   void emit(const mcprep& i);
  94
  95   // native function abi
  96   void emit(const call& i);
  97   void emit(const callm& i) { a.call(i.target); }
  98   void emit(const callr& i) { a.call(i.target); }
  99   void emit(const calls& i);
 100   void emit(const ret& /*i*/) { a.ret(); }
 101
 102   // stub function abi
 103   void emit(const stubret& i);
 104   void emit(const callstub& i);
 105   void emit(const callfaststub& i);
 106   void emit(const tailcallstub& i);
 107
 108   // php function abi
 109   void emit(const phpret& i);
 110   void emit(const tailcallphp& i);
 111   void emit(const callunpack& i);
 112   void emit(const contenter& i);
 113
 114   // vm entry abi
 115   void emit(const inittc& /*i*/) {}
 116   void emit(const calltc&);
 117   void emit(const leavetc&) { a.ret(); }
 118
 119   // exceptions
 120   void emit(const landingpad& /*i*/) {}
 121   void emit(const nothrow& i);
 122   void emit(const syncpoint& i);
 123   void emit(const unwind& i);
 124
 125   // instructions
 126   void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
 127   void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
 128   void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
 129   void emit(const andbim& i) { a.andb(i.s, i.m); }
 130   void emit(andw i) { commuteSF(i); a.andw(i.s0, i.d); }
 131   void emit(andwi i) { binary(i); a.andw(i.s0, i.d); }
 132   void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
 133   void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
 134   void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
 135   void emit(andqi i);
 136   void emit(const addwm& i) { a.addw(i.s0, i.m); }
 137   void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
 138   void emit(const addlm& i) { a.addl(i.s0, i.m); }
 139   void emit(const addlim& i);
 140   void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
 141   void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
 142   void emit(const addqmr& i);
 143   void emit(const addqrm& i);
 144   void emit(const addqim& i);
 145   void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
 146   void emit(const cloadq& i);
 147   template<class cmov> void emit_cmov(const cmov& i);
 148   void emit(const cmovb& i) { emit_cmov(i); }
 149   void emit(const cmovw& i) { emit_cmov(i); }
 150   void emit(const cmovl& i) { emit_cmov(i); }
 151   void emit(const cmovq& i) { emit_cmov(i); }
 152   void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
 153   void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
 154   void emit(const cmpbim& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
 155   void emit(const cmpbm& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
 156   void emit(const cmpw& i) { a.cmpw(i.s0, i.s1); }
 157   void emit(const cmpwi& i) { a.cmpw(i.s0, i.s1); }
 158   void emit(const cmpwim& i) { a.cmpw(i.s0, i.s1); }
 159   void emit(const cmpwm& i) { a.cmpw(i.s0, i.s1); }
 160   void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
 161   void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
 162   void emit(const cmplim& i) { a.cmpl(i.s0, i.s1); }
 163   void emit(const cmplm& i) { a.cmpl(i.s0, i.s1); }
 164   void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
 165   void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
 166   void emit(const cmpqim& i) { a.cmpq(i.s0, i.s1); }
 167   void emit(const cmpqm& i) { a.cmpq(i.s0, i.s1); }
 168   void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
 169   void emit(const cqo& /*i*/) { a.cqo(); }
 170   void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
 171   void emit(const cvtsi2sd& i);
 172   void emit(const cvtsi2sdm& i);
 173   void emit(decl i) { unary(i); a.decl(i.d); }
 174   void emit(const declm& i) { a.decl(i.m); }
 175   void emit(decq i) { unary(i); a.decq(i.d); }
 176   void emit(const decqm& i) { a.decq(i.m); }
 177   void emit(const decqmlock& i) { a.decqlock(i.m); }
 178   void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
 179   void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
 180   void emit(const idiv& i) { a.idiv(i.s); }
 181   void emit(incl i) { unary(i); a.incl(i.d); }
 182   void emit(const inclm& i) { a.incl(i.m); }
 183   void emit(incq i) { unary(i); a.incq(i.d); }
 184   void emit(const incqm& i) { a.incq(i.m); }
 185   void emit(const incwm& i) { a.incw(i.m); }
 186   void emit(const jcc& i);
 187   void emit(const jcci& i);
 188   void emit(const jmp& i);
 189   void emit(const jmpr& i) { a.jmp(i.target); }
 190   void emit(const jmpm& i) { a.jmp(i.target); }
 191   void emit(const jmpi& i);
 192   void emit(const lea& i);
 193   void emit(const leap& i) { a.lea(i.s, i.d); }
 194   void emit(const leav& i);
 195   void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
 196   void emit(const loadups& i) { a.movups(i.s, i.d); }
 197   void emit(const loadtqb& i) { a.loadb(i.s, i.d); }
 198   void emit(const loadb& i) { a.loadb(i.s, i.d); }
 199   void emit(const loadw& i) { a.loadw(i.s, i.d); }
 200   void emit(const loadtql& i) { a.loadl(i.s, i.d); }
 201   void emit(const loadl& i) { a.loadl(i.s, i.d); }
 202   void emit(const loadqp& i) { a.loadq(i.s, i.d); }
 203   void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
 204   void emit(const loadsd& i) { a.movsd(i.s, i.d); }
 205   void emit(const loadzbl& i) { a.loadzbl(i.s, i.d); }
 206   void emit(const loadzbq& i) { a.loadzbl(i.s, Reg32(i.d)); }
 207   void emit(const loadsbq& i) { a.loadsbq(i.s, i.d); }
 208   void emit(const loadzlq& i) { a.loadl(i.s, Reg32(i.d)); }
 209   void emit(const movb& i) { a.movb(i.s, i.d); }
 210   void emit(const movl& i) { a.movl(i.s, i.d); }
 211   void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
 212   void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
 213   void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
 214   void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
 215   void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
 216   void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
 217   void emit(const movsbq& i) { a.movsbq(i.s, i.d); }
 218   void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
 219   void emit(neg i) { unary(i); a.neg(i.d); }
 220   void emit(const nop& /*i*/) { a.nop(); }
 221   void emit(not i) { unary(i); a.not(i.d); }
 222   void emit(notb i) { unary(i); a.notb(i.d); }
 223   void emit(orbi i) { binary(i); a.orb(i.s0, i.d); }
 224   void emit(const orbim& i) { a.orb(i.s0, i.m); }
 225   void emit(const orwim& i) { a.orw(i.s0, i.m); }
 226   void emit(const orlim& i) { a.orl(i.s0, i.m); }
 227   void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
 228   void emit(orwi i) { binary(i); a.orw(i.s0, i.d); }
 229   void emit(orli i) { binary(i); a.orl(i.s0, i.d); }
 230   void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
 231   void emit(const orqim& i) { a.orq(i.s0, i.m); }
 232   void emit(const pop& i) { a.pop(i.d); }
 233   void emit(const popm& i) { a.pop(i.d); }
 234   void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
 235   void emit(const push& i) { a.push(i.s); }
 236   void emit(const pushm& i) { a.push(i.s); }
 237   void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
 238   void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
 239   void emit(const sarq& i) { unary(i); a.sarq(i.d); }
 240   void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
 241   void emit(const setcc& i) { a.setcc(i.cc, i.d); }
 242   void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
 243   void emit(shlq i) { unary(i); a.shlq(i.d); }
 244   void emit(shrq i) { unary(i); a.shrq(i.d); }
 245   void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
 246   void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
 247   void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
 248   void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
 249   void emit(const storeups& i) { a.movups(i.s, i.m); }
 250   void emit(const storeb& i) { a.storeb(i.s, i.m); }
 251   void emit(const storebi& i);
 252   void emit(const storel& i) { a.storel(i.s, i.m); }
 253   void emit(const storeli& i) { a.storel(i.s, i.m); }
 254   void emit(const storeqi& i);
 255   void emit(const storesd& i) { a.movsd(i.s, i.m); }
 256   void emit(const storew& i) { a.storew(i.s, i.m); }
 257   void emit(const storewi& i) { a.storew(i.s, i.m); }
 258   void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
 259   void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
 260   void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
 261   void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
 262   void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
 263   void emit(const testb& i) { a.testb(i.s0, i.s1); }
 264   void emit(const testbi& i) { a.testb(i.s0, i.s1); }
 265   void emit(const testbm& i) { a.testb(i.s0, i.s1); }
 266   void emit(const testbim& i) { a.testb(i.s0, i.s1); }
 267   void emit(const testw& i) { a.testw(i.s0, i.s1); }
 268   void emit(const testwi& i);
 269   void emit(const testwm& i) { a.testw(i.s0, i.s1); }
 270   void emit(const testwim& i);
 271   void emit(const testl& i) { a.testl(i.s0, i.s1); }
 272   void emit(const testli& i);
 273   void emit(const testlm& i) { a.testl(i.s0, i.s1); }
 274   void emit(const testlim& i);
 275   void emit(const testq& i) { a.testq(i.s0, i.s1); }
 276   void emit(const testqi& i);
 277   void emit(const testqm& i) { a.testq(i.s0, i.s1); }
 278   void emit(const testqim& i);
 279   void emit(const trap& i);
 280   void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
 281   void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
 282   void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
 283   void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
 284   void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
 285   void emit(xorq i);
 286   void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
 287   void emit(const conjure& /*i*/) { always_assert(false); }
 288   void emit(const conjureuse& /*i*/) { always_assert(false); }
 289
 290   void emit_nop() {
 291     emit(lea{rax[8], rax});
 292     emit(lea{rax[-8], rax});
 293   }
 294
 295 private:
 296   // helpers
 297   void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
 298   void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
 299   void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
 300   void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
 301   void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }
 302   void emit_simd_imm(int64_t, Vreg);
 303
 304   template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
 305   template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
 306
 307   template<class Inst> void commuteSF(Inst&);
 308   template<class Inst> void commute(Inst&);
 309   template<class Inst> void noncommute(Inst&);
 310
 311   CodeBlock& frozen() { return env.text.frozen().code; }
 312
 313 private:
 314   Venv& env;
 315   X64Asm a;
 316
 317   const Vlabel current;
 318   const Vlabel next;
 319   jit::vector<Venv::LabelPatch>& jmps;
 320   jit::vector<Venv::LabelPatch>& jccs;
 321   jit::vector<Venv::LabelPatch>& catches;
 322 };
 323
 324 ///////////////////////////////////////////////////////////////////////////////
 325 /*
 326  * Prepare a binary op that is not commutative.
 327  *
 328  * s0 must be a different register than s1 so we don't clobber it.
 329  */
 330 template<class X64Asm>
 331 template<class Inst> void Vgen<X64Asm>::noncommute(Inst& i) {
 332   assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
 333   binary(i);
 334 }
 335
 336 /*
 337  * Prepare a binary op that is commutative.
 338  *
 339  * Swap operands if the dest is s0.
 340  */
 341 template<class X64Asm>
 342 template<class Inst> void Vgen<X64Asm>::commuteSF(Inst& i) {
 343   if (i.s1 != i.d && i.s0 == i.d) {
 344     i = Inst{i.s1, i.s0, i.d, i.sf};
 345   } else {
 346     binary(i);
 347   }
 348 }
 349
 350 template<class X64Asm>
 351 template<class Inst> void Vgen<X64Asm>::commute(Inst& i) {
 352   if (i.s1 != i.d && i.s0 == i.d) {
 353     i = Inst{i.s1, i.s0, i.d};
 354   } else {
 355     binary(i);
 356   }
 357 }
 358
 359 ///////////////////////////////////////////////////////////////////////////////
 360
 361 /*
 362  * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 363  * will also be taken.
 364  */
 365 bool ccImplies(ConditionCode a, ConditionCode b) {
 366   if (a == b) return true;
 367
 368   switch (a) {
 369     case CC_None:
 370     case CC_O:  case CC_NO:
 371     case CC_AE: case CC_BE:
 372     case CC_NE:
 373     case CC_S:  case CC_NS:
 374     case CC_P:  case CC_NP:
 375     case CC_GE: case CC_LE:
 376       return false;
 377
 378     case CC_B: return b == CC_BE;
 379     case CC_E: return b == CC_BE || b == CC_LE;
 380     case CC_A: return b == CC_AE || b == CC_NE;
 381     case CC_L: return b == CC_LE;
 382     case CC_G: return b == CC_NE || b == CC_GE;
 383   }
 384   always_assert(false);
 385 }
 386
 387 static CodeAddress toReal(Venv& env, CodeAddress a) {
 388   if (env.text.main().code.contains(a)) {
 389     return env.text.main().code.toDestAddress(a);
 390   }
 391   if (env.text.cold().code.contains(a)) {
 392     return env.text.cold().code.toDestAddress(a);
 393   }
 394   if (env.text.frozen().code.contains(a)) {
 395     return env.text.frozen().code.toDestAddress(a);
 396   }
 397   return a;
 398 }
 399
 400 /*
 401  * When two jccs go to the same destination, the cc of the first is compatible
 402  * with the cc of the second, and they're within a one-byte offset of each
 403  * other, retarget the first to jump to the second. This will allow the
 404  * relocator to shrink the first one, and the extra jmp shouldn't matter since
 405  * we try to only do this to rarely taken jumps.
 406  */
 407 void retargetJumps(Venv& env,
 408                    const jit::hash_map<TCA, jit::vector<TCA>>& jccs) {
 409   jit::hash_set<TCA> retargeted;
 410   for (auto& pair : jccs) {
 411     auto const& jmps = pair.second;
 412     if (jmps.size() < 2) continue;
 413
 414     for (size_t i = 0; i < jmps.size(); ++i) {
 415       DecodedInstruction di(toReal(env, jmps[i]), jmps[i]);
 416       // Don't bother if the jump is already a short jump.
 417       if (di.size() != 6) continue;
 418
 419       for (size_t j = jmps.size() - 1; j > i; --j) {
 420         auto const delta = jmps[j] - jmps[i] + 2;
 421         // Backwards jumps are probably not guards, and don't retarget to a
 422         // dest that's more than a one-byte offset away.
 423         if (delta < 0 || !deltaFits(delta, sz::byte)) continue;
 424
 425         DecodedInstruction dj(toReal(env, jmps[j]), jmps[j]);
 426         if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;
 427
 428         di.setPicAddress(jmps[j]);
 429         retargeted.insert(jmps[i]);
 430
 431         // We might've converted a smashable jump to a regular in-unit jump, so
 432         // remove any smashable alignments.
 433         auto range = env.meta.alignments.equal_range(jmps[i]);
 434         while (range.first != range.second) {
 435           auto iter = range.first;
 436           ++range.first;
 437
 438           auto& align = iter->second;
 439           if (align.first == Alignment::SmashJcc &&
 440               align.second == AlignContext::Live) {
 441             env.meta.alignments.erase(iter);
 442           }
 443         }
 444
 445         break;
 446       }
 447     }
 448   }
 449
 450   // Finally, remove any retargeted jmps from inProgressTailJumps.
 451   if (!retargeted.empty()) {
 452     GrowableVector<IncomingBranch> newTailJumps;
 453     for (auto& jmp : env.meta.inProgressTailJumps) {
 454       if (retargeted.count(jmp.toSmash()) == 0) {
 455         newTailJumps.push_back(jmp);
 456       }
 457     }
 458     env.meta.inProgressTailJumps.swap(newTailJumps);
 459   }
 460   // If the retarged jumps were smashable, now they aren't anymore, so remove
 461   // them from smashableJumpData.
 462   for (auto jmp : retargeted) {
 463     if (env.meta.smashableJumpData.erase(jmp) > 0) {
 464       FTRACE(3, "retargetJumps: removed {} from smashableJumpData\n", jmp);
 465     }
 466   }
 467 }
 468
 469 template<class X64Asm>
 470 void Vgen<X64Asm>::patch(Venv& env) {
 471   for (auto const& p : env.jmps) {
 472     assertx(env.addrs[p.target]);
 473     X64Asm::patchJmp(toReal(env, p.instr), p.instr, env.addrs[p.target]);
 474   }
 475
 476   auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
 477   jit::hash_map<TCA, jit::vector<TCA>> jccs;
 478   for (auto const& p : env.jccs) {
 479     assertx(env.addrs[p.target]);
 480     X64Asm::patchJcc(toReal(env, p.instr), p.instr, env.addrs[p.target]);
 481     if (optLevel >= 2 ||
 482         (optLevel == 1 && p.target >= env.unit.blocks.size())) {
 483       jccs[env.addrs[p.target]].emplace_back(p.instr);
 484     }
 485   }
 486
 487   if (!jccs.empty()) retargetJumps(env, jccs);
 488
 489   for (auto const& p : env.leas) {
 490     assertx(env.vaddrs[p.target]);
 491     DecodedInstruction di(toReal(env, p.instr), p.instr);
 492     assertx(di.hasPicOffset());
 493     di.setPicAddress(env.vaddrs[p.target]);
 494   }
 495 }
 496
 497 template<class X64Asm>
 498 void Vgen<X64Asm>::pad(CodeBlock& cb) {
 499   X64Asm a { cb };
 500   a.pad();
 501 }
 502
 503 ///////////////////////////////////////////////////////////////////////////////
 504
 505 template<class X64Asm>
 506 void Vgen<X64Asm>::emit(const copy& i) {
 507   if (i.s == i.d) return;
 508   if (i.s.isGP()) {
 509     if (i.d.isGP()) {                 // GP => GP
 510       a.movq(i.s, i.d);
 511     } else {                             // GP => XMM
 512       assertx(i.d.isSIMD());
 513       // This generates a movq x86 instruction, which zero extends
 514       // the 64-bit value in srcReg into a 128-bit XMM register
 515       a.movq_rx(i.s, i.d);
 516     }
 517   } else {
 518     if (i.d.isGP()) {                 // XMM => GP
 519       a.movq_xr(i.s, i.d);
 520     } else {                             // XMM => XMM
 521       assertx(i.d.isSIMD());
 522       // This copies all 128 bits in XMM,
 523       // thus avoiding partial register stalls
 524       a.movdqa(i.s, i.d);
 525     }
 526   }
 527 }
 528
 529 template<class X64Asm>
 530 void Vgen<X64Asm>::emit(const copy2& i) {
 531   assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
 532   auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
 533   assertx(d0 != d1);
 534   if (d0 == s1) {
 535     if (d1 == s0) {
 536       a.xchgq(d0, d1);
 537     } else {
 538       // could do this in a simplify pass
 539       if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
 540       if (s0 != d0) a.movq(s0, d0);
 541     }
 542   } else {
 543     // could do this in a simplify pass
 544     if (s0 != d0) a.movq(s0, d0);
 545     if (s1 != d1) a.movq(s1, d1);
 546   }
 547 }
 548
 549 template<class X64Asm>
 550 void Vgen<X64Asm>::emit_simd_imm(int64_t val, Vreg d) {
 551   if (val == 0) {
 552     a.pxor(d, d); // does not modify flags
 553   } else {
 554     auto addr = alloc_literal(env, val);
 555     a.movsd(rip[(intptr_t)addr], d);
 556   }
 557 }
 558
 559 template<class X64Asm>
 560 void Vgen<X64Asm>::emit(const ldimmb& i) {
 561   // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
 562   auto val = i.s.ub();
 563   if (i.d.isGP()) {
 564     Vreg8 d8 = i.d;
 565     a.movb(static_cast<int8_t>(val), d8);
 566   } else {
 567     emit_simd_imm(val, i.d);
 568   }
 569 }
 570
 571 template<class X64Asm>
 572 void Vgen<X64Asm>::emit(const ldimml& i) {
 573   // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
 574   auto val = i.s.l();
 575   if (i.d.isGP()) {
 576     Vreg32 d32 = i.d;
 577     a.movl(val, d32);
 578   } else {
 579     emit_simd_imm(uint32_t(val), i.d);
 580   }
 581 }
 582
 583 template<class X64Asm>
 584 void Vgen<X64Asm>::emit(const ldimmq& i) {
 585   auto val = i.s.q();
 586   if (i.d.isGP()) {
 587     if (val == 0) {
 588       Vreg32 d32 = i.d;
 589       a.movl(0, d32); // because emitImmReg tries the xor optimization
 590     } else {
 591       a.emitImmReg(i.s, i.d);
 592     }
 593   } else {
 594     emit_simd_imm(val, i.d);
 595   }
 596 }
 597
 598 template<class X64Asm>
 599 void Vgen<X64Asm>::emit(const load& i) {
 600   auto mref = i.s.mr();
 601   a.prefix(mref);
 602   if (i.d.isGP()) {
 603     a.loadq(mref, i.d);
 604   } else {
 605     assertx(i.d.isSIMD());
 606     a.movsd(mref, i.d);
 607   }
 608 }
 609
 610 template<class X64Asm>
 611 void Vgen<X64Asm>::emit(const store& i) {
 612   auto const mref = i.d.mr();
 613   a.prefix(mref);
 614   if (i.s.isGP()) {
 615     a.storeq(i.s, i.d);
 616   } else {
 617     assertx(i.s.isSIMD());
 618     a.movsd(i.s, i.d);
 619   }
 620 }
 621
 622 ///////////////////////////////////////////////////////////////////////////////
 623
 624 template<class X64Asm>
 625 void Vgen<X64Asm>::emit(const mcprep& i) {
 626   /*
 627    * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
 628    * address of the movq) so that we can find the movq from the handler.
 629    *
 630    * We set the low bit for two reasons: the Class* will never be a valid
 631    * Class*, so we'll always miss the inline check before it's smashed, and
 632    * MethodCache::handleStaticCall can tell it's not been smashed yet
 633    */
 634   auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
 635   auto const imm = reinterpret_cast<uint64_t>(mov_addr);
 636   smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);
 637
 638   env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
 639 }
 640
 641 ///////////////////////////////////////////////////////////////////////////////
 642
 643 template<class X64Asm>
 644 void Vgen<X64Asm>::emit(const call& i) {
 645   if (a.jmpDeltaFits(i.target)) {
 646     a.call(i.target);
 647   } else {
 648     // can't do a near call; store address in data section.
 649     // call by loading the address using rip-relative addressing.  This
 650     // assumes the data section is near the current code section.  Since
 651     // this sequence is directly in-line, rip-relative like this is
 652     // more compact than loading a 64-bit immediate.
 653     auto addr = alloc_literal(env, (uint64_t)i.target);
 654     a.call(rip[(intptr_t)addr]);
 655   }
 656   if (i.watch) {
 657     *i.watch = a.frontier();
 658     env.meta.watchpoints.push_back(i.watch);
 659   }
 660 }
 661
 662 template<class X64Asm>
 663 void Vgen<X64Asm>::emit(const calls& i) {
 664   emitSmashableCall(a.code(), env.meta, i.target);
 665 }
 666
 667 ///////////////////////////////////////////////////////////////////////////////
 668
 669 template<class X64Asm>
 670 void Vgen<X64Asm>::emit(const stubret& i) {
 671   if (i.saveframe) {
 672     a.pop(rvmfp());
 673   } else {
 674     a.addq(8, reg::rsp);
 675   }
 676   a.ret();
 677 }
 678
 679 template<class X64Asm>
 680 void Vgen<X64Asm>::emit(const callstub& i) {
 681   emit(call{i.target, i.args});
 682 }
 683
 684 template<class X64Asm>
 685 void Vgen<X64Asm>::emit(const callfaststub& i) {
 686   emit(call{i.target, i.args});
 687   emit(syncpoint{i.fix});
 688 }
 689
 690 template<class X64Asm>
 691 void Vgen<X64Asm>::emit(const tailcallstub& i) {
 692   a.addq(8, reg::rsp);
 693   emit(jmpi{i.target, i.args});
 694 }
 695
 696 ///////////////////////////////////////////////////////////////////////////////
 697
 698 template<class X64Asm>
 699 void Vgen<X64Asm>::emit(const phpret& i) {
 700   a.push(i.fp[AROFF(m_savedRip)]);
 701   if (!i.noframe) {
 702     a.loadq(i.fp[AROFF(m_sfp)], i.d);
 703   }
 704   a.ret();
 705 }
 706
 707 template<class X64Asm>
 708 void Vgen<X64Asm>::emit(const tailcallphp& i) {
 709   emit(pushm{i.fp[AROFF(m_savedRip)]});
 710   emit(jmpr{i.target, i.args});
 711 }
 712
 713 template<class X64Asm>
 714 void Vgen<X64Asm>::emit(const callunpack& i) {
 715   emit(call{i.target, i.args});
 716 }
 717
 718 template<class X64Asm>
 719 void Vgen<X64Asm>::emit(const contenter& i) {
 720   Label Stub, End;
 721   Reg64 fp = i.fp, target = i.target;
 722   a.jmp8(End);
 723
 724   asm_label(a, Stub);
 725   a.pop(fp[AROFF(m_savedRip)]);
 726   a.jmp(target);
 727
 728   asm_label(a, End);
 729   a.call(Stub);
 730   // m_savedRip will point here.
 731   emit(unwind{{i.targets[0], i.targets[1]}});
 732 }
 733
 734 ///////////////////////////////////////////////////////////////////////////////
 735 template<class X64Asm>
 736 void Vgen<X64Asm>::emit(const calltc& i) {
 737   a.push(i.exittc);
 738   a.push(i.fp[AROFF(m_savedRip)]);
 739
 740   Label stub;
 741   a.call(stub);
 742
 743   asm_label(a, stub);
 744   assertx(!i.args.contains(reg::rax));
 745   a.pop(reg::rax);  // unused
 746   a.jmp(i.target);
 747 }
 748
 749 ///////////////////////////////////////////////////////////////////////////////
 750
 751 template<class X64Asm>
 752 void Vgen<X64Asm>::emit(const nothrow& /*i*/) {
 753   env.meta.catches.emplace_back(a.frontier(), nullptr);
 754   env.record_inline_stack(a.frontier());
 755 }
 756
 757 template<class X64Asm>
 758 void Vgen<X64Asm>::emit(const syncpoint& i) {
 759   FTRACE(5, "IR recordSyncPoint: {} {} {}\n", a.frontier(),
 760          i.fix.pcOffset, i.fix.spOffset);
 761   env.meta.fixups.emplace_back(a.frontier(), i.fix);
 762   env.record_inline_stack(a.frontier());
 763 }
 764
 765 template<class X64Asm>
 766 void Vgen<X64Asm>::emit(const unwind& i) {
 767   catches.push_back({a.frontier(), i.targets[1]});
 768   env.record_inline_stack(a.frontier());
 769   emit(jmp{i.targets[0]});
 770 }
 771
 772 ///////////////////////////////////////////////////////////////////////////////
 773
 774 template<class X64Asm>
 775 void Vgen<X64Asm>::emit(andqi i) {
 776   if (magFits(i.s0.q(), sz::dword)) {
 777     emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});
 778     return;
 779   }
 780
 781   binary(i);
 782   a.andq(i.s0, i.d);
 783 }
 784
 785 template<class X64Asm>
 786 void Vgen<X64Asm>::emit(const addlim& i) {
 787   auto mref = i.m.mr();
 788   a.prefix(mref).addl(i.s0, mref);
 789 }
 790
 791 template<typename X64Asm>
 792 void Vgen<X64Asm>::emit(const addqmr& i) {
 793   binary(i);
 794   auto const mref = i.m.mr();
 795   a.prefix(mref).addq(mref, i.d);
 796 }
 797
 798 template<typename X64Asm>
 799 void Vgen<X64Asm>::emit(const addqrm& i) {
 800   auto const mref = i.m.mr();
 801   a.prefix(mref).addq(i.s1, mref);
 802 }
 803
 804 template<class X64Asm>
 805 void Vgen<X64Asm>::emit(const addqim& i) {
 806   auto mref = i.m.mr();
 807   a.prefix(mref).addq(i.s0, mref);
 808 }
 809
 810 template<class X64Asm>
 811 void Vgen<X64Asm>::emit(const cloadq& i) {
 812   auto m = i.t;
 813   always_assert(!m.index.isValid()); // not supported, but could be later.
 814   if (i.f != i.d) {
 815     if (i.d == m.base) {
 816       // We can't move f over d or we'll clobber the Vptr we need to load from.
 817       // Since cload does the load unconditionally anyway, we can just load and
 818       // cmov.
 819       a.loadq(i.t, i.d);
 820       a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
 821       return;
 822     }
 823     a.movq(i.f, i.d);
 824   }
 825   a.cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
 826 }
 827
 828 // add s0 s1 d => mov s1->d; d += s0
 829 // cmov cc s d => if cc { mov s->d }
 830 template<class X64Asm>
 831 template<class cmov>
 832 void Vgen<X64Asm>::emit_cmov(const cmov& i) {
 833   if (i.f != i.d && i.t == i.d) {
 834     // negate the condition and swap t/f operands so we dont clobber i.t
 835     return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
 836   } else {
 837     prep(i.f, i.d);
 838   }
 839   a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));
 840 }
 841
 842 template<class X64Asm>
 843 void Vgen<X64Asm>::emit(const cvtsi2sd& i) {
 844   a.pxor(i.d, i.d);
 845   a.cvtsi2sd(i.s, i.d);
 846 }
 847
 848 template<class X64Asm>
 849 void Vgen<X64Asm>::emit(const cvtsi2sdm& i) {
 850   a.pxor(i.d, i.d);
 851   a.cvtsi2sd(i.s, i.d);
 852 }
 853
 854 template<class X64Asm>
 855 void Vgen<X64Asm>::emit(const jcc& i) {
 856   if (i.targets[1] != i.targets[0]) {
 857     if (next == i.targets[1]) {
 858       return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
 859     }
 860     auto taken = i.targets[1];
 861     jccs.push_back({a.frontier(), taken});
 862     a.jcc(i.cc, a.frontier());
 863   }
 864   emit(jmp{i.targets[0]});
 865 }
 866
 867 template<class X64Asm>
 868 void Vgen<X64Asm>::emit(const jcci& i) {
 869   a.jcc(i.cc, i.taken);
 870   emit(jmp{i.target});
 871 }
 872
 873 template<class X64Asm>
 874 void Vgen<X64Asm>::emit(const jmp& i) {
 875   if (next == i.target) return;
 876   jmps.push_back({a.frontier(), i.target});
 877   a.jmp(a.frontier());
 878 }
 879
 880 template<class X64Asm>
 881 void Vgen<X64Asm>::emit(const jmpi& i) {
 882   if (a.jmpDeltaFits(i.target)) {
 883     a.jmp(i.target);
 884   } else {
 885     // can't do a near jmp - use rip-relative addressing
 886     auto addr = alloc_literal(env, (uint64_t)i.target);
 887     a.jmp(rip[(intptr_t)addr]);
 888   }
 889 }
 890
 891 template<class X64Asm>
 892 void Vgen<X64Asm>::emit(const lea& i) {
 893   // could do this in a simplify pass
 894   if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
 895     emit(copy{i.s.base, i.d});
 896   } else {
 897     a.lea(i.s, i.d);
 898   }
 899 }
 900
 901 template<class X64Asm>
 902 void Vgen<X64Asm>::emit(const leav& i) {
 903   auto const addr = a.frontier();
 904   emit(leap{reg::rip[0xdeadbeef], i.d});
 905   env.leas.push_back({addr, i.s});
 906 }
 907
 908 template<class X64Asm>
 909 void Vgen<X64Asm>::emit(const storebi& i) {
 910   auto mref = i.m.mr();
 911   a.prefix(mref).storeb(i.s, mref);
 912 }
 913
 914 template<class X64Asm>
 915 void Vgen<X64Asm>::emit(const storeqi& i) {
 916   auto mref = i.m.mr();
 917   a.prefix(mref).storeq(i.s, mref);
 918 }
 919
 920 template<class VgenImpl, typename Inst>
 921 bool testimHelper(VgenImpl& env, const Inst& i, uint64_t mask) {
 922   // If there's only 1 byte of meaningful bits in the mask, we can adjust the
 923   // pointer offset and use testbim instead.
 924   int off = 0;
 925   while (mask > 0xff && !(mask & 0xff)) {
 926     off++;
 927     mask >>= 8;
 928   }
 929
 930   if (mask > 0xff) return false;
 931
 932   env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
 933   return true;
 934 }
 935
 936 template<class X64Asm>
 937 void Vgen<X64Asm>::emit(const testwi& i) {
 938   if (i.s0.w() == -1) {
 939     return emit(testw{i.s1, i.s1, i.sf});
 940   }
 941   a.testw(i.s0, i.s1);
 942 }
 943
 944 template<class X64Asm>
 945 void Vgen<X64Asm>::Vgen::emit(const testwim& i) {
 946   if (testimHelper(*this, i, i.s0.w())) return;
 947   a.testw(i.s0, i.s1);
 948 }
 949
 950 template<class X64Asm>
 951 void Vgen<X64Asm>::Vgen::emit(const testlim& i) {
 952   if (testimHelper(*this, i, i.s0.l())) return;
 953   a.testl(i.s0, i.s1);
 954 }
 955
 956 template<class X64Asm>
 957 void Vgen<X64Asm>::Vgen::emit(const testli& i) {
 958   if (i.s0.l() == -1) {
 959     return emit(testl{i.s1, i.s1, i.sf});
 960   }
 961   a.testl(i.s0, i.s1);
 962 }
 963
 964 template<class X64Asm>
 965 void Vgen<X64Asm>::emit(const testqi& i) {
 966   auto const imm = i.s0.q();
 967   if (magFits(imm, sz::byte)) {
 968     a.testb(int8_t(imm), rbyte(i.s1));
 969   } else if (magFits(imm, sz::dword)) {
 970     emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
 971   } else if (imm == -1) {
 972     emit(testq{i.s1, i.s1, i.sf});
 973   } else {
 974     a.testq(i.s0, i.s1);
 975   }
 976 }
 977
 978 template<class X64Asm>
 979 void Vgen<X64Asm>::emit(const testqim& i) {
 980   if (testimHelper(*this, i, i.s0.q())) return;
 981   if (magFits(i.s0.q(), sz::dword)) {
 982     // For an unsigned 32 bit immediate, we can get the same results
 983     // by emitting a testlim.
 984     emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});
 985   } else {
 986     a.testq(i.s0, i.s1);
 987   }
 988 }
 989
 990 template<class X64Asm>
 991 void Vgen<X64Asm>::emit(const trap& i) {
 992   env.meta.trapReasons.emplace_back(a.frontier(), i.reason);
 993   a.ud2();
 994 }
 995
 996 template<class X64Asm>
 997 void Vgen<X64Asm>::emit(xorq i) {
 998   if (i.s0 == i.s1) {
 999     // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
1000     return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});
1001   }
1002   commuteSF(i);
1003   a.xorq(i.s0, i.d);
1004 }
1005
1006 ///////////////////////////////////////////////////////////////////////////////
1007
1008 template<typename Lower>
1009 void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
1010   vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
1011 }
1012
1013 template <typename Inst>
1014 void lower(Vunit& /*unit*/, Inst& /*inst*/, Vlabel /*b*/, size_t /*i*/) {}
1015
1016 ///////////////////////////////////////////////////////////////////////////////
1017
1018 void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
1019   lower_impl(unit, b, i, [&] (Vout& v) {
1020     v << pop{inst.d0};
1021     v << pop{inst.d1};
1022   });
1023 }
1024
1025 void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
1026   lower_impl(unit, b, i, [&] (Vout& v) {
1027     v << popm{inst.d0};
1028     v << popm{inst.d1};
1029   });
1030 }
1031
1032 void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
1033   lower_impl(unit, b, i, [&] (Vout& v) {
1034     v << push{inst.s0};
1035     v << push{inst.s1};
1036   });
1037 }
1038
1039 void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
1040   lower_impl(unit, b, i, [&] (Vout& v) {
1041     v << pushm{inst.s0};
1042     v << pushm{inst.s1};
1043   });
1044 }
1045
1046 ///////////////////////////////////////////////////////////////////////////////
1047
1048 void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) {
1049   if (inst.saveframe) {
1050     unit.blocks[b].code[i] = push{rvmfp()};
1051   } else {
1052     unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp};
1053   }
1054 }
1055
1056 void lower(Vunit& unit, stubunwind& /*inst*/, Vlabel b, size_t i) {
1057   unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
1058 }
1059
1060 void lower(Vunit& unit, stubtophp& /*inst*/, Vlabel b, size_t i) {
1061   unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
1062 }
1063
1064 void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) {
1065   unit.blocks[b].code[i] = load{reg::rsp[8], inst.d};
1066 }
1067
1068 void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) {
1069   unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]};
1070 }
1071
1072 void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) {
1073   lower_impl(unit, b, i, [&] (Vout& v) {
1074     v << callr{inst.target, inst.args};
1075     v << jmpi{inst.exittc};
1076   });
1077 }
1078
1079 ///////////////////////////////////////////////////////////////////////////////
1080
1081 void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) {
1082   lower_impl(unit, b, i, [&] (Vout& v) {
1083     v << copy{inst.s0, rcx};
1084     v << sarq{inst.s1, inst.d, inst.sf};
1085   });
1086 }
1087
1088 void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) {
1089   lower_impl(unit, b, i, [&] (Vout& v) {
1090     v << copy{inst.s0, rcx};
1091     v << shlq{inst.s1, inst.d, inst.sf};
1092   });
1093 }
1094
1095 void lower(Vunit& unit, shr& inst, Vlabel b, size_t i) {
1096   lower_impl(unit, b, i, [&] (Vout& v) {
1097     v << copy{inst.s0, rcx};
1098     v << shrq{inst.s1, inst.d, inst.sf};
1099   });
1100 }
1101
1102 void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) {
1103   lower_impl(unit, b, i, [&] (Vout& v) {
1104     v << copy{inst.s0, rax};
1105     v << cqo{};                      // sign-extend rax => rdx:rax
1106     v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
1107     v << copy{rdx, inst.d};
1108   });
1109 }
1110
1111 void lower(Vunit& unit, divint& inst, Vlabel b, size_t i) {
1112   lower_impl(unit, b, i, [&] (Vout& v) {
1113     v << copy{inst.s0, rax};
1114     v << cqo{};                      // sign-extend rax => rdx:rax
1115     v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
1116     v << copy{rax, inst.d};
1117   });
1118 }
1119
1120 ///////////////////////////////////////////////////////////////////////////////
1121
1122 void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) {
1123   unit.blocks[b].code[i] = copy{inst.s, inst.d};
1124 }
1125 void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) {
1126   unit.blocks[b].code[i] = copy{inst.s, inst.d};
1127 }
1128 void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) {
1129   unit.blocks[b].code[i] = copy{inst.s, inst.d};
1130 }
1131 void lower(Vunit& unit, movtqw& inst, Vlabel b, size_t i) {
1132   unit.blocks[b].code[i] = copy{inst.s, inst.d};
1133 }
1134 void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) {
1135   unit.blocks[b].code[i] = copy{inst.s, inst.d};
1136 }
1137
1138 ///////////////////////////////////////////////////////////////////////////////
1139
1140 /*
1141  * Lower a few abstractions to facilitate straightforward x64 codegen.
1142  */
1143 void lowerForX64(Vunit& unit) {
1144   vasm_lower(unit, [&](const VLS& /*env*/, Vinstr& inst, Vlabel b, size_t i) {
1145     switch (inst.op) {
1146 #define O(name, ...)                      \
1147       case Vinstr::name:                  \
1148         lower(unit, inst.name##_, b, i);  \
1149         break;
1150
1151       VASM_OPCODES
1152 #undef O
1153     }
1154   });
1155 }
1156
1157 ///////////////////////////////////////////////////////////////////////////////
1158 }
1159
1160 void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) {
1161   Timer timer(Timer::vasm_optimize, unit.log_entry);
1162
1163   auto const doPass = [&] (const char* name, auto fun) {
1164     rqtrace::EventGuard trace{name};
1165     fun(unit);
1166   };
1167
1168   doPass("VOPT_NOP",    removeTrivialNops);
1169   doPass("VOPT_PHI",    optimizePhis);
1170   doPass("VOPT_BRANCH", fuseBranches);
1171   doPass("VOPT_JMP",    [] (Vunit& u) { optimizeJmps(u); });
1172   doPass("VOPT_EXIT",   [] (Vunit& u) { optimizeExits(u); });
1173
1174   assertx(checkWidths(unit));
1175
1176   if (unit.context && !isProfiling(unit.context->kind) && abi.canSpill &&
1177       RuntimeOption::EvalProfBranchSampleFreq > 0) {
1178     // Even when branch profiling is on, we still only want to profile
1179     // non-profiling translations of PHP functions.  We also require that we
1180     // can spill, so that we can generate arbitrary profiling code, and also to
1181     // ensure we don't profile unique stubs and such.
1182     doPass("VOPT_PROF_BRANCH", profile_branches);
1183   }
1184
1185   doPass("VOPT_X64",      lowerForX64);
1186   doPass("VOPT_SIMPLIFY", simplify);
1187   doPass("VOPT_X64",      lowerForX64);
1188
1189   if (!unit.constToReg.empty()) {
1190     doPass("VOPT_FOLD_IMM", foldImms<x64::ImmFolder>);
1191   }
1192
1193   doPass("VOPT_COPY", [&] (Vunit& u) { optimizeCopies(u, abi); });
1194
1195   if (unit.needsRegAlloc()) {
1196     doPass("VOPT_DCE", [] (Vunit& u) { removeDeadCode(u); });
1197     doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u); });
1198     doPass("VOPT_DCE", [] (Vunit& u) { removeDeadCode(u); });
1199     if (regalloc) {
1200       if (RuntimeOption::EvalUseGraphColor &&
1201           unit.context &&
1202           (unit.context->kind == TransKind::Optimize ||
1203            unit.context->kind == TransKind::OptPrologue)) {
1204         rqtrace::EventGuard trace{"VOPT_GRAPH_COLOR"};
1205         allocateRegistersWithGraphColor(unit, abi);
1206       } else {
1207         rqtrace::EventGuard trace{"VOPT_XLS"};
1208         allocateRegistersWithXLS(unit, abi);
1209       }
1210       doPass("VOPT_SF_PEEPHOLES", [&] (Vunit& u) { sfPeepholes(u, abi); });
1211       doPass("VOPT_POST_RA_SIMPLIFY", postRASimplify);
1212     }
1213   }
1214   if (unit.blocks.size() > 1) {
1215     doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u); });
1216   }
1217 }
1218
1219 void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups,
1220              AsmInfo* asmInfo) {
1221 #ifdef HAVE_LIBXED
1222   if (RuntimeOption::EvalUseXedAssembler) {
1223     return vasm_emit<Vgen<XedAssembler>>(unit, text, fixups, asmInfo);
1224   }
1225 #endif
1226   vasm_emit<Vgen<X64Assembler>>(unit, text, fixups, asmInfo);
1227 }
1228
1229 ///////////////////////////////////////////////////////////////////////////////
1230 }}