/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/vm/jit/vasm-emit.h"

#include "hphp/runtime/base/runtime-option.h"
#include "hphp/runtime/base/tracing.h"

#include "hphp/runtime/vm/jit/abi-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/code-gen-helpers.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/prof-data.h"
#include "hphp/runtime/vm/jit/service-requests.h"
#include "hphp/runtime/vm/jit/smashable-instr-x64.h"
#include "hphp/runtime/vm/jit/target-cache.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-block-counters.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-internal.h"
#include "hphp/runtime/vm/jit/vasm-lower.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-prof.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"
namespace HPHP { namespace jit {
///////////////////////////////////////////////////////////////////////////////

namespace x64 { struct ImmFolder; }

///////////////////////////////////////////////////////////////////////////////

static_assert(folly::kIsLittleEndian,
              "Code contains little-endian specific optimizations.");
template<class X64Asm>
struct Vgen {
  explicit Vgen(Venv& env)
    : env(env)
    , a(*env.cb)
    , current(env.current)
    , next(env.next)
    , jmps(env.jmps)
    , jccs(env.jccs)
    , catches(env.catches)
  {}

  static void emitVeneers(Venv& env) {}
  static void handleLiterals(Venv& env) {}
  static void retargetBinds(Venv& env);
  static void patch(Venv& env);
  static void pad(CodeBlock& cb);

  /////////////////////////////////////////////////////////////////////////////
  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }
  void emit(const prefetch& i) { a.prefetch(i.m.mr()); }
  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& /*i*/) { a.int3(); }
  void emit(const fallthru&);
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const ldundefq& /*i*/) {}
  void emit(const load& i);
  void emit(const store& i);
  void emit(const mcprep& i);
  // native function abi
  void emit(const call& i);
  void emit(const callm& i) { a.prefix(i.target.mr()).call(i.target); }
  void emit(const callr& i) { a.call(i.target); }
  void emit(const calls& i);
  void emit(const ret& /*i*/) { a.ret(); }

  void emit(const stubret& i);
  void emit(const callstub& i);
  void emit(const callfaststub& i);
  void emit(const tailcallstub& i);
  void emit(const tailcallstubr& i);
  void emit(const callphp& i) {
    emit(call{i.target, i.args});
    setCallFuncId(env, a.frontier());
  }
  void emit(const callphpr& i) {
    emit(callr{i.target, i.args});
    setCallFuncId(env, a.frontier());
  }
  void emit(const phpret& i);
  void emit(const contenter& i);

  void emit(const inittc& /*i*/) {}
  void emit(const leavetc&) { a.ret(); }
  void emit(const landingpad& /*i*/) {}
  void emit(const nothrow& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);
  void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
  void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
  void emit(const andbim& i) { a.prefix(i.m.mr()).andb(i.s, i.m); }
  void emit(andw i) { commuteSF(i); a.andw(i.s0, i.d); }
  void emit(andwi i) { binary(i); a.andw(i.s0, i.d); }
  void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
  void emit(andqi i);
  void emit(const addwm& i) { a.prefix(i.m.mr()).addw(i.s0, i.m); }
  void emit(addl i) { commuteSF(i); a.addl(i.s0, i.d); }
  void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
  void emit(const addlm& i) { a.prefix(i.m.mr()).addl(i.s0, i.m); }
  void emit(const addlim& i);
  void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
  void emit(const addqmr& i);
  void emit(const addqrm& i);
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
  void emit(const btrq& i) { binary(i); a.btrq(i.s0, i.d); }
  void emit(const cloadq& i);
  template<class cmov> void emit_cmov(const cmov& i);
  void emit(const cmovb& i) { emit_cmov(i); }
  void emit(const cmovw& i) { emit_cmov(i); }
  void emit(const cmovl& i) { emit_cmov(i); }
  void emit(const cmovq& i) { emit_cmov(i); }
  void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
  void emit(const cmpbm& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
  void emit(const cmpw& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwi& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwim& i) { a.prefix(i.s1.mr()).cmpw(i.s0, i.s1); }
  void emit(const cmpwm& i) { a.prefix(i.s1.mr()).cmpw(i.s0, i.s1); }
  void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a.prefix(i.s1.mr()).cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a.prefix(i.s1.mr()).cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a.prefix(i.s1.mr()).cmpq(i.s0, i.s1); }
  void emit(const cmpqm& i) { a.prefix(i.s1.mr()).cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& /*i*/) { a.cqo(); }
  void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a.decl(i.d); }
  void emit(const declm& i) { a.prefix(i.m.mr()).decl(i.m); }
  void emit(decq i) { unary(i); a.decq(i.d); }
  void emit(const decqm& i) { a.prefix(i.m.mr()).decq(i.m); }
  void emit(const decqmlock& i) { a.prefix(i.m.mr()).decqlock(i.m); }
  void emit(const decqmlocknosf&);
  void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
  void emit(const idiv& i) { a.idiv(i.s); }
  void emit(incl i) { unary(i); a.incl(i.d); }
  void emit(const inclm& i) { a.prefix(i.m.mr()).incl(i.m); }
  void emit(incq i) { unary(i); a.incq(i.d); }
  void emit(const incqm& i) { a.prefix(i.m.mr()).incq(i.m); }
  void emit(const incwm& i) { a.prefix(i.m.mr()).incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a.jmp(i.target); }
  void emit(const jmpm& i) { a.prefix(i.target.mr()).jmp(i.target); }
  void emit(const jmpi& i);
  void emit(const ldbindretaddr& i);
  void emit(const lea& i);
  void emit(const leap& i) { a.lea(i.s, i.d); }
  void emit(const leav& i);
  void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadups& i) { a.prefix(i.s.mr()).movups(i.s, i.d); }
  void emit(const loadtqb& i) { a.prefix(i.s.mr()).loadb(i.s, i.d); }
  void emit(const loadb& i) { a.prefix(i.s.mr()).loadb(i.s, i.d); }
  void emit(const loadw& i) { a.prefix(i.s.mr()).loadw(i.s, i.d); }
  void emit(const loadtql& i) { a.prefix(i.s.mr()).loadl(i.s, i.d); }
  void emit(const loadl& i) { a.prefix(i.s.mr()).loadl(i.s, i.d); }
  void emit(const loadqp& i) { a.loadq(i.s, i.d); }
  void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadsd& i) { a.prefix(i.s.mr()).movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a.prefix(i.s.mr()).loadzbl(i.s, i.d); }
  void emit(const loadzbq& i) { a.prefix(i.s.mr()).loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadsbq& i) { a.prefix(i.s.mr()).loadsbq(i.s, i.d); }
  void emit(const loadzwq& i) { a.prefix(i.s.mr()).loadzwl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a.prefix(i.s.mr()).loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a.movb(i.s, i.d); }
  void emit(const movl& i) { a.movl(i.s, i.d); }
  void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
  void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
  void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
  void emit(const movsbq& i) { a.movsbq(i.s, i.d); }
  void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a.neg(i.d); }
  void emit(const nop& /*i*/) { a.nop(); }
  void emit(not i) { unary(i); a.not(i.d); }
  void emit(notb i) { unary(i); a.notb(i.d); }
  void emit(orbi i) { binary(i); a.orb(i.s0, i.d); }
  void emit(const orbim& i) { a.prefix(i.m.mr()).orb(i.s0, i.m); }
  void emit(const orwim& i) { a.prefix(i.m.mr()).orw(i.s0, i.m); }
  void emit(const orlim& i) { a.prefix(i.m.mr()).orl(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
  void emit(orwi i) { binary(i); a.orw(i.s0, i.d); }
  void emit(orli i) { binary(i); a.orl(i.s0, i.d); }
  void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
  void emit(const orqim& i) { a.prefix(i.m.mr()).orq(i.s0, i.m); }
  void emit(const pop& i) { a.pop(i.d); }
  void emit(const popm& i) { a.prefix(i.d.mr()).pop(i.d); }
  void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
  void emit(const push& i) { a.push(i.s); }
  void emit(const pushm& i) { a.prefix(i.s.mr()).push(i.s); }
  void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
  void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
  void emit(const sarq& i) { unary(i); a.sarq(i.d); }
  void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
  void emit(const setcc& i) { a.setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a.shlq(i.d); }
  void emit(shrq i) { unary(i); a.shrq(i.d); }
  void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a.prefix(i.m.mr()).movups(i.s, i.m); }
  void emit(const storeb& i) { a.prefix(i.m.mr()).storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a.prefix(i.m.mr()).storel(i.s, i.m); }
  void emit(const storeli& i) { a.prefix(i.m.mr()).storel(i.s, i.m); }
  void emit(const storeqi& i);
  void emit(const storesd& i) { a.prefix(i.m.mr()).movsd(i.s, i.m); }
  void emit(const storew& i) { a.prefix(i.m.mr()).storew(i.s, i.m); }
  void emit(const storewi& i) { a.prefix(i.m.mr()).storew(i.s, i.m); }
  void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
  void emit(const subqim& i);
  void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
  void emit(const testb& i) { a.testb(i.s0, i.s1); }
  void emit(const testbi& i) { a.testb(i.s0, i.s1); }
  void emit(const testbm& i) { a.prefix(i.s1.mr()).testb(i.s0, i.s1); }
  void emit(const testbim& i) { a.prefix(i.s1.mr()).testb(i.s0, i.s1); }
  void emit(const testw& i) { a.testw(i.s0, i.s1); }
  void emit(const testwi& i);
  void emit(const testwm& i) { a.prefix(i.s1.mr()).testw(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a.testl(i.s0, i.s1); }
  void emit(const testli& i);
  void emit(const testlm& i) { a.prefix(i.s1.mr()).testl(i.s0, i.s1); }
  void emit(const testlim& i);
  void emit(const testq& i) { a.testq(i.s0, i.s1); }
  void emit(const testqi& i);
  void emit(const testqm& i) { a.prefix(i.s1.mr()).testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const trap& i);
  void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
  void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
  void emit(xorw i) { commuteSF(i); a.xorw(i.s0, i.d); }
  void emit(xorwi i) { binary(i); a.xorw(i.s0, i.d); }
  void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
  void emit(xorq i);
  void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
  void emit(const conjure& /*i*/) { always_assert(false); }
  void emit(const conjureuse& /*i*/) { always_assert(false); }
  void emit(const crc32q& i);
    emit(lea{rax[8], rax});
    emit(lea{rax[-8], rax});
  void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
  void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }
  void emit_simd_imm(int64_t, Vreg);

  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }

  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);
  CodeBlock& frozen() { return env.text.frozen().code; }

  Venv& env;
  X64Asm a;

  const Vlabel current;
  const Vlabel next;
  jit::vector<Venv::LabelPatch>& jmps;
  jit::vector<Venv::LabelPatch>& jccs;
  jit::vector<Venv::LabelPatch>& catches;
};
///////////////////////////////////////////////////////////////////////////////

/*
 * Prepare a binary op that is not commutative.
 *
 * s0 must be a different register than s1 so we don't clobber it.
 */
template<class X64Asm>
template<class Inst> void Vgen<X64Asm>::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
  binary(i);
}
/*
 * Prepare a binary op that is commutative.
 *
 * Swap operands if the dest is s0.
 */
template<class X64Asm>
template<class Inst> void Vgen<X64Asm>::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};
  } else {
    binary(i);
  }
}
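// Example: with d == s0, addq{s0, s1, d} becomes addq{s1, s0, d}. After the
// swap the destination register already holds the old s0, so no prep move is
// needed and the emitted "add s0, d" still computes s0 + s1.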
template<class X64Asm>
template<class Inst> void Vgen<X64Asm>::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};
  } else {
    binary(i);
  }
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 * will also be taken.
 */
bool ccImplies(ConditionCode a, ConditionCode b) {
  if (a == b) return true;

  switch (a) {
    case CC_O:  case CC_NO:
    case CC_AE: case CC_BE:
    case CC_S:  case CC_NS:
    case CC_P:  case CC_NP:
    case CC_GE: case CC_LE:
      return false;

    case CC_B: return b == CC_BE;
    case CC_E: return b == CC_BE || b == CC_LE;
    case CC_A: return b == CC_AE || b == CC_NE;
    case CC_L: return b == CC_LE;
    case CC_G: return b == CC_NE || b == CC_GE;
  }
  always_assert(false);
}
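// Example: CC_L tests SF != OF while CC_LE tests (SF != OF) || ZF, so any
// flags state that takes a j<l> also takes a j<le>; redirecting a jl to a jle
// that shares its destination is therefore safe.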
/*
 * When two jccs go to the same destination, the cc of the first is compatible
 * with the cc of the second, and they're within a one-byte offset of each
 * other, retarget the first to jump to the second. This will allow the
 * relocator to shrink the first one, and the extra jmp shouldn't matter since
 * we try to only do this to rarely taken jumps.
 */
template<typename Key, typename Hash>
jit::hash_set<TCA> retargetJumps(
  Venv& env,
  const jit::hash_map<Key, jit::vector<TCA>, Hash>& jccs
) {
  jit::hash_set<TCA> retargeted;
  for (auto& pair : jccs) {
    auto const& jmps = pair.second;
    if (jmps.size() < 2) continue;

    for (size_t i = 0; i < jmps.size(); ++i) {
      DecodedInstruction di(env.text.toDestAddress(jmps[i]), jmps[i]);
      // Don't bother if the jump is already a short jump.
      if (di.size() != 6) continue;

      for (size_t j = jmps.size() - 1; j > i; --j) {
        auto const delta = jmps[j] - jmps[i] + 2;
        // Backwards jumps are probably not guards, and don't retarget to a
        // dest that's more than a one-byte offset away.
        if (delta < 0 || !deltaFits(delta, sz::byte)) continue;

        DecodedInstruction dj(env.text.toDestAddress(jmps[j]), jmps[j]);
        if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;

        di.setPicAddress(jmps[j]);
        retargeted.insert(jmps[i]);

        // We might've converted a smashable jump to a regular in-unit jump, so
        // remove any smashable alignments.
        auto range = env.meta.alignments.equal_range(jmps[i]);
        while (range.first != range.second) {
          auto iter = range.first;
          ++range.first;

          auto& align = iter->second;
          if (align.first == Alignment::SmashJcc &&
              align.second == AlignContext::Live) {
            env.meta.alignments.erase(iter);
          }
        }
      }
    }
  }

  return retargeted;
}
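// Illustration: three 6-byte near jccs (0F 8x rel32) guarding the same cold
// target can be chained so each earlier jcc aims at the next one. A chained
// jcc then sits within rel8 range, so relocation can shrink it to the 2-byte
// short form (7x rel8); the rarely-taken path simply hops through the
// remaining jcc(s) to reach the original destination.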
struct SrcKeyBoolTupleHasher {
  size_t operator()(std::tuple<SrcKey, bool> v) const {
    return folly::hash::hash_combine(
      std::get<0>(v).toAtomicInt(),
      std::get<1>(v)
    );
  }
};
template<class X64Asm>
void Vgen<X64Asm>::retargetBinds(Venv& env) {
  if (RuntimeOption::EvalJitRetargetJumps < 1) return;

  // The target is unique per the SrcKey and the fallback flag.
  jit::hash_map<
    std::pair<SrcKey, bool>,
    jit::vector<TCA>,
    SrcKeyBoolTupleHasher
  > binds;

  for (auto const& b : env.meta.smashableBinds) {
    if (b.smashable.type() == IncomingBranch::Tag::JCC) {
      binds[std::make_pair(b.sk, b.fallback)]
        .emplace_back(b.smashable.toSmash());
    }
  }

  auto const retargeted = retargetJumps(env, std::move(binds));
  if (retargeted.empty()) return;

  // Finally, remove any retargeted jmps from inProgressTailJumps and
  // smashableBinds.
  GrowableVector<IncomingBranch> newTailJumps;
  for (auto& jmp : env.meta.inProgressTailJumps) {
    if (retargeted.count(jmp.toSmash()) == 0) {
      newTailJumps.push_back(jmp);
    }
  }
  env.meta.inProgressTailJumps.swap(newTailJumps);

  decltype(env.meta.smashableBinds) newBinds;
  for (auto& bind : env.meta.smashableBinds) {
    if (retargeted.count(bind.smashable.toSmash()) == 0) {
      newBinds.push_back(bind);
    } else {
      FTRACE(3, "retargetBinds: removed {} from smashableBinds\n",
             bind.smashable.toSmash());
    }
  }
  env.meta.smashableBinds.swap(newBinds);
}
template<class X64Asm>
void Vgen<X64Asm>::patch(Venv& env) {
  for (auto const& p : env.jmps) {
    assertx(env.addrs[p.target]);
    X64Asm::patchJmp(
      env.text.toDestAddress(p.instr), p.instr, env.addrs[p.target]);
  }

  auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
  jit::hash_map<TCA, jit::vector<TCA>> jccs;
  for (auto const& p : env.jccs) {
    assertx(env.addrs[p.target]);
    X64Asm::patchJcc(
      env.text.toDestAddress(p.instr), p.instr, env.addrs[p.target]);
    if (optLevel > 0) {
      jccs[env.addrs[p.target]].emplace_back(p.instr);
    }
  }

  if (!jccs.empty()) retargetJumps(env, jccs);

  for (auto const& p : env.leas) {
    assertx(env.vaddrs[p.target]);
    DecodedInstruction di(env.text.toDestAddress(p.instr), p.instr);
    assertx(di.hasPicOffset());
    di.setPicAddress(env.vaddrs[p.target]);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::pad(CodeBlock& cb) {
}

///////////////////////////////////////////////////////////////////////////////
template<class X64Asm>
void Vgen<X64Asm>::emit(const copy& i) {
  if (i.s == i.d) return;
  if (i.s.isGP()) {
    if (i.d.isGP()) {                 // GP => GP
      a.movq(i.s, i.d);
    } else {                          // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register
      a.movq_rx(i.s, i.d);
    }
  } else {
    if (i.d.isGP()) {                 // XMM => GP
      a.movq_xr(i.s, i.d);
    } else {                          // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls
      a.movdqa(i.s, i.d);
    }
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
  if (d0 == s1) {
    // could do this in a simplify pass
    if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
    if (s0 != d0) a.movq(s0, d0);
  } else {
    // could do this in a simplify pass
    if (s0 != d0) a.movq(s0, d0);
    if (s1 != d1) a.movq(s1, d1);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit_simd_imm(int64_t val, Vreg d) {
  if (val == 0) {
    a.pxor(d, d); // does not modify flags
  } else {
    auto addr = alloc_literal(env, val);
    a.movsd(rip[(intptr_t)addr], d);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
  auto const val = i.s.ub();
  if (i.d.isGP()) {
    Vreg8 d8 = i.d;
    a.movb(static_cast<int8_t>(val), d8);
  } else {
    emit_simd_imm(val, i.d);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
  auto const val = i.s.l();
  if (i.d.isGP()) {
    Vreg32 d32 = i.d;
    a.movl(val, d32);
  } else {
    emit_simd_imm(uint32_t(val), i.d);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimmq& i) {
  auto const val = i.s.q();
  if (i.d.isGP()) {
    if (val == 0) {
      Reg64 d = i.d;
      Reg32 d32 = r32(d);
      a.movl(0, d32); // because emitImmReg tries the xor optimization
    } else {
      a.emitImmReg(i.s, i.d);
    }
  } else {
    emit_simd_imm(val, i.d);
  }
}
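// Note: materializing zero with an explicit movl (rather than letting
// emitImmReg pick "xor d32, d32") keeps the status flags intact; a ldimm may
// be scheduled between a flag-setting instruction and its consumer, and xor
// would clobber those flags.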
template<class X64Asm>
void Vgen<X64Asm>::emit(const load& i) {
  auto mref = i.s.mr();
  if (i.d.isGP()) {
    a.prefix(mref).loadq(mref, i.d);
  } else {
    assertx(i.d.isSIMD());
    a.prefix(mref).movsd(mref, i.d);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const store& i) {
  auto const mref = i.d.mr();
  if (i.s.isGP()) {
    a.prefix(mref).storeq(i.s, mref);
  } else {
    assertx(i.s.isSIMD());
    a.prefix(mref).movsd(i.s, mref);
  }
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const mcprep& i) {
  /*
   * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
   * address of the movq) so that we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * MethodCache::handleStaticCall can tell it's not been smashed yet.
   */
  auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
  auto const imm = reinterpret_cast<uint64_t>(mov_addr);
  smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);

  env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
}
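// Worked example: if the smashable movq is emitted at 0x3d001234, the cache is
// primed with (0x3d001234 << 1) | 1 == 0x7a002469. The value is odd, so it can
// never collide with a real (aligned) Class*, and the handler can recover the
// movq's address by shifting the cached word right by one.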
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const call& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.call(i.target);
  } else {
    // can't do a near call; store address in data section.
    // call by loading the address using rip-relative addressing. This
    // assumes the data section is near the current code section. Since
    // this sequence is directly in-line, rip-relative like this is
    // more compact than loading a 64-bit immediate.
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.call(rip[(intptr_t)addr]);
  }
  if (i.watch) {
    *i.watch = a.frontier();
    env.meta.watchpoints.push_back(i.watch);
  }
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const calls& i) {
  emitSmashableCall(a.code(), env.meta, i.target);
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const stubret& i) {
  if (i.saveframe) {
    a.pop(x64::rvmfp());
  } else {
    a.lea(reg::rsp[8], reg::rsp);
  }
  a.ret();
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const callfaststub& i) {
  emit(call{i.target, i.args});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const tailcallstub& i) {
  a.lea(reg::rsp[8], reg::rsp);
  emit(jmpi{i.target, i.args});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const tailcallstubr& i) {
  a.lea(reg::rsp[8], reg::rsp);
  emit(jmpr{i.target, i.args});
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const phpret& i) {
  a.push(i.fp[AROFF(m_savedRip)]);
  a.loadq(i.fp[AROFF(m_sfp)], x64::rvmfp());
  a.ret();
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const contenter& i) {
  Label Stub, End;
  Reg64 fp = i.fp, target = i.target;
  a.jmp8(End);

  asm_label(a, Stub);
  a.pop(fp[AROFF(m_savedRip)]);
  a.jmp(target);

  asm_label(a, End);
  a.call(Stub);
  // m_savedRip will point here.
  emit(unwind{{i.targets[0], i.targets[1]}});
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const nothrow& /*i*/) {
  env.meta.catches.emplace_back(a.frontier(), nullptr);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {}\n", a.frontier(), i.fix.show());
  env.meta.fixups.emplace_back(a.frontier(), i.fix);
  env.record_inline_stack(a.frontier());
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const unwind& i) {
  catches.push_back({a.frontier(), i.targets[1]});
  env.record_inline_stack(a.frontier());
  emit(jmp{i.targets[0]});
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const fallthru&) {
}
///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(andqi i) {
  if (magFits(i.s0.q(), sz::dword)) {
    emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});
    return;
  }

  binary(i);
  a.andq(i.s0, i.d);
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const addlim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).addl(i.s0, mref);
}

template<typename X64Asm>
void Vgen<X64Asm>::emit(const addqmr& i) {
  auto const mref = i.m.mr();
  a.prefix(mref).addq(mref, i.d);
}

template<typename X64Asm>
void Vgen<X64Asm>::emit(const addqrm& i) {
  auto const mref = i.m.mr();
  a.prefix(mref).addq(i.s1, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const addqim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).addq(i.s0, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const subqim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).subq(i.s0, mref);
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const cloadq& i) {
  auto m = i.t;
  always_assert(!m.index.isValid()); // not supported, but could be later.
  if (i.d == m.base) {
    // We can't move f over d or we'll clobber the Vptr we need to load from.
    // Since cload does the load unconditionally anyway, we can just load and
    // cmov.
    a.prefix(m.mr()).loadq(i.t, i.d);
    a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
    return;
  }
  a.prefix(m.mr()).cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
}
// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
template<class X64Asm>
template<class cmov>
void Vgen<X64Asm>::emit_cmov(const cmov& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we dont clobber i.t
    return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
  } else {
    prep(i.f, i.d);
  }
  a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const cvtsi2sd& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const cvtsi2sdm& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a.frontier(), taken});
    a.jcc(i.cc, a.frontier());
  }
  emit(jmp{i.targets[0]});
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const jcci& i) {
  a.jcc(i.cc, i.taken);
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a.frontier(), i.target});
  a.jmp(a.frontier());
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const jmpi& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.jmp(i.target);
  } else {
    // can't do a near jmp - use rip-relative addressing
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.jmp(rip[(intptr_t)addr]);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const ldbindretaddr& i) {
  auto const addr = a.frontier();
  emit(leap{reg::rip[(intptr_t)addr], i.d});
  env.ldbindretaddrs.push_back({addr, i.target, i.spOff});
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const lea& i) {
  assertx(i.s.seg == Segment::DS);
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});
  } else {
    a.lea(i.s, i.d);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const leav& i) {
  auto const addr = a.frontier();
  emit(leap{reg::rip[(intptr_t)addr], i.d});
  env.leas.push_back({addr, i.s});
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const storebi& i) {
  auto mref = i.m.mr();
  a.prefix(mref).storeb(i.s, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const storeqi& i) {
  auto mref = i.m.mr();
  a.prefix(mref).storeq(i.s, mref);
}
template<class VgenImpl, typename Inst>
bool testimHelper(VgenImpl& env, const Inst& i, uint64_t mask) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  int off = 0;
  while (mask > 0xff && !(mask & 0xff)) {
    off++;
    mask >>= 8;
  }

  if (mask > 0xff) return false;

  env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
  return true;
}
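// Example: a mask of 0x00ff0000 becomes testbim{0xff, ptr + 2}; the loop drops
// one zero byte from the mask per iteration while bumping the displacement,
// which is equivalent on a little-endian target (hence the static_assert at
// the top of this file).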
template<class X64Asm>
void Vgen<X64Asm>::emit(const testwi& i) {
  if (i.s0.w() == -1) {
    return emit(testw{i.s1, i.s1, i.sf});
  }
  a.testw(i.s0, i.s1);
}
template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testwim& i) {
  if (testimHelper(*this, i, i.s0.w())) return;
  a.prefix(i.s1.mr()).testw(i.s0, i.s1);
}

template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testlim& i) {
  if (testimHelper(*this, i, i.s0.l())) return;
  a.prefix(i.s1.mr()).testl(i.s0, i.s1);
}
template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testli& i) {
  if (i.s0.l() == -1) {
    return emit(testl{i.s1, i.s1, i.sf});
  }
  a.testl(i.s0, i.s1);
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const testqi& i) {
  auto const imm = i.s0.q();
  if (magFits(imm, sz::byte)) {
    a.testb(int8_t(imm), rbyte(i.s1));
  } else if (magFits(imm, sz::dword)) {
    emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
  } else if (imm == -1) {
    emit(testq{i.s1, i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}
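// x86 test has no 64-bit-immediate encoding (an imm32 is sign-extended), which
// is why a full-width mask of -1 is handled by testing the register against
// itself; the byte/dword narrowings above shrink the encoding without changing
// the zero-flag result.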
template<class X64Asm>
void Vgen<X64Asm>::emit(const testqim& i) {
  if (testimHelper(*this, i, i.s0.q())) return;
  if (magFits(i.s0.q(), sz::dword)) {
    // For an unsigned 32 bit immediate, we can get the same results
    // by emitting a testlim.
    emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});
  } else {
    a.prefix(i.s1.mr()).testq(i.s0, i.s1);
  }
}
template<class X64Asm>
void Vgen<X64Asm>::emit(const trap& i) {
  env.meta.trapReasons.emplace_back(a.frontier(), i.reason);
  a.ud2();
}
template<class X64Asm>
void Vgen<X64Asm>::emit(xorq i) {
  if (i.s0 == i.s1) {
    // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
    return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});
  }
  commuteSF(i);
  a.xorq(i.s0, i.d);
}
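// On x86-64 any write to a 32-bit register zero-extends into the full 64-bit
// register, so "xorl r, r" clears the whole register while saving the REX.W
// prefix byte that "xorq r, r" would require.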
template<class X64Asm>
void Vgen<X64Asm>::emit(const crc32q& i) {
  a.crc32q(i.s0, i.d);
}
template<typename X64Asm>
void Vgen<X64Asm>::emit(const decqmlocknosf& i) {
  a.pushf();
  a.prefix(i.m.mr()).decqlock(i.m);
  a.popf();
}
///////////////////////////////////////////////////////////////////////////////

template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
  vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}

template <typename Inst>
void lower(Vunit& /*unit*/, Inst& /*inst*/, Vlabel /*b*/, size_t /*i*/) {}
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pop{inst.d0};
    v << pop{inst.d1};
  });
}

void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << popm{inst.d0};
    v << popm{inst.d1};
  });
}

void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << push{inst.s0};
    v << push{inst.s1};
  });
}

void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pushm{inst.s0};
    v << pushm{inst.s1};
  });
}
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) {
  if (inst.saveframe) {
    unit.blocks[b].code[i] = push{x64::rvmfp()};
  } else {
    unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp};
  }
}

void lower(Vunit& unit, unstublogue& /*inst*/, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[8], reg::rsp};
}

void lower(Vunit& unit, stubunwind& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << lea{reg::rsp[8], reg::rsp};
  });
}

void lower(Vunit& unit, stubtophp& /*inst*/, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = load{reg::rsp[8], inst.d};
}

void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]};
}

void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << callr{inst.target, inst.args};
    v << jmpi{inst.exittc};
  });
}
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << sarq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << shlq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, shr& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << shrq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rdx, inst.d};
  });
}

void lower(Vunit& unit, divint& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rax, inst.d};
  });
}
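// These lowerings expose x86-64's fixed-register constraints to the register
// allocator: a variable shift count must sit in cl (hence the copies into
// rcx), and idiv consumes rdx:rax and leaves quotient/remainder in rax/rdx,
// so srem and divint differ only in which half gets copied out.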
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtqw& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Lower a few abstractions to facilitate straightforward x64 codegen.
 */
void lowerForX64(Vunit& unit) {
  vasm_lower(unit, [&](const VLS& /*env*/, Vinstr& inst, Vlabel b, size_t i) {
    switch (inst.op) {
#define O(name, ...)                      \
      case Vinstr::name:                  \
        lower(unit, inst.name##_, b, i);  \
        break;

      VASM_OPCODES
#undef O
    }
  });
}

///////////////////////////////////////////////////////////////////////////////
void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) {
  Timer timer(Timer::vasm_optimize, unit.log_entry);

  tracing::Block _{
    "vasm-optimize",
    [&] { return traceProps(unit).add("reg_alloc", regalloc); }
  };

  auto const doPass = [&] (const char* name, auto fun) {
    rqtrace::EventGuard trace{name};
    fun(unit);
  };

  doPass("VOPT_DCE",    removeDeadCode);
  doPass("VOPT_PHI",    optimizePhis);
  doPass("VOPT_BRANCH", fuseBranches);
  doPass("VOPT_JMP",    [] (Vunit& u) { optimizeJmps(u, false); });

  assertx(checkWidths(unit));

  if (unit.context && unit.context->kind == TransKind::Optimize &&
      RuntimeOption::EvalProfBranchSampleFreq > 0) {
    // Even when branch profiling is on, we still only want to profile
    // non-profiling translations of PHP functions. We also require that we
    // can spill, so that we can generate arbitrary profiling code, and also to
    // ensure we don't profile unique stubs and such.
    doPass("VOPT_PROF_BRANCH", profile_branches);
  }

  doPass("VOPT_X64",      lowerForX64);
  doPass("VOPT_SIMPLIFY", simplify);
  doPass("VOPT_X64",      lowerForX64);

  if (!unit.constToReg.empty()) {
    doPass("VOPT_FOLD_IMM", foldImms<x64::ImmFolder>);
  }

  doPass("VOPT_COPY",   [&] (Vunit& u) { optimizeCopies(u, abi); });
  doPass("VOPT_DCE",    removeDeadCode);
  doPass("VOPT_BRANCH", fuseBranches);

  if (unit.needsRegAlloc()) {
    doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u, false); });
    doPass("VOPT_DCE", removeDeadCode);

    // vasm-block-counts and register allocation require edges to
    // be split.
    splitCriticalEdges(unit);

    doPass("VOPT_BLOCK_WEIGHTS", VasmBlockCounters::profileGuidedUpdate);

    if (RuntimeOption::EvalUseGraphColor &&
        unit.context &&
        (unit.context->kind == TransKind::Optimize ||
         unit.context->kind == TransKind::OptPrologue)) {
      rqtrace::EventGuard trace{"VOPT_GRAPH_COLOR"};
      allocateRegistersWithGraphColor(unit, abi);
    } else {
      rqtrace::EventGuard trace{"VOPT_XLS"};
      allocateRegistersWithXLS(unit, abi);
    }

    doPass("VOPT_SF_PEEPHOLES", [&] (Vunit& u) { sfPeepholes(u, abi); });
    doPass("VOPT_POST_RA_SIMPLIFY", postRASimplify);
  }

  // We can add side-exiting instructions now
  doPass("VOPT_EXIT", optimizeExits);
  doPass("VOPT_JMP",  [] (Vunit& u) { optimizeJmps(u, true); });
}
void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups,
             AsmInfo* asmInfo) {
  tracing::Block _{"emit-X64", [&] { return traceProps(unit); }};

  if (RuntimeOption::EvalUseXedAssembler) {
    return vasm_emit<Vgen<XedAssembler>>(unit, text, fixups, asmInfo);
  }
  vasm_emit<Vgen<X64Assembler>>(unit, text, fixups, asmInfo);
}

///////////////////////////////////////////////////////////////////////////////

}}