// [hiphop-php.git] hphp/runtime/vm/jit/vasm-x64.cpp
// As of commit "Don't do branch profiling for profiling translations".
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/vm/jit/vasm-emit.h"

#include "hphp/runtime/base/runtime-option.h"

#include "hphp/runtime/vm/jit/abi-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/code-gen-helpers.h"
#include "hphp/runtime/vm/jit/func-guard-x64.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/prof-data.h"
#include "hphp/runtime/vm/jit/service-requests.h"
#include "hphp/runtime/vm/jit/smashable-instr-x64.h"
#include "hphp/runtime/vm/jit/target-cache.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-internal.h"
#include "hphp/runtime/vm/jit/vasm-lower.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-prof.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"

#include <algorithm>
#include <tuple>
TRACE_SET_MOD(vasm);

namespace HPHP { namespace jit {
///////////////////////////////////////////////////////////////////////////////

using namespace reg;
using namespace x64;

namespace x64 { struct ImmFolder; }

namespace {
///////////////////////////////////////////////////////////////////////////////
struct Vgen {
  explicit Vgen(Venv& env)
    : env(env)
    , a(*env.cb)
    , current(env.current)
    , next(env.next)
    , jmps(env.jmps)
    , jccs(env.jccs)
    , catches(env.catches)
  {}

  static void patch(Venv& env);
  static void pad(CodeBlock& cb);

  /////////////////////////////////////////////////////////////////////////////

  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }
  // intrinsics
  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& i) { a.int3(); }
  void emit(const fallthru& i) {}
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const load& i);
  void emit(const store& i);
  void emit(const mcprep& i);

  // native function abi
  void emit(const call& i);
  void emit(const callm& i) { a.call(i.target); }
  void emit(const callr& i) { a.call(i.target); }
  void emit(const calls& i);
  void emit(const ret& i) { a.ret(); }

  // stub function abi
  void emit(const stubret& i);
  void emit(const callstub& i);
  void emit(const callfaststub& i);
  void emit(const tailcallstub& i);

  // php function abi
  void emit(const phpret& i);
  void emit(const tailcallphp& i);
  void emit(const callarray& i);
  void emit(const contenter& i);

  // vm entry abi
  void emit(const inittc& i) {}
  void emit(const calltc&);
  void emit(const leavetc&) { a.ret(); }

  // exceptions
  void emit(const landingpad& i) {}
  void emit(const nothrow& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);
  // instructions
  void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
  void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
  void emit(const andbim& i) { a.andb(i.s, i.m); }
  void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
  void emit(andqi i);
  void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
  void emit(const addlm& i) { a.addl(i.s0, i.m); }
  void emit(const addlim& i);
  void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
  void emit(const cloadq& i);
  template<class cmov> void emit_cmov(const cmov& i);
  void emit(const cmovb& i) { emit_cmov(i); }
  void emit(const cmovw& i) { emit_cmov(i); }
  void emit(const cmovl& i) { emit_cmov(i); }
  void emit(const cmovq& i) { emit_cmov(i); }
  void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbm& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpwim& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwm& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqm& i) { a.cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& i) { a.cqo(); }
  void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a.decl(i.d); }
  void emit(const declm& i) { a.decl(i.m); }
  void emit(decq i) { unary(i); a.decq(i.d); }
  void emit(const decqm& i) { a.decq(i.m); }
  void emit(const decqmlock& i) { a.lock(); a.decq(i.m); }
  void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
  void emit(const idiv& i) { a.idiv(i.s); }
  void emit(incl i) { unary(i); a.incl(i.d); }
  void emit(const inclm& i) { a.incl(i.m); }
  void emit(incq i) { unary(i); a.incq(i.d); }
  void emit(const incqm& i) { a.incq(i.m); }
  void emit(const incwm& i) { a.incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a.jmp(i.target); }
  void emit(const jmpm& i) { a.jmp(i.target); }
  void emit(const jmpi& i);
  void emit(const lea& i);
  void emit(const leap& i) { a.lea(i.s, i.d); }
  void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadups& i) { a.movups(i.s, i.d); }
  void emit(const loadtqb& i) { a.loadb(i.s, i.d); }
  void emit(const loadb& i) { a.loadb(i.s, i.d); }
  void emit(const loadw& i) { a.loadw(i.s, i.d); }
  void emit(const loadl& i) { a.loadl(i.s, i.d); }
  void emit(const loadqp& i) { a.loadq(i.s, i.d); }
  void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadsd& i) { a.movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a.loadzbl(i.s, i.d); }
  void emit(const loadzbq& i) { a.loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a.loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a.movb(i.s, i.d); }
  void emit(const movl& i) { a.movl(i.s, i.d); }
  void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
  void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
  void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
  void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a.neg(i.d); }
  void emit(const nop& i) { a.nop(); }
  void emit(not i) { unary(i); a.not(i.d); }
  void emit(notb i) { unary(i); a.notb(i.d); }
  void emit(const orbim& i) { a.orb(i.s0, i.m); }
  void emit(const orwim& i) { a.orw(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
  void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
  void emit(const orqim& i) { a.orq(i.s0, i.m); }
  void emit(const pop& i) { a.pop(i.d); }
  void emit(const popm& i) { a.pop(i.d); }
  void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
  void emit(const push& i) { a.push(i.s); }
  void emit(const pushm& i) { a.push(i.s); }
  void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
  void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
  void emit(const sarq& i) { unary(i); a.sarq(i.d); }
  void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
  void emit(const setcc& i) { a.setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a.shlq(i.d); }
  void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a.movups(i.s, i.m); }
  void emit(const storeb& i) { a.storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a.storel(i.s, i.m); }
  void emit(const storeli& i) { a.storel(i.s, i.m); }
  void emit(const storeqi& i);
  void emit(const storesd& i) { a.movsd(i.s, i.m); }
  void emit(const storew& i) { a.storew(i.s, i.m); }
  void emit(const storewi& i) { a.storew(i.s, i.m); }
  void emit(subbi i) { binary(i); a.subb(i.s0, i.d); }
  void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
  void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
  void emit(const testb& i) { a.testb(i.s0, i.s1); }
  void emit(const testbi& i) { a.testb(i.s0, i.s1); }
  void emit(const testbim& i) { a.testb(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a.testl(i.s0, i.s1); }
  void emit(const testli& i);
  void emit(const testlim& i);
  void emit(const testq& i) { a.testq(i.s0, i.s1); }
  void emit(const testqi& i);
  void emit(const testqm& i) { a.testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
  void emit(const ud2& i) { a.ud2(); }
  void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
  void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
  void emit(xorq i);
  void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
  void emit(const conjure& i) { always_assert(false); }
  void emit(const conjureuse& i) { always_assert(false); }
  void emit_nop() {
    emit(lea{rax[8], rax});
    emit(lea{rax[-8], rax});
  }

private:
  // helpers
  void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
  void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }

  void emit_simd_imm(int64_t, Vreg);

  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);

  CodeBlock& frozen() { return env.text.frozen().code; }
private:
  Venv& env;
  X64Assembler a;

  const Vlabel current;
  const Vlabel next;
  jit::vector<Venv::LabelPatch>& jmps;
  jit::vector<Venv::LabelPatch>& jccs;
  jit::vector<Venv::LabelPatch>& catches;
};
///////////////////////////////////////////////////////////////////////////////

/*
 * Prepare a binary op that is not commutative.
 *
 * s0 must be a different register than s1 so we don't clobber it.
 */
template<class Inst> void Vgen::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
  binary(i);
}

/*
 * Prepare a binary op that is commutative.
 *
 * Swap operands if the dest is s0.
 */
template<class Inst> void Vgen::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};
  } else {
    binary(i);
  }
}

template<class Inst> void Vgen::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};
  } else {
    binary(i);
  }
}

/*
 * Helper for emitting instructions whose Vptr operand specifies a segment.
 */
X64Assembler& prefix(X64Assembler& a, const Vptr& ptr) {
  if (ptr.seg == Vptr::Segment::FS) {
    a.fs();
  } else if (ptr.seg == Vptr::Segment::GS) {
    a.gs();
  }
  return a;
}
///////////////////////////////////////////////////////////////////////////////

/*
 * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 * will also be taken.
 */
bool ccImplies(ConditionCode a, ConditionCode b) {
  if (a == b) return true;

  switch (a) {
    case CC_None:
    case CC_O:  case CC_NO:
    case CC_AE: case CC_BE:
    case CC_NE:
    case CC_S:  case CC_NS:
    case CC_P:  case CC_NP:
    case CC_GE: case CC_LE:
      return false;

    case CC_B: return b == CC_BE;
    case CC_E: return b == CC_BE || b == CC_LE;
    case CC_A: return b == CC_AE || b == CC_NE;
    case CC_L: return b == CC_LE;
    case CC_G: return b == CC_NE || b == CC_GE;
  }
  always_assert(false);
}
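// Illustrative examples (not in the original source): ccImplies(CC_E, CC_LE)
// is true, since ZF == 1 satisfies both "equal" and "less-or-equal", while
// ccImplies(CC_LE, CC_L) is false because "less-or-equal" can hold via
// ZF == 1 with SF == OF, in which case a j<l> would not be taken.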
static CodeAddress toReal(Venv& env, CodeAddress a) {
  if (env.text.main().code.contains(a)) {
    return env.text.main().code.toDestAddress(a);
  }
  if (env.text.cold().code.contains(a)) {
    return env.text.cold().code.toDestAddress(a);
  }
  if (env.text.frozen().code.contains(a)) {
    return env.text.frozen().code.toDestAddress(a);
  }
  return a;
}
/*
 * When two jccs go to the same destination, the cc of the first is compatible
 * with the cc of the second, and they're within a one-byte offset of each
 * other, retarget the first to jump to the second. This will allow the
 * relocator to shrink the first one, and the extra jmp shouldn't matter since
 * we try to only do this to rarely taken jumps.
 */
void retargetJumps(Venv& env,
                   const jit::hash_map<TCA, jit::vector<TCA>>& jccs) {
  jit::hash_set<TCA> retargeted;
  for (auto& pair : jccs) {
    auto const& jmps = pair.second;
    if (jmps.size() < 2) continue;

    for (size_t i = 0; i < jmps.size(); ++i) {
      DecodedInstruction di(toReal(env, jmps[i]), jmps[i]);
      // Don't bother if the jump is already a short jump.
      if (di.size() != 6) continue;

      for (size_t j = jmps.size() - 1; j > i; --j) {
        auto const delta = jmps[j] - jmps[i] + 2;
        // Backwards jumps are probably not guards, and don't retarget to a
        // dest that's more than a one-byte offset away.
        if (delta < 0 || !deltaFits(delta, sz::byte)) continue;

        DecodedInstruction dj(toReal(env, jmps[j]), jmps[j]);
        if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;

        di.setPicAddress(jmps[j]);
        retargeted.insert(jmps[i]);

        // We might've converted a smashable jump to a regular in-unit jump, so
        // remove any smashable alignments.
        auto range = env.meta.alignments.equal_range(jmps[i]);
        while (range.first != range.second) {
          auto iter = range.first;
          ++range.first;

          auto& align = iter->second;
          if (align.first == Alignment::SmashJcc &&
              align.second == AlignContext::Live) {
            env.meta.alignments.erase(iter);
          }
        }

        break;
      }
    }
  }

  // Finally, remove any retargeted jmps from inProgressTailJumps.
  if (!retargeted.empty()) {
    GrowableVector<IncomingBranch> newTailJumps;
    for (auto& jmp : env.meta.inProgressTailJumps) {
      if (retargeted.count(jmp.toSmash()) == 0) {
        newTailJumps.push_back(jmp);
      }
    }
    env.meta.inProgressTailJumps.swap(newTailJumps);
  }
}
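// Illustrative sketch (not in the original source): given two 6-byte jccs to
// the same label L within a one-byte displacement of each other,
//   jcc<e> L   ...   jcc<le> L
// the first is retargeted to jump to the second (CC_E implies CC_LE); the
// relocator can then shrink the first to a 2-byte jcc, and a rarely taken
// first branch just pays one extra hop through the second.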
void Vgen::patch(Venv& env) {
  for (auto& p : env.jmps) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJmp(toReal(env, p.instr), p.instr, env.addrs[p.target]);
  }

  auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
  jit::hash_map<TCA, jit::vector<TCA>> jccs;
  for (auto& p : env.jccs) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJcc(toReal(env, p.instr), p.instr, env.addrs[p.target]);
    if (optLevel >= 2 ||
        (optLevel == 1 && p.target >= env.unit.blocks.size())) {
      jccs[env.addrs[p.target]].emplace_back(p.instr);
    }
  }

  if (!jccs.empty()) retargetJumps(env, jccs);
}
void Vgen::pad(CodeBlock& cb) {
  X64Assembler a { cb };
  while (a.available() >= 2) a.ud2();
  if (a.available() > 0) a.int3();
  assertx(a.available() == 0);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const copy& i) {
  if (i.s == i.d) return;
  if (i.s.isGP()) {
    if (i.d.isGP()) {                 // GP => GP
      a.movq(i.s, i.d);
    } else {                          // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register
      a.movq_rx(i.s, i.d);
    }
  } else {
    if (i.d.isGP()) {                 // XMM => GP
      a.movq_xr(i.s, i.d);
    } else {                          // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls
      a.movdqa(i.s, i.d);
    }
  }
}
void Vgen::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
  assertx(d0 != d1);
  if (d0 == s1) {
    if (d1 == s0) {
      a.xchgq(d0, d1);
    } else {
      // could do this in a simplify pass
      if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
      if (s0 != d0) a.movq(s0, d0);
    }
  } else {
    // could do this in a simplify pass
    if (s0 != d0) a.movq(s0, d0);
    if (s1 != d1) a.movq(s1, d1);
  }
}
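// Illustrative example (not in the original source): copy2{rax, rcx, rcx, rax}
// has d0 == s1 and d1 == s0, so it is emitted as a single xchgq; a pair of
// movqs would clobber one of the sources.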
void Vgen::emit_simd_imm(int64_t val, Vreg d) {
  if (val == 0) {
    a.pxor(d, d); // does not modify flags
  } else {
    auto addr = alloc_literal(env, val);
    a.movsd(rip[(intptr_t)addr], d);
  }
}
void Vgen::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
  auto val = i.s.ub();
  if (i.d.isGP()) {
    Vreg8 d8 = i.d;
    a.movb(static_cast<int8_t>(val), d8);
  } else {
    emit_simd_imm(val, i.d);
  }
}

void Vgen::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
  auto val = i.s.l();
  if (i.d.isGP()) {
    Vreg32 d32 = i.d;
    a.movl(val, d32);
  } else {
    emit_simd_imm(uint32_t(val), i.d);
  }
}
void Vgen::emit(const ldimmq& i) {
  auto val = i.s.q();
  if (i.d.isGP()) {
    if (val == 0) {
      Vreg32 d32 = i.d;
      a.movl(0, d32); // because emitImmReg tries the xor optimization
    } else {
      a.emitImmReg(i.s, i.d);
    }
  } else {
    emit_simd_imm(val, i.d);
  }
}
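// Added note (not in the original source): the ldimm* variants appear careful
// not to clobber the status flags -- pxor and mov are used rather than the
// shorter xor-with-self zeroing idiom -- presumably so a constant can be
// materialized between a flag-setting instruction and its consumer.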
void Vgen::emit(const load& i) {
  prefix(a, i.s);
  auto mref = i.s.mr();
  if (i.d.isGP()) {
    a.loadq(mref, i.d);
  } else {
    assertx(i.d.isSIMD());
    a.movsd(mref, i.d);
  }
}

void Vgen::emit(const store& i) {
  if (i.s.isGP()) {
    a.storeq(i.s, i.d);
  } else {
    assertx(i.s.isSIMD());
    a.movsd(i.s, i.d);
  }
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const mcprep& i) {
  /*
   * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
   * address of the movq) so that we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * handlePrimeCacheInit can tell it's not been smashed yet
   */
  auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
  auto const imm = reinterpret_cast<uint64_t>(mov_addr);
  smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);

  env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
}
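// Illustrative example (not in the original source): if the smashable movq is
// emitted at 0x1000, the cache is primed with (0x1000 << 1) | 1 == 0x2001.
// The set low bit marks the cache as not yet smashed, and the handler can
// recover the movq's address by shifting the value right by one.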
///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const call& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.call(i.target);
  } else {
    // can't do a near call; store address in data section.
    // call by loading the address using rip-relative addressing. This
    // assumes the data section is near the current code section. Since
    // this sequence is directly in-line, rip-relative like this is
    // more compact than loading a 64-bit immediate.
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.call(rip[(intptr_t)addr]);
  }
  if (i.watch) {
    *i.watch = a.frontier();
    env.meta.watchpoints.push_back(i.watch);
  }
}

void Vgen::emit(const calls& i) {
  emitSmashableCall(a.code(), env.meta, i.target);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const stubret& i) {
  if (i.saveframe) {
    a.pop(rvmfp());
  } else {
    a.addq(8, reg::rsp);
  }
  a.ret();
}

void Vgen::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const callfaststub& i) {
  emit(call{i.target, i.args});
  emit(syncpoint{i.fix});
}

void Vgen::emit(const tailcallstub& i) {
  a.addq(8, reg::rsp);
  emit(jmpi{i.target, i.args});
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const phpret& i) {
  a.push(i.fp[AROFF(m_savedRip)]);
  if (!i.noframe) {
    a.loadq(i.fp[AROFF(m_sfp)], i.d);
  }
  a.ret();
}

void Vgen::emit(const tailcallphp& i) {
  emit(pushm{i.fp[AROFF(m_savedRip)]});
  emit(jmpr{i.target, i.args});
}

void Vgen::emit(const callarray& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const contenter& i) {
  Label Stub, End;
  Reg64 fp = i.fp, target = i.target;
  a.jmp8(End);

  asm_label(a, Stub);
  a.pop(fp[AROFF(m_savedRip)]);
  a.jmp(target);

  asm_label(a, End);
  a.call(Stub);
  // m_savedRip will point here.
  emit(unwind{{i.targets[0], i.targets[1]}});
}
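// Added note (not in the original source): the call pushes the address of the
// instruction that follows it, the stub pops that address straight into the
// ActRec's m_savedRip slot, and then jumps into the callee -- so a later
// phpret through this frame resumes at the point marked above.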
///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const calltc& i) {
  a.push(i.exittc);
  a.push(i.fp[AROFF(m_savedRip)]);

  Label stub;
  a.call(stub);

  asm_label(a, stub);
  assertx(!i.args.contains(reg::rax));
  a.pop(reg::rax); // unused
  a.jmp(i.target);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const nothrow& i) {
  env.meta.catches.emplace_back(a.frontier(), nullptr);
}

void Vgen::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {} {}\n", a.frontier(),
         i.fix.pcOffset, i.fix.spOffset);
  env.meta.fixups.emplace_back(a.frontier(), i.fix);
}

void Vgen::emit(const unwind& i) {
  catches.push_back({a.frontier(), i.targets[1]});
  emit(jmp{i.targets[0]});
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(andqi i) {
  if (magFits(i.s0.q(), sz::dword)) {
    emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});
    return;
  }

  binary(i);
  a.andq(i.s0, i.d);
}
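// Illustrative example (not in the original source): andqi{0xff, s, d} has an
// immediate whose upper 32 bits are zero, so it is narrowed to an andli; the
// 32-bit result zero-extends into d, matching the 64-bit result.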
void Vgen::emit(const addlim& i) {
  prefix(a, i.m).addl(i.s0, i.m.mr());
}

void Vgen::emit(const addqim& i) {
  prefix(a, i.m).addq(i.s0, i.m.mr());
}

void Vgen::emit(const cloadq& i) {
  auto m = i.t;
  always_assert(!m.index.isValid()); // not supported, but could be later.
  if (i.f != i.d) {
    if (i.d == m.base) {
      // We can't move f over d or we'll clobber the Vptr we need to load from.
      // Since cload does the load unconditionally anyway, we can just load and
      // cmov.
      a.loadq(i.t, i.d);
      a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
      return;
    }
    a.movq(i.f, i.d);
  }
  a.cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
}
// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
template<class cmov>
void Vgen::emit_cmov(const cmov& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we don't clobber i.t
    return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
  } else {
    prep(i.f, i.d);
  }
  a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));
}
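// Illustrative example (not in the original source): cmovq{CC_L, sf, f, t, d}
// with t == d and f != d is re-emitted as cmovq{CC_GE, sf, t, f, d}; the
// unconditional prep() move is then a no-op and cannot clobber the value
// needed when the original condition holds.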
void Vgen::emit(const cvtsi2sd& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}

void Vgen::emit(const cvtsi2sdm& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}
void Vgen::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a.frontier(), taken});
    a.jcc(i.cc, a.frontier());
  }
  emit(jmp{i.targets[0]});
}

void Vgen::emit(const jcci& i) {
  a.jcc(i.cc, i.taken);
  emit(jmp{i.target});
}
void Vgen::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a.frontier(), i.target});
  a.jmp(a.frontier());
}

void Vgen::emit(const jmpi& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.jmp(i.target);
  } else {
    // can't do a near jmp - use rip-relative addressing
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.jmp(rip[(intptr_t)addr]);
  }
}
void Vgen::emit(const lea& i) {
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});
  } else {
    a.lea(i.s, i.d);
  }
}

void Vgen::emit(const storebi& i) {
  prefix(a, i.m).storeb(i.s, i.m.mr());
}

void Vgen::emit(const storeqi& i) {
  prefix(a, i.m).storeq(i.s, i.m.mr());
}
template<typename Inst>
bool testimHelper(Vgen& env, const Inst& i, uint64_t mask) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  int off = 0;
  while (mask > 0xff && !(mask & 0xff)) {
    off++;
    mask >>= 8;
  }

  if (mask > 0xff) return false;

  env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
  return true;
}
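// Illustrative example (not in the original source): a testwim with the mask
// 0xff00 is shifted down to 0xff with off == 1, so it is emitted as
// testbim{0xff, s1 + 1, sf}, testing the same single byte in memory.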
void Vgen::emit(const testwim& i) {
  if (testimHelper(*this, i, i.s0.w())) return;
  a.testw(i.s0, i.s1);
}

void Vgen::emit(const testlim& i) {
  if (testimHelper(*this, i, i.s0.l())) return;
  a.testl(i.s0, i.s1);
}

void Vgen::emit(const testli& i) {
  if (i.s0.l() == -1) {
    return emit(testl{i.s1, i.s1, i.sf});
  }
  a.testl(i.s0, i.s1);
}

void Vgen::emit(const testqi& i) {
  auto const imm = i.s0.q();
  if (magFits(imm, sz::byte)) {
    a.testb(int8_t(imm), rbyte(i.s1));
  } else if (magFits(imm, sz::dword)) {
    emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
  } else if (imm == -1) {
    emit(testq{i.s1, i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}

void Vgen::emit(const testqim& i) {
  if (testimHelper(*this, i, i.s0.q())) return;
  if (magFits(i.s0.q(), sz::dword)) {
    // For an unsigned 32 bit immediate, we can get the same results
    // by emitting a testlim.
    emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}
void Vgen::emit(xorq i) {
  if (i.s0 == i.s1) {
    // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
    return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});
  }
  commuteSF(i);
  a.xorq(i.s0, i.d);
}
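// Added note (not in the original source): writing a 32-bit register
// zero-extends into the full 64-bit register, so the xorl form produces the
// same value with a shorter encoding (no REX.W prefix required).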
///////////////////////////////////////////////////////////////////////////////

template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
  vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}

template<typename Inst>
void lower(Vunit& unit, Inst& inst, Vlabel b, size_t i) {}
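// Added note (not in the original source): the unspecialized lower() template
// is deliberately a no-op -- instructions with no lowering rule are left in
// place for Vgen::emit(). The overloads below rewrite higher-level vasm
// instructions (popp, stublogue, sar, srem, ...) into x64-friendly sequences.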
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pop{inst.d0};
    v << pop{inst.d1};
  });
}

void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << popm{inst.d0};
    v << popm{inst.d1};
  });
}

void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << push{inst.s0};
    v << push{inst.s1};
  });
}

void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pushm{inst.s0};
    v << pushm{inst.s1};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) {
  if (inst.saveframe) {
    unit.blocks[b].code[i] = push{rvmfp()};
  } else {
    unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp};
  }
}

void lower(Vunit& unit, stubunwind& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, stubtophp& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = load{reg::rsp[8], inst.d};
}

void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]};
}

void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << callr{inst.target, inst.args};
    v << jmpi{inst.exittc};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << sarq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << shlq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rdx, inst.d};
  });
}

void lower(Vunit& unit, divint& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rax, inst.d};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Lower a few abstractions to facilitate straightforward x64 codegen.
 */
void lowerForX64(Vunit& unit) {
  vasm_lower(unit, [&] (const VLS& env, Vinstr& inst, Vlabel b, size_t i) {
    switch (inst.op) {
#define O(name, ...)                      \
      case Vinstr::name:                  \
        lower(unit, inst.name##_, b, i);  \
        break;

      VASM_OPCODES
#undef O
    }
  });
}
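// Illustrative expansion (not in the original source): for each opcode, the
// O() macro above produces a case such as
//   case Vinstr::popp: lower(unit, inst.popp_, b, i); break;
// so every instruction in the unit is routed through the matching lower()
// overload (or the no-op default).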
///////////////////////////////////////////////////////////////////////////////
}
void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) {
  Timer timer(Timer::vasm_optimize, unit.log_entry);

  removeTrivialNops(unit);
  optimizePhis(unit);
  fuseBranches(unit);
  optimizeJmps(unit);
  optimizeExits(unit);

  assertx(checkWidths(unit));

  if (unit.context && !isProfiling(unit.context->kind) && abi.canSpill &&
      RuntimeOption::EvalProfBranchSampleFreq > 0) {
    // Even when branch profiling is on, we still only want to profile
    // non-profiling translations of PHP functions. We also require that we
    // can spill, so that we can generate arbitrary profiling code, and also to
    // ensure we don't profile unique stubs and such.
    profile_branches(unit);
  }

  lowerForX64(unit);
  simplify(unit);
  lowerForX64(unit);

  if (!unit.constToReg.empty()) {
    foldImms<x64::ImmFolder>(unit);
  }

  optimizeCopies(unit, abi);

  if (unit.needsRegAlloc()) {
    removeDeadCode(unit);
    if (regalloc) allocateRegisters(unit, abi);
  }
  if (unit.blocks.size() > 1) {
    optimizeJmps(unit);
  }
}
void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups,
             AsmInfo* asmInfo) {
  vasm_emit<Vgen>(unit, text, fixups, asmInfo);
}

///////////////////////////////////////////////////////////////////////////////
}}