// [hiphop-php.git] hphp/runtime/vm/jit/vasm-x64.cpp
// As of commit "Don't do branch profiling for profiling translations".
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/vm/jit/vasm-emit.h"

#include "hphp/runtime/base/runtime-option.h"

#include "hphp/runtime/vm/jit/abi-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/code-gen-helpers.h"
#include "hphp/runtime/vm/jit/func-guard-x64.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/prof-data.h"
#include "hphp/runtime/vm/jit/service-requests.h"
#include "hphp/runtime/vm/jit/smashable-instr-x64.h"
#include "hphp/runtime/vm/jit/target-cache.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-internal.h"
#include "hphp/runtime/vm/jit/vasm-lower.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-prof.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"

#include <algorithm>
#include <tuple>
TRACE_SET_MOD(vasm);

namespace HPHP { namespace jit {
///////////////////////////////////////////////////////////////////////////////

using namespace reg;
using namespace x64;

namespace x64 { struct ImmFolder; }

namespace {
///////////////////////////////////////////////////////////////////////////////
struct Vgen {
  explicit Vgen(Venv& env)
    : env(env)
    , a(*env.cb)
    , current(env.current)
    , next(env.next)
    , jmps(env.jmps)
    , jccs(env.jccs)
    , catches(env.catches)
  {}

  static void patch(Venv& env);
  static void pad(CodeBlock& cb);

  /////////////////////////////////////////////////////////////////////////////

  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }
  // intrinsics
  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& i) { a.int3(); }
  void emit(const fallthru& i) {}
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const load& i);
  void emit(const store& i);
  void emit(const mcprep& i);

  // native function abi
  void emit(const call& i);
  void emit(const callm& i) { a.call(i.target); }
  void emit(const callr& i) { a.call(i.target); }
  void emit(const calls& i);
  void emit(const ret& i) { a.ret(); }

  // stub function abi
  void emit(const stubret& i);
  void emit(const callstub& i);
  void emit(const callfaststub& i);
  void emit(const tailcallstub& i);

  // php function abi
  void emit(const phpret& i);
  void emit(const tailcallphp& i);
  void emit(const callarray& i);
  void emit(const contenter& i);

  // vm entry abi
  void emit(const inittc& i) {}
  void emit(const calltc&);
  void emit(const leavetc&) { a.ret(); }

  // exceptions
  void emit(const landingpad& i) {}
  void emit(const nothrow& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);
  // instructions
  void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
  void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
  void emit(const andbim& i) { a.andb(i.s, i.m); }
  void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
  void emit(andqi i);
  void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
  void emit(const addlm& i) { a.addl(i.s0, i.m); }
  void emit(const addlim& i);
  void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
  void emit(const cloadq& i);
  template<class cmov> void emit_cmov(const cmov& i);
  void emit(const cmovb& i) { emit_cmov(i); }
  void emit(const cmovw& i) { emit_cmov(i); }
  void emit(const cmovl& i) { emit_cmov(i); }
  void emit(const cmovq& i) { emit_cmov(i); }
  void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbm& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpwim& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwm& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqm& i) { a.cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& i) { a.cqo(); }
  void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a.decl(i.d); }
  void emit(const declm& i) { a.decl(i.m); }
  void emit(decq i) { unary(i); a.decq(i.d); }
  void emit(const decqm& i) { a.decq(i.m); }
  void emit(const decqmlock& i) { a.lock(); a.decq(i.m); }
  void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
  void emit(const idiv& i) { a.idiv(i.s); }
  void emit(incl i) { unary(i); a.incl(i.d); }
  void emit(const inclm& i) { a.incl(i.m); }
  void emit(incq i) { unary(i); a.incq(i.d); }
  void emit(const incqm& i) { a.incq(i.m); }
  void emit(const incwm& i) { a.incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a.jmp(i.target); }
  void emit(const jmpm& i) { a.jmp(i.target); }
  void emit(const jmpi& i);
  void emit(const lea& i);
  void emit(const leap& i) { a.lea(i.s, i.d); }
  void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadups& i) { a.movups(i.s, i.d); }
  void emit(const loadtqb& i) { a.loadb(i.s, i.d); }
  void emit(const loadb& i) { a.loadb(i.s, i.d); }
  void emit(const loadw& i) { a.loadw(i.s, i.d); }
  void emit(const loadl& i) { a.loadl(i.s, i.d); }
  void emit(const loadqp& i) { a.loadq(i.s, i.d); }
  void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadsd& i) { a.movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a.loadzbl(i.s, i.d); }
  void emit(const loadzbq& i) { a.loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a.loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a.movb(i.s, i.d); }
  void emit(const movl& i) { a.movl(i.s, i.d); }
  void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
  void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
  void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
  void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a.neg(i.d); }
  void emit(const nop& i) { a.nop(); }
  void emit(not i) { unary(i); a.not(i.d); }
  void emit(notb i) { unary(i); a.notb(i.d); }
  void emit(const orbim& i) { a.orb(i.s0, i.m); }
  void emit(const orwim& i) { a.orw(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
  void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
  void emit(const orqim& i) { a.orq(i.s0, i.m); }
  void emit(const pop& i) { a.pop(i.d); }
  void emit(const popm& i) { a.pop(i.d); }
  void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
  void emit(const push& i) { a.push(i.s); }
  void emit(const pushm& i) { a.push(i.s); }
  void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
  void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
  void emit(const sarq& i) { unary(i); a.sarq(i.d); }
  void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
  void emit(const setcc& i) { a.setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a.shlq(i.d); }
  void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a.movups(i.s, i.m); }
  void emit(const storeb& i) { a.storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a.storel(i.s, i.m); }
  void emit(const storeli& i) { a.storel(i.s, i.m); }
  void emit(const storeqi& i);
  void emit(const storesd& i) { a.movsd(i.s, i.m); }
  void emit(const storew& i) { a.storew(i.s, i.m); }
  void emit(const storewi& i) { a.storew(i.s, i.m); }
  void emit(subbi i) { binary(i); a.subb(i.s0, i.d); }
  void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
  void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
  void emit(const testb& i) { a.testb(i.s0, i.s1); }
  void emit(const testbi& i) { a.testb(i.s0, i.s1); }
  void emit(const testbim& i) { a.testb(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a.testl(i.s0, i.s1); }
  void emit(const testli& i);
  void emit(const testlim& i);
  void emit(const testq& i) { a.testq(i.s0, i.s1); }
  void emit(const testqi& i);
  void emit(const testqm& i) { a.testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
  void emit(const ud2& i) { a.ud2(); }
  void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
  void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
  void emit(xorq i);
  void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
  void emit(const conjure& i) { always_assert(false); }
  void emit(const conjureuse& i) { always_assert(false); }
  void emit_nop() {
    emit(lea{rax[8], rax});
    emit(lea{rax[-8], rax});
  }

private:
  // helpers
  void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
  void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }

  void emit_simd_imm(int64_t, Vreg);

  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);

  CodeBlock& frozen() { return env.text.frozen().code; }
private:
  Venv& env;
  X64Assembler a;

  const Vlabel current;
  const Vlabel next;
  jit::vector<Venv::LabelPatch>& jmps;
  jit::vector<Venv::LabelPatch>& jccs;
  jit::vector<Venv::LabelPatch>& catches;
};
///////////////////////////////////////////////////////////////////////////////

/*
 * Prepare a binary op that is not commutative.
 *
 * s0 must be a different register than s1 so we don't clobber it.
 */
template<class Inst> void Vgen::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
  binary(i);
}

/*
 * Prepare a binary op that is commutative.
 *
 * Swap operands if the dest is s0.
 */
template<class Inst> void Vgen::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};
  } else {
    binary(i);
  }
}

template<class Inst> void Vgen::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};
  } else {
    binary(i);
  }
}

/*
 * Helper for emitting instructions whose Vptr operand specifies a segment.
 */
X64Assembler& prefix(X64Assembler& a, const Vptr& ptr) {
  if (ptr.seg == Vptr::Segment::FS) {
    a.fs();
  } else if (ptr.seg == Vptr::Segment::GS) {
    a.gs();
  }
  return a;
}
///////////////////////////////////////////////////////////////////////////////

/*
 * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 * will also be taken.
 */
bool ccImplies(ConditionCode a, ConditionCode b) {
  if (a == b) return true;

  switch (a) {
    case CC_None:
    case CC_O:  case CC_NO:
    case CC_AE: case CC_BE:
    case CC_NE:
    case CC_S:  case CC_NS:
    case CC_P:  case CC_NP:
    case CC_GE: case CC_LE:
      return false;

    case CC_B: return b == CC_BE;
    case CC_E: return b == CC_BE || b == CC_LE;
    case CC_A: return b == CC_AE || b == CC_NE;
    case CC_L: return b == CC_LE;
    case CC_G: return b == CC_NE || b == CC_GE;
  }
  always_assert(false);
}
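// Illustrative examples (not in the original source): ccImplies(CC_E, CC_LE)
// is true, since ZF == 1 satisfies both "equal" and "less-or-equal", while
// ccImplies(CC_LE, CC_L) is false because "less-or-equal" can hold via
// ZF == 1 with SF == OF, in which case a j<l> would not be taken.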
static CodeAddress toReal(Venv& env, CodeAddress a) {
  if (env.text.main().code.contains(a)) {
    return env.text.main().code.toDestAddress(a);
  }
  if (env.text.cold().code.contains(a)) {
    return env.text.cold().code.toDestAddress(a);
  }
  if (env.text.frozen().code.contains(a)) {
    return env.text.frozen().code.toDestAddress(a);
  }
  return a;
}
/*
 * When two jccs go to the same destination, the cc of the first is compatible
 * with the cc of the second, and they're within a one-byte offset of each
 * other, retarget the first to jump to the second. This will allow the
 * relocator to shrink the first one, and the extra jmp shouldn't matter since
 * we try to only do this to rarely taken jumps.
 */
void retargetJumps(Venv& env,
                   const jit::hash_map<TCA, jit::vector<TCA>>& jccs) {
  jit::hash_set<TCA> retargeted;
  for (auto& pair : jccs) {
    auto const& jmps = pair.second;
    if (jmps.size() < 2) continue;

    for (size_t i = 0; i < jmps.size(); ++i) {
      DecodedInstruction di(toReal(env, jmps[i]), jmps[i]);
      // Don't bother if the jump is already a short jump.
      if (di.size() != 6) continue;

      for (size_t j = jmps.size() - 1; j > i; --j) {
        auto const delta = jmps[j] - jmps[i] + 2;
        // Backwards jumps are probably not guards, and don't retarget to a
        // dest that's more than a one-byte offset away.
        if (delta < 0 || !deltaFits(delta, sz::byte)) continue;

        DecodedInstruction dj(toReal(env, jmps[j]), jmps[j]);
        if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;

        di.setPicAddress(jmps[j]);
        retargeted.insert(jmps[i]);

        // We might've converted a smashable jump to a regular in-unit jump, so
        // remove any smashable alignments.
        auto range = env.meta.alignments.equal_range(jmps[i]);
        while (range.first != range.second) {
          auto iter = range.first;
          ++range.first;

          auto& align = iter->second;
          if (align.first == Alignment::SmashJcc &&
              align.second == AlignContext::Live) {
            env.meta.alignments.erase(iter);
          }
        }

        break;
      }
    }
  }

  // Finally, remove any retargeted jmps from inProgressTailJumps.
  if (!retargeted.empty()) {
    GrowableVector<IncomingBranch> newTailJumps;
    for (auto& jmp : env.meta.inProgressTailJumps) {
      if (retargeted.count(jmp.toSmash()) == 0) {
        newTailJumps.push_back(jmp);
      }
    }
    env.meta.inProgressTailJumps.swap(newTailJumps);
  }
}
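// Illustrative sketch (not in the original source): given two 6-byte jccs to
// the same label L within a one-byte displacement of each other,
//   jcc<e> L   ...   jcc<le> L
// the first is retargeted to jump to the second (CC_E implies CC_LE); the
// relocator can then shrink the first to a 2-byte jcc, and a rarely taken
// first branch just pays one extra hop through the second.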
void Vgen::patch(Venv& env) {
  for (auto& p : env.jmps) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJmp(toReal(env, p.instr), p.instr, env.addrs[p.target]);
  }

  auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
  jit::hash_map<TCA, jit::vector<TCA>> jccs;
  for (auto& p : env.jccs) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJcc(toReal(env, p.instr), p.instr, env.addrs[p.target]);
    if (optLevel >= 2 ||
        (optLevel == 1 && p.target >= env.unit.blocks.size())) {
      jccs[env.addrs[p.target]].emplace_back(p.instr);
    }
  }

  if (!jccs.empty()) retargetJumps(env, jccs);
}
void Vgen::pad(CodeBlock& cb) {
  X64Assembler a { cb };
  while (a.available() >= 2) a.ud2();
  if (a.available() > 0) a.int3();
  assertx(a.available() == 0);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const copy& i) {
  if (i.s == i.d) return;
  if (i.s.isGP()) {
    if (i.d.isGP()) {                 // GP => GP
      a.movq(i.s, i.d);
    } else {                          // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register
      a.movq_rx(i.s, i.d);
    }
  } else {
    if (i.d.isGP()) {                 // XMM => GP
      a.movq_xr(i.s, i.d);
    } else {                          // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls
      a.movdqa(i.s, i.d);
    }
  }
}
void Vgen::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
  assertx(d0 != d1);
  if (d0 == s1) {
    if (d1 == s0) {
      a.xchgq(d0, d1);
    } else {
      // could do this in a simplify pass
      if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
      if (s0 != d0) a.movq(s0, d0);
    }
  } else {
    // could do this in a simplify pass
    if (s0 != d0) a.movq(s0, d0);
    if (s1 != d1) a.movq(s1, d1);
  }
}
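// Illustrative example (not in the original source): copy2{rax, rcx, rcx, rax}
// has d0 == s1 and d1 == s0, so it is emitted as a single xchgq; a pair of
// movqs would clobber one of the sources.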
void Vgen::emit_simd_imm(int64_t val, Vreg d) {
  if (val == 0) {
    a.pxor(d, d); // does not modify flags
  } else {
    auto addr = alloc_literal(env, val);
    a.movsd(rip[(intptr_t)addr], d);
  }
}
void Vgen::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
  auto val = i.s.ub();
  if (i.d.isGP()) {
    Vreg8 d8 = i.d;
    a.movb(static_cast<int8_t>(val), d8);
  } else {
    emit_simd_imm(val, i.d);
  }
}

void Vgen::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
  auto val = i.s.l();
  if (i.d.isGP()) {
    Vreg32 d32 = i.d;
    a.movl(val, d32);
  } else {
    emit_simd_imm(uint32_t(val), i.d);
  }
}
void Vgen::emit(const ldimmq& i) {
  auto val = i.s.q();
  if (i.d.isGP()) {
    if (val == 0) {
      Vreg32 d32 = i.d;
      a.movl(0, d32); // because emitImmReg tries the xor optimization
    } else {
      a.emitImmReg(i.s, i.d);
    }
  } else {
    emit_simd_imm(val, i.d);
  }
}
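// Added note (not in the original source): the ldimm* variants appear careful
// not to clobber the status flags -- pxor and mov are used rather than the
// shorter xor-with-self zeroing idiom -- presumably so a constant can be
// materialized between a flag-setting instruction and its consumer.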
void Vgen::emit(const load& i) {
  prefix(a, i.s);
  auto mref = i.s.mr();
  if (i.d.isGP()) {
    a.loadq(mref, i.d);
  } else {
    assertx(i.d.isSIMD());
    a.movsd(mref, i.d);
  }
}

void Vgen::emit(const store& i) {
  if (i.s.isGP()) {
    a.storeq(i.s, i.d);
  } else {
    assertx(i.s.isSIMD());
    a.movsd(i.s, i.d);
  }
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const mcprep& i) {
  /*
   * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
   * address of the movq) so that we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * handlePrimeCacheInit can tell it's not been smashed yet
   */
  auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
  auto const imm = reinterpret_cast<uint64_t>(mov_addr);
  smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);

  env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
}
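// Illustrative example (not in the original source): if the smashable movq is
// emitted at 0x1000, the cache is primed with (0x1000 << 1) | 1 == 0x2001.
// The set low bit marks the cache as not yet smashed, and the handler can
// recover the movq's address by shifting the value right by one.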
///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const call& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.call(i.target);
  } else {
    // can't do a near call; store address in data section.
    // call by loading the address using rip-relative addressing. This
    // assumes the data section is near the current code section. Since
    // this sequence is directly in-line, rip-relative like this is
    // more compact than loading a 64-bit immediate.
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.call(rip[(intptr_t)addr]);
  }
  if (i.watch) {
    *i.watch = a.frontier();
    env.meta.watchpoints.push_back(i.watch);
  }
}

void Vgen::emit(const calls& i) {
  emitSmashableCall(a.code(), env.meta, i.target);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const stubret& i) {
  if (i.saveframe) {
    a.pop(rvmfp());
  } else {
    a.addq(8, reg::rsp);
  }
  a.ret();
}

void Vgen::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const callfaststub& i) {
  emit(call{i.target, i.args});
  emit(syncpoint{i.fix});
}

void Vgen::emit(const tailcallstub& i) {
  a.addq(8, reg::rsp);
  emit(jmpi{i.target, i.args});
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const phpret& i) {
  a.push(i.fp[AROFF(m_savedRip)]);
  if (!i.noframe) {
    a.loadq(i.fp[AROFF(m_sfp)], i.d);
  }
  a.ret();
}

void Vgen::emit(const tailcallphp& i) {
  emit(pushm{i.fp[AROFF(m_savedRip)]});
  emit(jmpr{i.target, i.args});
}

void Vgen::emit(const callarray& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const contenter& i) {
  Label Stub, End;
  Reg64 fp = i.fp, target = i.target;
  a.jmp8(End);

  asm_label(a, Stub);
  a.pop(fp[AROFF(m_savedRip)]);
  a.jmp(target);

  asm_label(a, End);
  a.call(Stub);
  // m_savedRip will point here.
  emit(unwind{{i.targets[0], i.targets[1]}});
}
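// Added note (not in the original source): the call pushes the address of the
// instruction that follows it, the stub pops that address straight into the
// ActRec's m_savedRip slot, and then jumps into the callee -- so a later
// phpret through this frame resumes at the point marked above.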
///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const calltc& i) {
  a.push(i.exittc);
  a.push(i.fp[AROFF(m_savedRip)]);

  Label stub;
  a.call(stub);

  asm_label(a, stub);
  assertx(!i.args.contains(reg::rax));
  a.pop(reg::rax); // unused
  a.jmp(i.target);
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const nothrow& i) {
  env.meta.catches.emplace_back(a.frontier(), nullptr);
}

void Vgen::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {} {}\n", a.frontier(),
         i.fix.pcOffset, i.fix.spOffset);
  env.meta.fixups.emplace_back(a.frontier(), i.fix);
}

void Vgen::emit(const unwind& i) {
  catches.push_back({a.frontier(), i.targets[1]});
  emit(jmp{i.targets[0]});
}

///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(andqi i) {
  if (magFits(i.s0.q(), sz::dword)) {
    emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});
    return;
  }

  binary(i);
  a.andq(i.s0, i.d);
}
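// Illustrative example (not in the original source): andqi{0xff, s, d} has an
// immediate whose upper 32 bits are zero, so it is narrowed to an andli; the
// 32-bit result zero-extends into d, matching the 64-bit result.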
void Vgen::emit(const addlim& i) {
  prefix(a, i.m).addl(i.s0, i.m.mr());
}

void Vgen::emit(const addqim& i) {
  prefix(a, i.m).addq(i.s0, i.m.mr());
}

void Vgen::emit(const cloadq& i) {
  auto m = i.t;
  always_assert(!m.index.isValid()); // not supported, but could be later.
  if (i.f != i.d) {
    if (i.d == m.base) {
      // We can't move f over d or we'll clobber the Vptr we need to load from.
      // Since cload does the load unconditionally anyway, we can just load and
      // cmov.
      a.loadq(i.t, i.d);
      a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
      return;
    }
    a.movq(i.f, i.d);
  }
  a.cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
}
// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
template<class cmov>
void Vgen::emit_cmov(const cmov& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we don't clobber i.t
    return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
  } else {
    prep(i.f, i.d);
  }
  a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));
}
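// Illustrative example (not in the original source): cmovq{CC_L, sf, f, t, d}
// with t == d and f != d is re-emitted as cmovq{CC_GE, sf, t, f, d}; the
// unconditional prep() move is then a no-op and cannot clobber the value
// needed when the original condition holds.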
void Vgen::emit(const cvtsi2sd& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}

void Vgen::emit(const cvtsi2sdm& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}
void Vgen::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a.frontier(), taken});
    a.jcc(i.cc, a.frontier());
  }
  emit(jmp{i.targets[0]});
}

void Vgen::emit(const jcci& i) {
  a.jcc(i.cc, i.taken);
  emit(jmp{i.target});
}
void Vgen::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a.frontier(), i.target});
  a.jmp(a.frontier());
}

void Vgen::emit(const jmpi& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.jmp(i.target);
  } else {
    // can't do a near jmp - use rip-relative addressing
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.jmp(rip[(intptr_t)addr]);
  }
}
void Vgen::emit(const lea& i) {
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});
  } else {
    a.lea(i.s, i.d);
  }
}

void Vgen::emit(const storebi& i) {
  prefix(a, i.m).storeb(i.s, i.m.mr());
}

void Vgen::emit(const storeqi& i) {
  prefix(a, i.m).storeq(i.s, i.m.mr());
}
template<typename Inst>
bool testimHelper(Vgen& env, const Inst& i, uint64_t mask) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  int off = 0;
  while (mask > 0xff && !(mask & 0xff)) {
    off++;
    mask >>= 8;
  }

  if (mask > 0xff) return false;

  env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
  return true;
}
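// Illustrative example (not in the original source): a testwim with the mask
// 0xff00 is shifted down to 0xff with off == 1, so it is emitted as
// testbim{0xff, s1 + 1, sf}, testing the same single byte in memory.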
void Vgen::emit(const testwim& i) {
  if (testimHelper(*this, i, i.s0.w())) return;
  a.testw(i.s0, i.s1);
}

void Vgen::emit(const testlim& i) {
  if (testimHelper(*this, i, i.s0.l())) return;
  a.testl(i.s0, i.s1);
}

void Vgen::emit(const testli& i) {
  if (i.s0.l() == -1) {
    return emit(testl{i.s1, i.s1, i.sf});
  }
  a.testl(i.s0, i.s1);
}

void Vgen::emit(const testqi& i) {
  auto const imm = i.s0.q();
  if (magFits(imm, sz::byte)) {
    a.testb(int8_t(imm), rbyte(i.s1));
  } else if (magFits(imm, sz::dword)) {
    emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
  } else if (imm == -1) {
    emit(testq{i.s1, i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}

void Vgen::emit(const testqim& i) {
  if (testimHelper(*this, i, i.s0.q())) return;
  if (magFits(i.s0.q(), sz::dword)) {
    // For an unsigned 32 bit immediate, we can get the same results
    // by emitting a testlim.
    emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}
void Vgen::emit(xorq i) {
  if (i.s0 == i.s1) {
    // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
    return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});
  }
  commuteSF(i);
  a.xorq(i.s0, i.d);
}
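// Added note (not in the original source): writing a 32-bit register
// zero-extends into the full 64-bit register, so the xorl form produces the
// same value with a shorter encoding (no REX.W prefix required).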
///////////////////////////////////////////////////////////////////////////////

template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
  vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}

template<typename Inst>
void lower(Vunit& unit, Inst& inst, Vlabel b, size_t i) {}
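// Added note (not in the original source): the unspecialized lower() template
// is deliberately a no-op -- instructions with no lowering rule are left in
// place for Vgen::emit(). The overloads below rewrite higher-level vasm
// instructions (popp, stublogue, sar, srem, ...) into x64-friendly sequences.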
///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pop{inst.d0};
    v << pop{inst.d1};
  });
}

void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << popm{inst.d0};
    v << popm{inst.d1};
  });
}

void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << push{inst.s0};
    v << push{inst.s1};
  });
}

void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pushm{inst.s0};
    v << pushm{inst.s1};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) {
  if (inst.saveframe) {
    unit.blocks[b].code[i] = push{rvmfp()};
  } else {
    unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp};
  }
}

void lower(Vunit& unit, stubunwind& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, stubtophp& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = load{reg::rsp[8], inst.d};
}

void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]};
}

void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << callr{inst.target, inst.args};
    v << jmpi{inst.exittc};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << sarq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << shlq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rdx, inst.d};
  });
}

void lower(Vunit& unit, divint& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rax, inst.d};
  });
}

///////////////////////////////////////////////////////////////////////////////
void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Lower a few abstractions to facilitate straightforward x64 codegen.
 */
void lowerForX64(Vunit& unit) {
  vasm_lower(unit, [&] (const VLS& env, Vinstr& inst, Vlabel b, size_t i) {
    switch (inst.op) {
#define O(name, ...)                      \
      case Vinstr::name:                  \
        lower(unit, inst.name##_, b, i);  \
        break;

      VASM_OPCODES
#undef O
    }
  });
}
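// Illustrative expansion (not in the original source): for each opcode, the
// O() macro above produces a case such as
//   case Vinstr::popp: lower(unit, inst.popp_, b, i); break;
// so every instruction in the unit is routed through the matching lower()
// overload (or the no-op default).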
///////////////////////////////////////////////////////////////////////////////
}
void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) {
  Timer timer(Timer::vasm_optimize, unit.log_entry);

  removeTrivialNops(unit);
  optimizePhis(unit);
  fuseBranches(unit);
  optimizeJmps(unit);
  optimizeExits(unit);

  assertx(checkWidths(unit));

  if (unit.context && !isProfiling(unit.context->kind) && abi.canSpill &&
      RuntimeOption::EvalProfBranchSampleFreq > 0) {
    // Even when branch profiling is on, we still only want to profile
    // non-profiling translations of PHP functions. We also require that we
    // can spill, so that we can generate arbitrary profiling code, and also to
    // ensure we don't profile unique stubs and such.
    profile_branches(unit);
  }

  lowerForX64(unit);
  simplify(unit);
  lowerForX64(unit);

  if (!unit.constToReg.empty()) {
    foldImms<x64::ImmFolder>(unit);
  }

  optimizeCopies(unit, abi);

  if (unit.needsRegAlloc()) {
    removeDeadCode(unit);
    if (regalloc) allocateRegisters(unit, abi);
  }
  if (unit.blocks.size() > 1) {
    optimizeJmps(unit);
  }
}
void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups,
             AsmInfo* asmInfo) {
  vasm_emit<Vgen>(unit, text, fixups, asmInfo);
}

///////////////////////////////////////////////////////////////////////////////
}}