/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#include "hphp/runtime/vm/jit/vasm-emit.h"

#include "hphp/runtime/base/runtime-option.h"

#include "hphp/runtime/vm/jit/abi-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/code-gen-helpers.h"
#include "hphp/runtime/vm/jit/func-guard-x64.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/prof-data.h"
#include "hphp/runtime/vm/jit/service-requests.h"
#include "hphp/runtime/vm/jit/smashable-instr-x64.h"
#include "hphp/runtime/vm/jit/target-cache.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-internal.h"
#include "hphp/runtime/vm/jit/vasm-lower.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-prof.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"

namespace HPHP { namespace jit {

///////////////////////////////////////////////////////////////////////////////

namespace x64 { struct ImmFolder; }

///////////////////////////////////////////////////////////////////////////////

struct Vgen {
  explicit Vgen(Venv& env)
    , current(env.current)
    , catches(env.catches)

  static void patch(Venv& env);
  static void pad(CodeBlock& cb);

  /////////////////////////////////////////////////////////////////////////////

  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }

  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& i) { a.int3(); }
  void emit(const fallthru& i) {}
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const load& i);
  void emit(const store& i);
  void emit(const mcprep& i);

  // native function abi
  void emit(const call& i);
  void emit(const callm& i) { a.call(i.target); }
  void emit(const callr& i) { a.call(i.target); }
  void emit(const calls& i);
  void emit(const ret& i) { a.ret(); }

  void emit(const stubret& i);
  void emit(const callstub& i);
  void emit(const callfaststub& i);
  void emit(const tailcallstub& i);

  void emit(const phpret& i);
  void emit(const tailcallphp& i);
  void emit(const callarray& i);
  void emit(const contenter& i);

  void emit(const inittc& i) {}
  void emit(const calltc&);
  void emit(const leavetc&) { a.ret(); }

  void emit(const landingpad& i) {}
  void emit(const nothrow& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);

  void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
  void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
  void emit(const andbim& i) { a.andb(i.s, i.m); }
  void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
  void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
  void emit(const addlm& i) { a.addl(i.s0, i.m); }
  void emit(const addlim& i);
  void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
  void emit(const cloadq& i);
  template<class cmov> void emit_cmov(const cmov& i);
  void emit(const cmovb& i) { emit_cmov(i); }
  void emit(const cmovw& i) { emit_cmov(i); }
  void emit(const cmovl& i) { emit_cmov(i); }
  void emit(const cmovq& i) { emit_cmov(i); }
  void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbm& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpwim& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwm& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqm& i) { a.cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& i) { a.cqo(); }
  void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a.decl(i.d); }
  void emit(const declm& i) { a.decl(i.m); }
  void emit(decq i) { unary(i); a.decq(i.d); }
  void emit(const decqm& i) { a.decq(i.m); }
  void emit(const decqmlock& i) { a.lock(); a.decq(i.m); }
  void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
  void emit(const idiv& i) { a.idiv(i.s); }
  void emit(incl i) { unary(i); a.incl(i.d); }
  void emit(const inclm& i) { a.incl(i.m); }
  void emit(incq i) { unary(i); a.incq(i.d); }
  void emit(const incqm& i) { a.incq(i.m); }
  void emit(const incwm& i) { a.incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a.jmp(i.target); }
  void emit(const jmpm& i) { a.jmp(i.target); }
  void emit(const jmpi& i);
  void emit(const lea& i);
  void emit(const leap& i) { a.lea(i.s, i.d); }
  void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadups& i) { a.movups(i.s, i.d); }
  void emit(const loadtqb& i) { a.loadb(i.s, i.d); }
  void emit(const loadb& i) { a.loadb(i.s, i.d); }
  void emit(const loadw& i) { a.loadw(i.s, i.d); }
  void emit(const loadl& i) { a.loadl(i.s, i.d); }
  void emit(const loadqp& i) { a.loadq(i.s, i.d); }
  void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadsd& i) { a.movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a.loadzbl(i.s, i.d); }
  void emit(const loadzbq& i) { a.loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a.loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a.movb(i.s, i.d); }
  void emit(const movl& i) { a.movl(i.s, i.d); }
  void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
  void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
  void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
  void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a.neg(i.d); }
  void emit(const nop& i) { a.nop(); }
  void emit(not i) { unary(i); a.not(i.d); }
  void emit(notb i) { unary(i); a.notb(i.d); }
  void emit(const orbim& i) { a.orb(i.s0, i.m); }
  void emit(const orwim& i) { a.orw(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
  void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
  void emit(const orqim& i) { a.orq(i.s0, i.m); }
  void emit(const pop& i) { a.pop(i.d); }
  void emit(const popm& i) { a.pop(i.d); }
  void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
  void emit(const push& i) { a.push(i.s); }
  void emit(const pushm& i) { a.push(i.s); }
  void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
  void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
  void emit(const sarq& i) { unary(i); a.sarq(i.d); }
  void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
  void emit(const setcc& i) { a.setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a.shlq(i.d); }
  void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a.movups(i.s, i.m); }
  void emit(const storeb& i) { a.storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a.storel(i.s, i.m); }
  void emit(const storeli& i) { a.storel(i.s, i.m); }
  void emit(const storeqi& i);
  void emit(const storesd& i) { a.movsd(i.s, i.m); }
  void emit(const storew& i) { a.storew(i.s, i.m); }
  void emit(const storewi& i) { a.storew(i.s, i.m); }
  void emit(subbi i) { binary(i); a.subb(i.s0, i.d); }
  void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
  void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
  void emit(const testb& i) { a.testb(i.s0, i.s1); }
  void emit(const testbi& i) { a.testb(i.s0, i.s1); }
  void emit(const testbim& i) { a.testb(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a.testl(i.s0, i.s1); }
  void emit(const testli& i);
  void emit(const testlim& i);
  void emit(const testq& i) { a.testq(i.s0, i.s1); }
  void emit(const testqi& i);
  void emit(const testqm& i) { a.testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
  void emit(const ud2& i) { a.ud2(); }
  void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
  void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
  void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
  void emit(const conjure& i) { always_assert(false); }
  void emit(const conjureuse& i) { always_assert(false); }

    emit(lea{rax[8], rax});
    emit(lea{rax[-8], rax});

  void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
  void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }
  void emit_simd_imm(int64_t, Vreg);

  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);

  CodeBlock& frozen() { return env.text.frozen().code; }

  const Vlabel current;

  jit::vector<Venv::LabelPatch>& jmps;
  jit::vector<Venv::LabelPatch>& jccs;
  jit::vector<Venv::LabelPatch>& catches;
};

///////////////////////////////////////////////////////////////////////////////

/*
 * Prepare a binary op that is not commutative.
 *
 * s0 must be a different register than s1 so we don't clobber it.
 */
template<class Inst> void Vgen::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0

/*
 * Prepare a binary op that is commutative.
 *
 * Swap operands if the dest is s0.
 */
template<class Inst> void Vgen::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};

template<class Inst> void Vgen::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};

/*
 * Helper for emitting instructions whose Vptr operand specifies a segment.
 */
X64Assembler& prefix(X64Assembler& a, const Vptr& ptr) {
  if (ptr.seg == Vptr::Segment::FS) {
  } else if (ptr.seg == Vptr::Segment::GS) {
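    // The FS/GS arms apply the corresponding segment-override prefix to the
    // assembler before it is returned, so callers can chain the memory op
    // directly, e.g. prefix(a, i.m).addl(i.s0, i.m.mr()) in the addlim
    // emitter below.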

///////////////////////////////////////////////////////////////////////////////

/*
 * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 * will also be taken.
 */
bool ccImplies(ConditionCode a, ConditionCode b) {
  if (a == b) return true;

    case CC_O: case CC_NO:
    case CC_AE: case CC_BE:
    case CC_S: case CC_NS:
    case CC_P: case CC_NP:
    case CC_GE: case CC_LE:

    case CC_B: return b == CC_BE;
    case CC_E: return b == CC_BE || b == CC_LE;
    case CC_A: return b == CC_AE || b == CC_NE;
    case CC_L: return b == CC_LE;
    case CC_G: return b == CC_NE || b == CC_GE;

  always_assert(false);
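  // e.g. CC_E (ZF set) implies both CC_BE and CC_LE, since "equal" satisfies
  // the unsigned "below or equal" and the signed "less or equal" conditions.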

static CodeAddress toReal(Venv& env, CodeAddress a) {
  if (env.text.main().code.contains(a)) {
    return env.text.main().code.toDestAddress(a);
  }
  if (env.text.cold().code.contains(a)) {
    return env.text.cold().code.toDestAddress(a);
  }
  if (env.text.frozen().code.contains(a)) {
    return env.text.frozen().code.toDestAddress(a);
  }

/*
 * When two jccs go to the same destination, the cc of the first is compatible
 * with the cc of the second, and they're within a one-byte offset of each
 * other, retarget the first to jump to the second. This will allow the
 * relocator to shrink the first one, and the extra jmp shouldn't matter since
 * we try to only do this to rarely taken jumps.
 */
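// Retargeting the earlier jcc at the later one is safe because ccImplies()
// requires that whenever the earlier branch is taken, the later branch's
// condition holds as well, so control still reaches the shared destination,
// just via one extra short jump.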
void retargetJumps(Venv& env,
                   const jit::hash_map<TCA, jit::vector<TCA>>& jccs) {
  jit::hash_set<TCA> retargeted;
  for (auto& pair : jccs) {
    auto const& jmps = pair.second;
    if (jmps.size() < 2) continue;

    for (size_t i = 0; i < jmps.size(); ++i) {
      DecodedInstruction di(toReal(env, jmps[i]), jmps[i]);
      // Don't bother if the jump is already a short jump.
      if (di.size() != 6) continue;

      for (size_t j = jmps.size() - 1; j > i; --j) {
        auto const delta = jmps[j] - jmps[i] + 2;
        // Backwards jumps are probably not guards, and don't retarget to a
        // dest that's more than a one-byte offset away.
        if (delta < 0 || !deltaFits(delta, sz::byte)) continue;

        DecodedInstruction dj(toReal(env, jmps[j]), jmps[j]);
        if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;

        di.setPicAddress(jmps[j]);
        retargeted.insert(jmps[i]);

        // We might've converted a smashable jump to a regular in-unit jump, so
        // remove any smashable alignments.
        auto range = env.meta.alignments.equal_range(jmps[i]);
        while (range.first != range.second) {
          auto iter = range.first;

          auto& align = iter->second;
          if (align.first == Alignment::SmashJcc &&
              align.second == AlignContext::Live) {
            env.meta.alignments.erase(iter);

  // Finally, remove any retargeted jmps from inProgressTailJumps.
  if (!retargeted.empty()) {
    GrowableVector<IncomingBranch> newTailJumps;
    for (auto& jmp : env.meta.inProgressTailJumps) {
      if (retargeted.count(jmp.toSmash()) == 0) {
        newTailJumps.push_back(jmp);

    env.meta.inProgressTailJumps.swap(newTailJumps);

void Vgen::patch(Venv& env) {
  for (auto& p : env.jmps) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJmp(toReal(env, p.instr), p.instr, env.addrs[p.target]);
  }

  auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
  jit::hash_map<TCA, jit::vector<TCA>> jccs;
  for (auto& p : env.jccs) {
    assertx(env.addrs[p.target]);
    X64Assembler::patchJcc(toReal(env, p.instr), p.instr, env.addrs[p.target]);
        (optLevel == 1 && p.target >= env.unit.blocks.size())) {
      jccs[env.addrs[p.target]].emplace_back(p.instr);

  if (!jccs.empty()) retargetJumps(env, jccs);
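
// pad() fills whatever is left of the code block with ud2 (two bytes each),
// plus a final int3 if a single odd byte remains, so that straying into the
// padded area faults immediately instead of executing garbage.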
void Vgen::pad(CodeBlock& cb) {
  X64Assembler a { cb };
  while (a.available() >= 2) a.ud2();
  if (a.available() > 0) a.int3();
  assertx(a.available() == 0);
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const copy& i) {
  if (i.s == i.d) return;

    if (i.d.isGP()) { // GP => GP

    } else { // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register

    if (i.d.isGP()) { // XMM => GP

    } else { // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls

void Vgen::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;

    // could do this in a simplify pass
    if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
    if (s0 != d0) a.movq(s0, d0);

    // could do this in a simplify pass
    if (s0 != d0) a.movq(s0, d0);
    if (s1 != d1) a.movq(s1, d1);
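
// Materialize a 64-bit immediate into a SIMD register: zero is synthesized
// with pxor (which leaves the status flags untouched); any other value is
// placed in the unit's literal area and loaded RIP-relative.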
void Vgen::emit_simd_imm(int64_t val, Vreg d) {
    a.pxor(d, d); // does not modify flags

    auto addr = alloc_literal(env, val);
    a.movsd(rip[(intptr_t)addr], d);

void Vgen::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
    a.movb(static_cast<int8_t>(val), d8);

    emit_simd_imm(val, i.d);

void Vgen::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
    emit_simd_imm(uint32_t(val), i.d);

void Vgen::emit(const ldimmq& i) {
    a.movl(0, d32); // because emitImmReg tries the xor optimization

    a.emitImmReg(i.s, i.d);

    emit_simd_imm(val, i.d);

void Vgen::emit(const load& i) {
  auto mref = i.s.mr();

    assertx(i.d.isSIMD());

void Vgen::emit(const store& i) {
    assertx(i.s.isSIMD());

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const mcprep& i) {
  /*
   * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
   * address of the movq) so that we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * handlePrimeCacheInit can tell it's not been smashed yet.
   */
  auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
  auto const imm = reinterpret_cast<uint64_t>(mov_addr);
  smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);

  env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
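
  // For example, a movq emitted at address 0x3000 primes its cache word with
  // (0x3000 << 1) | 1 == 0x6001; the complemented address (~imm) is what gets
  // recorded in addressImmediates above.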

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const call& i) {
  if (a.jmpDeltaFits(i.target)) {

    // Can't do a near call; store the address in the data section and call
    // by loading it with rip-relative addressing. This assumes the data
    // section is near the current code section. Since this sequence is
    // directly in-line, rip-relative like this is more compact than loading
    // a 64-bit immediate.
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.call(rip[(intptr_t)addr]);

    *i.watch = a.frontier();
    env.meta.watchpoints.push_back(i.watch);

void Vgen::emit(const calls& i) {
  emitSmashableCall(a.code(), env.meta, i.target);
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const stubret& i) {

void Vgen::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const callfaststub& i) {
  emit(call{i.target, i.args});
  emit(syncpoint{i.fix});
}

void Vgen::emit(const tailcallstub& i) {

  emit(jmpi{i.target, i.args});
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const phpret& i) {
  a.push(i.fp[AROFF(m_savedRip)]);

    a.loadq(i.fp[AROFF(m_sfp)], i.d);

void Vgen::emit(const tailcallphp& i) {
  emit(pushm{i.fp[AROFF(m_savedRip)]});
  emit(jmpr{i.target, i.args});
}

void Vgen::emit(const callarray& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const contenter& i) {
  Reg64 fp = i.fp, target = i.target;

  a.pop(fp[AROFF(m_savedRip)]);

  // m_savedRip will point here.
  emit(unwind{{i.targets[0], i.targets[1]}});
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const calltc& i) {
  a.push(i.fp[AROFF(m_savedRip)]);

  assertx(!i.args.contains(reg::rax));
  a.pop(reg::rax); // unused

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const nothrow& i) {
  env.meta.catches.emplace_back(a.frontier(), nullptr);
}

void Vgen::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {} {}\n", a.frontier(),
         i.fix.pcOffset, i.fix.spOffset);
  env.meta.fixups.emplace_back(a.frontier(), i.fix);
}

void Vgen::emit(const unwind& i) {
  catches.push_back({a.frontier(), i.targets[1]});
  emit(jmp{i.targets[0]});
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(andqi i) {
  if (magFits(i.s0.q(), sz::dword)) {
    emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});

void Vgen::emit(const addlim& i) {
  prefix(a, i.m).addl(i.s0, i.m.mr());
}

void Vgen::emit(const addqim& i) {
  prefix(a, i.m).addq(i.s0, i.m.mr());
}

void Vgen::emit(const cloadq& i) {
  always_assert(!m.index.isValid()); // not supported, but could be later.

    // We can't move f over d or we'll clobber the Vptr we need to load from.
    // Since cload does the load unconditionally anyway, we can just load and
    // then cmov.
    a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);

  a.cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);

// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
template<class cmov>
void Vgen::emit_cmov(const cmov& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we don't clobber i.t
    return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});

  a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));

void Vgen::emit(const cvtsi2sd& i) {

  a.cvtsi2sd(i.s, i.d);
}

void Vgen::emit(const cvtsi2sdm& i) {

  a.cvtsi2sd(i.s, i.d);
}
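
// If the next block in layout order is the taken target, flip the condition
// and swap the targets so the branch can fall through to it; otherwise the
// jcc is emitted with a placeholder displacement, recorded in `jccs`, and
// patched to the target block's final address in Vgen::patch().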
void Vgen::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a.frontier(), taken});
    a.jcc(i.cc, a.frontier());
  }
  emit(jmp{i.targets[0]});
}

void Vgen::emit(const jcci& i) {
  a.jcc(i.cc, i.taken);

void Vgen::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a.frontier(), i.target});

void Vgen::emit(const jmpi& i) {
  if (a.jmpDeltaFits(i.target)) {

    // can't do a near jmp - use rip-relative addressing
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.jmp(rip[(intptr_t)addr]);

void Vgen::emit(const lea& i) {
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});

void Vgen::emit(const storebi& i) {
  prefix(a, i.m).storeb(i.s, i.m.mr());
}

void Vgen::emit(const storeqi& i) {
  prefix(a, i.m).storeq(i.s, i.m.mr());
}

template<typename Inst>
bool testimHelper(Vgen& env, const Inst& i, uint64_t mask) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  while (mask > 0xff && !(mask & 0xff)) {

  if (mask > 0xff) return false;

  env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
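
  // e.g. testwim{0xff00, ptr} only cares about the second byte, so it can be
  // emitted as testbim{0xff, ptr + 1}; a mask whose set bits span more than
  // one byte (say 0x0180) can't be narrowed and falls back to the wide test.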

void Vgen::emit(const testwim& i) {
  if (testimHelper(*this, i, i.s0.w())) return;

void Vgen::emit(const testlim& i) {
  if (testimHelper(*this, i, i.s0.l())) return;

void Vgen::emit(const testli& i) {
  if (i.s0.l() == -1) {
    return emit(testl{i.s1, i.s1, i.sf});

void Vgen::emit(const testqi& i) {
  auto const imm = i.s0.q();
  if (magFits(imm, sz::byte)) {
    a.testb(int8_t(imm), rbyte(i.s1));
  } else if (magFits(imm, sz::dword)) {
    emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
  } else if (imm == -1) {
    emit(testq{i.s1, i.s1, i.sf});

void Vgen::emit(const testqim& i) {
  if (testimHelper(*this, i, i.s0.q())) return;
  if (magFits(i.s0.q(), sz::dword)) {
    // For an unsigned 32 bit immediate, we can get the same results
    // by emitting a testlim.
    emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});

void Vgen::emit(xorq i) {

    // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
    return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});

///////////////////////////////////////////////////////////////////////////////
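
// lower_impl rewrites the instruction at position `i` of block `b`: vmodify
// runs the callback with a Vout positioned there, and the returned count (1)
// is the number of original instructions replaced by whatever was emitted.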
template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
  vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}

template<typename Inst>
void lower(Vunit& unit, Inst& inst, Vlabel b, size_t i) {}

///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {

void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {

void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {

void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {

///////////////////////////////////////////////////////////////////////////////
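
// Stub frames are two quadwords: a (possibly unused) saved-FP slot plus the
// return address. stublogue either pushes rvmfp() or just moves rsp down one
// slot, and stubunwind/stubtophp discard both slots with a single 16-byte
// rsp adjustment.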
void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) {
  if (inst.saveframe) {
    unit.blocks[b].code[i] = push{rvmfp()};
  } else {
    unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp};
  }
}

void lower(Vunit& unit, stubunwind& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, stubtophp& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp};
}

void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = load{reg::rsp[8], inst.d};
}

void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]};
}

void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << callr{inst.target, inst.args};
    v << jmpi{inst.exittc};
  });
}

///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << sarq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rcx};
    v << shlq{inst.s1, inst.d, inst.sf};
  });
}

void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rdx, inst.d};
  });
}

void lower(Vunit& unit, divint& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << copy{inst.s0, rax};
    v << cqo{};                      // sign-extend rax => rdx:rax
    v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
    v << copy{rax, inst.d};
  });
}

///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}
void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) {
  unit.blocks[b].code[i] = copy{inst.s, inst.d};
}

///////////////////////////////////////////////////////////////////////////////

/*
 * Lower a few abstractions to facilitate straightforward x64 codegen.
 */
void lowerForX64(Vunit& unit) {
  vasm_lower(unit, [&] (const VLS& env, Vinstr& inst, Vlabel b, size_t i) {
#define O(name, ...)                        \
      case Vinstr::name:                    \
        lower(unit, inst.name##_, b, i);    \

///////////////////////////////////////////////////////////////////////////////

void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) {
  Timer timer(Timer::vasm_optimize, unit.log_entry);

  removeTrivialNops(unit);

  optimizeExits(unit);

  assertx(checkWidths(unit));

  if (unit.context && isProfiling(unit.context->kind) && abi.canSpill &&
      RuntimeOption::EvalProfBranchSampleFreq > 0) {
    // Even when branch profiling is on, we still only want to profile
    // non-profiling translations of PHP functions. We also require that we
    // can spill, so that we can generate arbitrary profiling code, and also to
    // ensure we don't profile unique stubs and such.
    profile_branches(unit);

  if (!unit.constToReg.empty()) {
    foldImms<x64::ImmFolder>(unit);
  }

  optimizeCopies(unit, abi);

  if (unit.needsRegAlloc()) {
    removeDeadCode(unit);
    if (regalloc) allocateRegisters(unit, abi);
  }
  if (unit.blocks.size() > 1) {

void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups,
             AsmInfo* asmInfo) {
  vasm_emit<Vgen>(unit, text, fixups, asmInfo);
}

///////////////////////////////////////////////////////////////////////////////