/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/vm/jit/vasm-emit.h"

#include "hphp/runtime/base/arch.h"
#include "hphp/runtime/vm/jit/back-end-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/code-gen-helpers-x64.h"
#include "hphp/runtime/vm/jit/code-gen.h"
#include "hphp/runtime/vm/jit/func-prologues-x64.h"
#include "hphp/runtime/vm/jit/mc-generator.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/prof-data.h"
#include "hphp/runtime/vm/jit/service-requests-inline.h"
#include "hphp/runtime/vm/jit/target-cache.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"
namespace HPHP { namespace jit {
///////////////////////////////////////////////////////////////////////////////

namespace x64 { struct ImmFolder; }

///////////////////////////////////////////////////////////////////////////////
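
// Vgen emits machine code for one Vunit: it walks the blocks in layout
// order, translates each Vinstr via the emit() overloads below, and records
// the patch points (jccs, jmps, calls, catches, ldpoints) that are resolved
// once every block's start address is known.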
struct Vgen {
  Vgen(const Vunit& u, Vasm::AreaList& areas, AsmInfo* asmInfo)
    : unit(u)
    , backend(mcg->backEnd())
    , areas(areas)
    , m_asmInfo(asmInfo) {
    addrs.resize(u.blocks.size());
    points.resize(u.next_point);
  }

  void emit(jit::vector<Vlabel>&);

  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }

  // intrinsics
  void emit(const bindaddr& i);
  void emit(const bindcall& i);
  void emit(const bindjcc1st& i);
  void emit(const bindjcc& i);
  void emit(const bindjmp& i);
  void emit(const callstub& i);
  void emit(const contenter& i);
  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& i) { a->int3(); }
  void emit(const fallthru& i) {}
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const ldimmqs& i);
  void emit(const fallback& i);
  void emit(const fallbackcc& i);
  void emit(const load& i);
  void emit(const mccall& i);
  void emit(const mcprep& i);
  void emit(const nothrow& i);
  void emit(const store& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);
  void emit(const landingpad& i) {}
  void emit(const vretm& i);
  void emit(const vret& i);

  // instructions
  void emit(andb i) { commuteSF(i); a->andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a->andb(i.s0, i.d); }
  void emit(const andbim& i) { a->andb(i.s, i.m); }
  void emit(andl i) { commuteSF(i); a->andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a->andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a->andq(i.s0, i.d); }
  void emit(andqi i) { binary(i); a->andq(i.s0, i.d); }
  void emit(addli i) { binary(i); a->addl(i.s0, i.d); }
  void emit(const addlm& i) { a->addl(i.s0, i.m); }
  void emit(addq i) { commuteSF(i); a->addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a->addq(i.s0, i.d); }
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a->addsd(i.s0, i.d); }
  void emit(const call& i);
  void emit(const callm& i) { a->call(i.target); }
  void emit(const callr& i) { a->call(i.target); }
  void emit(const cloadq& i);
  void emit(const cmovq& i);
  void emit(const cmpb& i) { a->cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a->cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a->cmpb(i.s0, i.s1); }
  void emit(const cmpl& i) { a->cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a->cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a->cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a->cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a->cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a->cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a->cmpq(i.s0, i.s1); }
  void emit(const cmpqims& i);
  void emit(const cmpqm& i) { a->cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a->cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& i) { a->cqo(); }
  void emit(const cvttsd2siq& i) { a->cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a->decl(i.d); }
  void emit(const declm& i) { a->decl(i.m); }
  void emit(decq i) { unary(i); a->decq(i.d); }
  void emit(const decqm& i) { a->decq(i.m); }
  void emit(divsd i) { noncommute(i); a->divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a->imul(i.s0, i.d); }
  void emit(const idiv& i) { a->idiv(i.s); }
  void emit(incl i) { unary(i); a->incl(i.d); }
  void emit(const inclm& i) { a->incl(i.m); }
  void emit(incq i) { unary(i); a->incq(i.d); }
  void emit(const incqm& i) { a->incq(i.m); }
  void emit(const incqmlock& i) { a->lock(); a->incq(i.m); }
  void emit(const incwm& i) { a->incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a->jmp(i.target); }
  void emit(const jmpm& i) { a->jmp(i.target); }
  void emit(const jmpi& i) { a->jmp(i.target); }
  void emit(const lea& i);
  void emit(const leap& i) { a->lea(i.s, i.d); }
  void emit(const loadups& i) { a->movups(i.s, i.d); }
  void emit(const loadtqb& i) { a->loadb(i.s, i.d); }
  void emit(const loadl& i) { a->loadl(i.s, i.d); }
  void emit(const loadqp& i) { a->loadq(i.s, i.d); }
  void emit(const loadsd& i) { a->movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a->loadzbl(i.s, i.d); }
  void emit(const loadzbq& i) { a->loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a->loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a->movb(i.s, i.d); }
  void emit(const movl& i) { a->movl(i.s, i.d); }
  void emit(const movzbl& i) { a->movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a->movzbl(i.s, Reg32(i.d)); }
  void emit(mulsd i) { commute(i); a->mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a->neg(i.d); }
  void emit(const nop& i) { a->nop(); }
  void emit(not i) { unary(i); a->not(i.d); }
  void emit(notb i) { unary(i); a->notb(i.d); }
  void emit(const orwim& i) { a->orw(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a->orq(i.s0, i.d); }
  void emit(orqi i) { binary(i); a->orq(i.s0, i.d); }
  void emit(const orqim& i) { a->orq(i.s0, i.m); }
  void emit(const pop& i) { a->pop(i.d); }
  void emit(const popm& i) { a->pop(i.d); }
  void emit(psllq i) { binary(i); a->psllq(i.s0, i.d); }
  void emit(psrlq i) { binary(i); a->psrlq(i.s0, i.d); }
  void emit(const push& i) { a->push(i.s); }
  void emit(const roundsd& i) { a->roundsd(i.dir, i.s, i.d); }
  void emit(const ret& i) { a->ret(); }
  void emit(sarq i) { unary(i); a->sarq(i.d); }
  void emit(sarqi i) { binary(i); a->sarq(i.s0, i.d); }
  void emit(const setcc& i) { a->setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a->shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a->shlq(i.d); }
  void emit(shlqi i) { binary(i); a->shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a->shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a->shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a->sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a->movups(i.s, i.m); }
  void emit(const storeb& i) { a->storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a->storel(i.s, i.m); }
  void emit(const storeli& i) { a->storel(i.s, i.m); }
  void emit(const storeqi& i) { a->storeq(i.s, i.m); }
  void emit(const storesd& i) { a->movsd(i.s, i.m); }
  void emit(const storew& i) { a->storew(i.s, i.m); }
  void emit(const storewi& i) { a->storew(i.s, i.m); }
  void emit(subbi i) { binary(i); a->subb(i.s0, i.d); }
  void emit(subl i) { noncommute(i); a->subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a->subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a->subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a->subq(i.s0, i.d); }
  void emit(subsd i) { noncommute(i); a->subsd(i.s0, i.d); }
  void emit(const testb& i) { a->testb(i.s0, i.s1); }
  void emit(const testbi& i) { a->testb(i.s0, i.s1); }
  void emit(const testbim& i) { a->testb(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a->testl(i.s0, i.s1); }
  void emit(const testli& i) { a->testl(i.s0, i.s1); }
  void emit(const testlim& i);
  void emit(const testq& i) { a->testq(i.s0, i.s1); }
  void emit(const testqm& i) { a->testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const ucomisd& i) { a->ucomisd(i.s0, i.s1); }
  void emit(const ud2& i) { a->ud2(); }
  void emit(unpcklpd i) { noncommute(i); a->unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a->xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a->xorb(i.s0, i.d); }
  void emit(xorq i) { commuteSF(i); a->xorq(i.s0, i.d); }
  void emit(xorqi i) { binary(i); a->xorq(i.s0, i.d); }

  void prep(Reg8 s, Reg8 d) { if (s != d) a->movb(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a->movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a->movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a->movdqa(s, d); }

  CodeAddress start(Vlabel b) {
    auto area = unit.blocks[b].area;
    return areas[(int)area].start;
  }

  CodeBlock& main() { return area(AreaIndex::Main).code; }
  CodeBlock& cold() { return area(AreaIndex::Cold).code; }
  CodeBlock& frozen() { return area(AreaIndex::Frozen).code; }

  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);
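
  // Example: a three-operand vasm op such as andq{s0, s1, d} is emitted in
  // x64's two-operand form: commuteSF/binary first arrange for s1 to be in
  // d (copying if needed), then the op is issued as "andq s0, d".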

  Vasm::Area& area(AreaIndex i) {
    assertx((unsigned)i < areas.size());
    return areas[(unsigned)i];
  }

  struct LabelPatch { CodeAddress instr; Vlabel target; };
  struct PointPatch { CodeAddress instr; Vpoint pos; };

  const Vunit& unit;
  BackEnd& backend;
  Vasm::AreaList& areas;
  AsmInfo* m_asmInfo;
  X64Assembler* a;
  Vlabel current{0}, next{0}; // in linear order
  jit::vector<CodeAddress> addrs, points;
  jit::vector<LabelPatch> jccs, jmps, calls, catches;
  jit::vector<PointPatch> ldpoints;
  jit::hash_map<uint64_t,uint64_t*> constants;
};

// prepare a binary op that is not commutative. s0 must be a different
// register than s1 so we don't clobber it.
template<class Inst> void Vgen::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
  binary(i);
}

// prepare a binary op that is commutative. Swap operands if the dest is s0.
template<class Inst> void Vgen::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};
  } else {
    binary(i);
  }
}

// same as commuteSF, but for ops that don't write status flags.
template<class Inst> void Vgen::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};
  } else {
    binary(i);
  }
}
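
// For example, subq{s0, s1, d} with s0 == d cannot be emitted as
// "mov s1, d; subq s0, d" without first clobbering s0; noncommute asserts
// that pattern away, while commute/commuteSF simply swap s0 and s1 when the
// operation is symmetric.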

///////////////////////////////////////////////////////////////////////////////

void emitSimdImm(X64Assembler* a, int64_t val, Vreg d) {
  if (val == 0) {
    a->pxor(d, d); // does not modify flags
  } else {
    auto addr = mcg->allocLiteral(val);
    a->movsd(rip[(intptr_t)addr], d);
  }
}

///////////////////////////////////////////////////////////////////////////////

void Vgen::emit(const addqim& i) {
  if (i.m.seg == Vptr::FS) a->fs();
  a->addq(i.s0, i.m.mr());
}

void Vgen::emit(const call& i) {
  // warning: this is a copy of emitCall(TCA) in code-gen-helpers-x64.cpp
  if (a->jmpDeltaFits(i.target)) {
    a->call(i.target);
  } else {
    // can't do a near call; store address in data section.
    // call by loading the address using rip-relative addressing. This
    // assumes the data section is near the current code section. Since
    // this sequence is directly in-line, rip-relative like this is
    // more compact than loading a 64-bit immediate.
    auto addr = mcg->allocLiteral((uint64_t)i.target);
    a->call(rip[(intptr_t)addr]);
  }
}

void Vgen::emit(const cloadq& i) {
  auto m = i.t;
  always_assert(!m.index.isValid()); // not supported, but could be later.
  if (i.f != i.d) {
    if (i.d == m.base) {
      // We can't move f over d or we'll clobber the Vptr we need to load from.
      // Since cload does the load unconditionally anyway, we can just load and
      // cmov.
      a->loadq(i.t, i.d);
      a->cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
      return;
    }
    a->movq(i.f, i.d);
  }
  a->cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
}

// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
void Vgen::emit(const cmovq& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we don't clobber i.t
    return emit(cmovq{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
  } else {
    prep(i.f, i.d);
  }
  a->cmov_reg64_reg64(i.cc, i.t, i.d);
}

void Vgen::emit(const contenter& i) {
  Label Stub, End;
  Reg64 fp = i.fp, target = i.target;
  a->jmp8(End);

  asm_label(*a, Stub);
  a->pop(fp[AROFF(m_savedRip)]);
  a->jmp(target);

  asm_label(*a, End);
  a->call(Stub);
  // m_savedRip will point here.
  emit(unwind{{i.targets[0], i.targets[1]}});
}

void Vgen::emit(const copy& i) {
  if (i.s == i.d) return;
  if (i.s.isGP()) {
    if (i.d.isGP()) {               // GP => GP
      a->movq(i.s, i.d);
    } else {                        // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register
      a->movq_rx(i.s, i.d);
    }
  } else {
    if (i.d.isGP()) {               // XMM => GP
      a->movq_xr(i.s, i.d);
    } else {                        // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls
      a->movdqa(i.s, i.d);
    }
  }
}

void Vgen::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
  assertx(d0 != d1);
  if (d0 == s1) {
    if (d1 == s0) {
      a->xchgq(d0, d1);
    } else {
      // could do this in a simplify pass
      if (s1 != d1) a->movq(s1, d1); // save s1 first; d1 != s0
      if (s0 != d0) a->movq(s0, d0);
    }
  } else {
    // could do this in a simplify pass
    if (s0 != d0) a->movq(s0, d0);
    if (s1 != d1) a->movq(s1, d1);
  }
}

void Vgen::emit(const bindaddr& i) {
  *i.dest = emitBindAddr(a->code(), frozen(), i.dest, i.sk, i.spOff);
  mcg->setJmpTransID(TCA(i.dest));
}

void Vgen::emit(const bindcall& i) {
  mcg->backEnd().prepareForSmash(a->code(), kCallLen);
  a->call(i.stub);
  emit(unwind{{i.targets[0], i.targets[1]}});
}

void Vgen::emit(const bindjcc1st& i) {
  emitBindJmpccFirst(a->code(), frozen(), i.cc, i.targets[0], i.targets[1],
                     i.spOff);
}

void Vgen::emit(const bindjcc& i) {
  emitBindJ(a->code(), frozen(), i.cc, i.target, i.spOff, i.trflags);
}

void Vgen::emit(const bindjmp& i) {
  emitBindJ(a->code(), frozen(), CC_None, i.target, i.spOff, i.trflags);
}

void Vgen::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

void Vgen::emit(const cmpqims& i) {
  backend.prepareForSmash(a->code(), kCmpLen);
  a->cmpq(i.s0, i.s1);
}

void Vgen::emit(const fallback& i) {
  emit(fallbackcc{CC_None, InvalidReg, i.dest, i.trflags, i.args});
}

void Vgen::emit(const fallbackcc& i) {
  auto const destSR = mcg->tx().getSrcRec(i.dest);
  if (!i.trflags.packed) {
    destSR->emitFallbackJump(a->code(), i.cc);
  } else {
    destSR->emitFallbackJumpCustom(a->code(), frozen(), i.dest, i.trflags);
  }
}

void Vgen::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
  auto val = i.s.ub();
  if (i.d.isGP()) {
    Vreg8 d = i.d;
    if (val == 0 && !i.saveflags) {
      a->xorb(d, d);
    } else {
      a->movb(val, d);
    }
  } else {
    emitSimdImm(a, uint8_t(val), i.d);
  }
}

void Vgen::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
  auto val = i.s.l();
  if (i.d.isGP()) {
    Vreg32 d = i.d;
    if (val == 0 && !i.saveflags) {
      a->xorl(d, d);
    } else {
      a->movl(val, d);
    }
  } else {
    emitSimdImm(a, uint32_t(val), i.d);
  }
}

void Vgen::emit(const ldimmq& i) {
  auto val = i.s.q();
  if (i.d.isGP()) {
    if (val == 0) {
      Reg64 d = i.d;
      if (i.saveflags) {
        a->movl(0, r32(d));
      } else {
        a->xorl(r32(d), r32(d));
      }
    } else {
      a->emitImmReg(i.s, i.d);
    }
  } else {
    emitSimdImm(a, val, i.d);
  }
}

void Vgen::emit(const ldimmqs& i) {
  backend.prepareForSmash(a->code(), kMovLen);
  a->movq(0xdeadbeeffeedface, i.d);

  auto immp = reinterpret_cast<uintptr_t*>(a->frontier()) - 1;
  *immp = i.s.q();
}

void Vgen::emit(const load& i) {
  if (i.s.seg == Vptr::FS) a->fs();
  auto mref = i.s.mr();
  if (i.d.isGP()) {
    a->loadq(mref, i.d);
  } else {
    assertx(i.d.isSIMD());
    a->movsd(mref, i.d);
  }
}

void Vgen::emit(const mccall& i) {
  backend.prepareForSmash(a->code(), kCallLen);
  a->call(i.target);
}

// emit smashable mov as part of method cache callsite
void Vgen::emit(const mcprep& i) {
  /*
   * For the first time through, set the cache to hold the address
   * of the movq (*2 + 1), so we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * handlePrimeCacheMiss can tell it's not been smashed yet.
   */
  emit(ldimmqs{0x8000000000000000u, i.d});

  auto movAddr = reinterpret_cast<uintptr_t>(a->frontier()) - x64::kMovLen;
  auto immAddr = reinterpret_cast<uintptr_t*>(movAddr + x64::kMovImmOff);

  *immAddr = (movAddr << 1) | 1;
  mcg->cgFixups().m_addressImmediates.insert(reinterpret_cast<TCA>(~movAddr));
}
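
// The priming value stored above encodes the movq's own address: the miss
// handler recovers it by shifting right one bit, and the set low bit both
// keeps the placeholder from matching any real Class* and marks the cache
// as not yet smashed.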

void Vgen::emit(const storebi& i) {
  if (i.m.seg == Vptr::FS) a->fs();
  a->storeb(i.s, i.m.mr());
}

void Vgen::emit(const store& i) {
  if (i.s.isGP()) {
    a->storeq(i.s, i.d);
  } else {
    assertx(i.s.isSIMD());
    a->movsd(i.s, i.d);
  }
}

void Vgen::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {} {}\n", a->frontier(),
         i.fix.pcOffset, i.fix.spOffset);
  mcg->recordSyncPoint(a->frontier(), i.fix.pcOffset,
                       i.fix.spOffset);
}

void Vgen::emit(const testwim& i) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  int off = 0;
  uint16_t newMask = i.s0.w();
  while (newMask > 0xff && !(newMask & 0xff)) {
    off++;
    newMask >>= 8;
  }

  if (newMask > 0xff) {
    a->testw(i.s0, i.s1);
  } else {
    emit(testbim{int8_t(newMask), i.s1 + off, i.sf});
  }
}
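
// For example, a mask of 0x0100 has all its meaningful bits in the second
// byte, so it becomes testbim{0x01, s1 + 1}; a mask like 0x0101 spans two
// bytes and keeps the full-width testw.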

void Vgen::emit(const testlim& i) {
  a->testl(i.s0, i.s1);
}

void Vgen::emit(const testqim& i) {
  // The immediate is 32 bits, sign-extended to 64. If the sign bit isn't set,
  // we can get the same results by emitting a testlim.
  if (i.s0.l() < 0) {
    a->testq(i.s0, i.s1);
  } else {
    emit(testlim{i.s0, i.s1, i.sf});
  }
}
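
// For example, 0x7fffffff has the sign bit clear, so a 32-bit testlim sets
// the same flags; 0x80000000 would be sign-extended to 0xffffffff80000000
// and must remain a 64-bit testq.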

void Vgen::emit(const nothrow& i) {
  // register a null catch trace at this position, telling the unwinder that
  // the function call returning to here isn't allowed to throw.
  mcg->registerCatchBlock(a->frontier(), nullptr);
}

void Vgen::emit(const unwind& i) {
  // Unwind instructions terminate blocks with calls that can throw, and have
  // the edges to catch (unwinder) blocks and fall-through blocks.
  catches.push_back({a->frontier(), i.targets[1]});
  emit(jmp{i.targets[0]});
}

void Vgen::emit(const vretm& i) {
  a->push(i.retAddr);
  a->loadq(i.prevFp, i.d);
  a->ret();
}

void Vgen::emit(const vret& i) {
  a->push(i.retAddr);
  a->ret();
}

void Vgen::emit(jit::vector<Vlabel>& labels) {
  // Some structures here track where we put things just for debug printing.
  struct Snippet {
    const IRInstruction* origin;
    TcaRange range;
  };
  struct BlockInfo {
    jit::vector<Snippet> snippets;
  };

  // This is under the printir tracemod because it mostly shows you IR and
  // machine code, not vasm and machine code (not implemented).
  bool shouldUpdateAsmInfo = !!m_asmInfo;

  std::vector<TransBCMapping>* bcmap = nullptr;
  if (mcg->tx().isTransDBEnabled() || RuntimeOption::EvalJitUseVtuneAPI) {
    bcmap = &mcg->cgFixups().m_bcMap;
  }

  jit::vector<jit::vector<BlockInfo>> areaToBlockInfos;
  if (shouldUpdateAsmInfo) {
    areaToBlockInfos.resize(areas.size());
    for (auto& r : areaToBlockInfos) {
      r.resize(unit.blocks.size());
    }
  }

  for (int i = 0, n = labels.size(); i < n; ++i) {
    assertx(checkBlockEnd(unit, labels[i]));

    auto b = labels[i];
    auto& block = unit.blocks[b];
    X64Assembler as { area(block.area).code };
    a = &as;
    auto blockStart = a->frontier();
    addrs[b] = blockStart;

    // Compute the next block we will emit into the current area.
    auto cur_start = start(labels[i]);
    auto j = i + 1;
    while (j < labels.size() && cur_start != start(labels[j])) {
      j++;
    }
    next = j < labels.size() ? labels[j] : Vlabel(unit.blocks.size());
    current = b;

    const IRInstruction* currentOrigin = nullptr;
    auto blockInfo = shouldUpdateAsmInfo
      ? &areaToBlockInfos[unsigned(block.area)][b]
      : nullptr;

    auto start_snippet = [&](const Vinstr& inst) {
      if (!shouldUpdateAsmInfo) return;

      blockInfo->snippets.push_back(
        Snippet { inst.origin, TcaRange { a->code().frontier(), nullptr } }
      );
    };
= [&] {
696 if (!shouldUpdateAsmInfo
) return;
698 if (!blockInfo
->snippets
.empty()) {
699 auto& snip
= blockInfo
->snippets
.back();
700 snip
.range
= TcaRange
{ snip
.range
.start(), a
->code().frontier() };

    for (auto& inst : block.code) {
      if (currentOrigin != inst.origin) {
        finish_snippet();
        start_snippet(inst);
        currentOrigin = inst.origin;
      }

      if (bcmap && inst.origin) {
        auto sk = inst.origin->marker().sk();
        if (bcmap->empty() ||
            bcmap->back().md5 != sk.unit()->md5() ||
            bcmap->back().bcStart != sk.offset()) {
          bcmap->push_back(TransBCMapping{sk.unit()->md5(), sk.offset(),
                                          main().frontier(), cold().frontier(),
                                          frozen().frontier()});
        }
      }

      switch (inst.op) {
#define O(name, imms, uses, defs) \
        case Vinstr::name: emit(inst.name##_); break;
        VASM_OPCODES
#undef O
      }
    }

    finish_snippet();
  }

  for (auto& p : jccs) {
    assertx(addrs[p.target]);
    X64Assembler::patchJcc(p.instr, addrs[p.target]);
  }
  for (auto& p : jmps) {
    assertx(addrs[p.target]);
    X64Assembler::patchJmp(p.instr, addrs[p.target]);
  }
  for (auto& p : calls) {
    assertx(addrs[p.target]);
    X64Assembler::patchCall(p.instr, addrs[p.target]);
  }
  for (auto& p : catches) {
    mcg->registerCatchBlock(p.instr, addrs[p.target]);
  }
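
  // Each ldpoint's lea is 7 bytes in its rip-relative form; the final four
  // bytes are the disp32 field, which the loop below patches in place.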
  for (auto& p : ldpoints) {
    auto after_lea = p.instr + 7;
    auto d = points[p.pos] - after_lea;
    assertx(deltaFits(d, sz::dword));
    ((int32_t*)after_lea)[-1] = d;
  }

  if (!shouldUpdateAsmInfo) {
    return;
  }

  for (auto i = 0; i < areas.size(); ++i) {
    auto& blockInfos = areaToBlockInfos[i];
    for (auto const blockID : labels) {
      auto const& blockInfo = blockInfos[static_cast<size_t>(blockID)];
      if (blockInfo.snippets.empty()) continue;

      const IRInstruction* currentOrigin = nullptr;
      for (auto const& snip : blockInfo.snippets) {
        if (currentOrigin != snip.origin && snip.origin) {
          currentOrigin = snip.origin;
        }

        m_asmInfo->updateForInstruction(
          currentOrigin,
          static_cast<AreaIndex>(i),
          snip.range.start(),
          snip.range.end());
      }
    }
  }
}

void Vgen::emit(const cvtsi2sd& i) {
  a->pxor(i.d, i.d);
  a->cvtsi2sd(i.s, i.d);
}

void Vgen::emit(const cvtsi2sdm& i) {
  a->pxor(i.d, i.d);
  a->cvtsi2sd(i.s, i.d);
}

void Vgen::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a->frontier(), taken});
    a->jcc(i.cc, a->frontier());
  }
  emit(jmp{i.targets[0]});
}

void Vgen::emit(const jcci& i) {
  a->jcc(i.cc, i.taken);
  emit(jmp{i.target});
}

void Vgen::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a->frontier(), i.target});
  a->jmp(a->frontier());
}

void Vgen::emit(const lea& i) {
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});
  } else {
    a->lea(i.s, i.d);
  }
}

/*
 * Move all the elements of in into out, replacing count elements of out
 * starting at idx. in will be cleared at the end.
 *
 * Example: vector_splice([1, 2, 3, 4, 5], 2, 1, [10, 11, 12]) will change out
 * to [1, 2, 10, 11, 12, 4, 5].
 */
template<typename V>
void vector_splice(V& out, size_t idx, size_t count, V& in) {
  auto out_size = out.size();

  // Start by making room in out for the new elements.
  out.resize(out.size() + in.size() - count);

  // Move everything after the to-be-overwritten elements to the new end.
  std::move_backward(out.begin() + idx + count, out.begin() + out_size,
                     out.end());

  // Move the new elements in.
  std::move(in.begin(), in.end(), out.begin() + idx);
  in.clear();
}

// Lower svcreq{} by making copies to abi registers explicit, saving
// vm regs, and returning to the VM. svcreq{} is guaranteed to be
// at the end of a block, so we can just keep appending to the same
// block.
& unit
, Vlabel b
, const Vinstr
& inst
) {
851 assertx(unit
.tuples
[inst
.svcreq_
.extraArgs
].size() < kNumServiceReqArgRegs
);
852 auto svcreq
= inst
.svcreq_
; // copy it
853 auto origin
= inst
.origin
;
854 auto& argv
= unit
.tuples
[svcreq
.extraArgs
];
855 unit
.blocks
[b
].code
.pop_back(); // delete the svcreq instruction
856 Vout
v(unit
, b
, origin
);
858 RegSet arg_regs
= svcreq
.args
;
860 for (int i
= 0, n
= argv
.size(); i
< n
; ++i
) {
861 PhysReg d
{serviceReqArgRegs
[i
]};
862 arg_dests
.push_back(d
);
865 v
<< copyargs
{svcreq
.extraArgs
, v
.makeTuple(arg_dests
)};
866 if (svcreq
.stub_block
) {
867 v
<< leap
{rip
[(int64_t)svcreq
.stub_block
], rAsm
};
869 v
<< ldimmq
{0, rAsm
}; // because persist flag
871 v
<< ldimmq
{svcreq
.req
, rdi
};
872 arg_regs
|= rAsm
| rdi
| rVmFp
| rVmSp
;
874 v
<< jmpi
{TCA(handleSRHelper
), arg_regs
};

void lowerSrem(Vunit& unit, Vlabel b, size_t iInst) {
  auto const& inst = unit.blocks[b].code[iInst];
  auto const& srem = inst.srem_;
  auto scratch = unit.makeScratchBlock();
  SCOPE_EXIT { unit.freeScratchBlock(scratch); };
  Vout v(unit, scratch, inst.origin);
  v << copy{srem.s0, rax};
  v << cqo{};                      // sign-extend rax => rdx:rax
  v << idiv{srem.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx
  v << copy{rdx, srem.d};

  vector_splice(unit.blocks[b].code, iInst, 1, unit.blocks[scratch].code);
}
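
// x64's idiv divides the 128-bit value rdx:rax, so lowerSrem routes s0
// through rax and sign-extends it into rdx with cqo before taking the
// remainder from rdx.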

template<typename FromOp, typename ToOp>
void lowerShift(Vunit& unit, Vlabel b, size_t iInst) {
  auto const& inst = unit.blocks[b].code[iInst];
  auto const& shift = inst.get<FromOp>();
  auto scratch = unit.makeScratchBlock();
  SCOPE_EXIT { unit.freeScratchBlock(scratch); };
  Vout v(unit, scratch, inst.origin);
  v << copy{shift.s0, rcx};
  v << ToOp{shift.s1, shift.d, shift.sf};

  vector_splice(unit.blocks[b].code, iInst, 1, unit.blocks[scratch].code);
}
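
// x64 variable-count shifts take the count in %cl, so the lowering pins s0
// to rcx and then emits the concrete two-operand form (e.g. sar{s0, s1, d}
// becomes sarq{s1, d} after the copy).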

void lowerAbsdbl(Vunit& unit, Vlabel b, size_t iInst) {
  auto const& inst = unit.blocks[b].code[iInst];
  auto const& absdbl = inst.absdbl_;
  auto scratch = unit.makeScratchBlock();
  SCOPE_EXIT { unit.freeScratchBlock(scratch); };
  Vout v(unit, scratch, inst.origin);

  // clear the high bit
  auto tmp = v.makeReg();
  v << psllq{1, absdbl.s, tmp};
  v << psrlq{1, tmp, absdbl.d};

  vector_splice(unit.blocks[b].code, iInst, 1, unit.blocks[scratch].code);
}
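
// Shifting left one bit and then logically right one bit clears exactly the
// sign bit, computing fabs(x) as x & 0x7fffffffffffffff without a literal.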

void lowerVcall(Vunit& unit, Vlabel b, size_t iInst) {
  auto& blocks = unit.blocks;
  auto& inst = blocks[b].code[iInst];
  auto const is_vcall = inst.op == Vinstr::vcall;
  auto const vcall = inst.vcall_;
  auto const vinvoke = inst.vinvoke_;

  // Extract all the relevant information from the appropriate instruction.
  auto const is_smashable = !is_vcall && vinvoke.smashable;
  auto const call = is_vcall ? vcall.call : vinvoke.call;
  auto const& vargs = unit.vcallArgs[is_vcall ? vcall.args : vinvoke.args];
  auto const& stkArgs = vargs.stkArgs;
  auto const dests = unit.tuples[is_vcall ? vcall.d : vinvoke.d];
  auto const fixup = is_vcall ? vcall.fixup : vinvoke.fixup;
  auto const destType = is_vcall ? vcall.destType : vinvoke.destType;

  auto scratch = unit.makeScratchBlock();
  SCOPE_EXIT { unit.freeScratchBlock(scratch); };
  Vout v(unit, scratch, inst.origin);
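
  // The native ABI requires rsp to be 16-byte aligned at the call; an odd
  // number of 8-byte stack args would break that, so pre-drop 8 bytes.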
  int32_t const adjust = (stkArgs.size() & 0x1) ? sizeof(uintptr_t) : 0;
  if (adjust) v << subqi{adjust, reg::rsp, reg::rsp, v.makeReg()};

  // Push stack arguments, in reverse order.
  for (int i = stkArgs.size() - 1; i >= 0; --i) v << push{stkArgs[i]};

  // Get the arguments in the proper registers.
  RegSet argRegs;
  auto doArgs = [&](const VregList& srcs, const PhysReg argNames[]) {
    VregList argDests;
    for (size_t i = 0; i < srcs.size(); ++i) {
      auto reg = argNames[i];
      argDests.push_back(reg);
      argRegs |= reg;
    }
    if (argDests.size()) {
      v << copyargs{v.makeTuple(srcs),
                    v.makeTuple(std::move(argDests))};
    }
  };
  doArgs(vargs.args, argNumToRegName);
  doArgs(vargs.simdArgs, argNumToSIMDRegName);

  if (is_smashable) v << mccall{(TCA)call.address(), argRegs};
  else emitCall(v, call, argRegs);

  // Handle fixup and unwind information.
  if (fixup.isValid()) v << syncpoint{fixup};

  if (!is_vcall) {
    auto& targets = vinvoke.targets;
    v << unwind{{targets[0], targets[1]}};

    // Insert an lea fixup for any stack args at the beginning of the catch
    // block.
    if (auto rspOffset = ((stkArgs.size() + 1) & ~1) * sizeof(uintptr_t)) {
      auto& taken = unit.blocks[targets[1]].code;
      assertx(taken.front().op == Vinstr::landingpad ||
              taken.front().op == Vinstr::jmp);
      Vinstr v{lea{rsp[rspOffset], rsp}};
      v.origin = taken.front().origin;
      if (taken.front().op == Vinstr::jmp) {
        taken.insert(taken.begin(), v);
      } else {
        taken.insert(taken.begin() + 1, v);
      }
    }

    // Write out the code so far to the end of b. Remaining code will be
    // emitted to the next block.
    vector_splice(blocks[b].code, iInst, 1, blocks[scratch].code);
  } else if (vcall.nothrow) {
    v << nothrow{};
  }

  // Copy the call result to the destination register(s).
  switch (destType) {
    case DestType::TV: {
      // rax contains m_type and m_aux but we're expecting just the type in
      // the lower bits, so shift the type result register.
      static_assert(offsetof(TypedValue, m_data) == 0, "");
      static_assert(offsetof(TypedValue, m_type) == 8, "");
      if (dests.size() == 2) {
        v << copy2{reg::rax, reg::rdx, dests[0], dests[1]};
      } else {
        // We have cases where we statically know the type but need the value
        // from native call. Even if the type does not really need a register
        // (e.g., InitNull), a Vreg is still allocated in assignRegs(), so the
        // following assertion holds.
        assertx(dests.size() == 1);
        v << copy{reg::rax, dests[0]};
      }
      break;
    }
    case DestType::SIMD: {
      // rax contains m_type and m_aux but we're expecting just the type in
      // the lower bits, so shift the type result register.
      static_assert(offsetof(TypedValue, m_data) == 0, "");
      static_assert(offsetof(TypedValue, m_type) == 8, "");
      assertx(dests.size() == 1);
      pack2(v, reg::rax, reg::rdx, dests[0]);
      break;
    }
    case DestType::SSA:
    case DestType::Byte:
      // copy the single-register result to dests[0]
      assertx(dests.size() == 1);
      assertx(dests[0].isValid());
      v << copy{reg::rax, dests[0]};
      break;
    case DestType::None:
      assertx(dests.empty());
      break;
    case DestType::Dbl:
      // copy the single-register result to dests[0]
      assertx(dests.size() == 1);
      assertx(dests[0].isValid());
      v << copy{reg::xmm0, dests[0]};
      break;
  }

  if (stkArgs.size() > 0) {
    v << addqi{safe_cast<int32_t>(stkArgs.size() * sizeof(uintptr_t)
                                  + adjust),
               reg::rsp, reg::rsp, v.makeReg()};
  }

  // Insert new instructions to the appropriate block.
  if (is_vcall) {
    vector_splice(blocks[b].code, iInst, 1, blocks[scratch].code);
  } else {
    vector_splice(blocks[vinvoke.targets[0]].code, 0, 0,
                  blocks[scratch].code);
  }
}

void lower_vcallstub(Vunit& unit, Vlabel b) {
  auto& code = unit.blocks[b].code;
  // vcallstub can only appear at the end of a block.
  auto const inst = code.back().get<vcallstub>();
  auto const origin = code.back().origin;

  auto argRegs = inst.args;
  auto const& srcs = unit.tuples[inst.extraArgs];
  jit::vector<Vreg> dsts;
  for (int i = 0; i < srcs.size(); ++i) {
    dsts.emplace_back(argNumToRegName[i]);
    argRegs |= argNumToRegName[i];
  }

  code.back() = copyargs{unit.makeTuple(srcs), unit.makeTuple(std::move(dsts))};
  code.emplace_back(callstub{inst.target, argRegs});
  code.back().origin = origin;
  code.emplace_back(unwind{{inst.targets[0], inst.targets[1]}});
  code.back().origin = origin;
}

/*
 * Lower a few abstractions to facilitate straightforward x64 codegen.
 */
void lowerForX64(Vunit& unit, const Abi& abi) {
  Timer _t(Timer::vasm_lower);

  // This pass relies on having no critical edges in the unit.
  splitCriticalEdges(unit);

  // Scratch block can change blocks allocation, hence cannot use regular
  // iterators.
  auto& blocks = unit.blocks;

  PostorderWalker{unit}.dfs([&](Vlabel ib) {
    assertx(!blocks[ib].code.empty());
    auto& back = blocks[ib].code.back();
    if (back.op == Vinstr::svcreq) {
      lower_svcreq(unit, Vlabel{ib}, blocks[ib].code.back());
    } else if (back.op == Vinstr::vcallstub) {
      lower_vcallstub(unit, Vlabel{ib});
    }

    for (size_t ii = 0; ii < blocks[ib].code.size(); ++ii) {
      auto& inst = blocks[ib].code[ii];
      switch (inst.op) {
        case Vinstr::vcall:
        case Vinstr::vinvoke:
          lowerVcall(unit, Vlabel{ib}, ii);
          break;

        case Vinstr::srem:
          lowerSrem(unit, Vlabel{ib}, ii);
          break;

        case Vinstr::sar:
          lowerShift<sar, sarq>(unit, Vlabel{ib}, ii);
          break;

        case Vinstr::shl:
          lowerShift<shl, shlq>(unit, Vlabel{ib}, ii);
          break;

        case Vinstr::absdbl:
          lowerAbsdbl(unit, Vlabel{ib}, ii);
          break;

        case Vinstr::defvmsp:
          inst = copy{rVmSp, inst.defvmsp_.d};
          break;

        case Vinstr::syncvmsp:
          inst = copy{inst.syncvmsp_.s, rVmSp};
          break;

        case Vinstr::movtqb:
          inst = copy{inst.movtqb_.s, inst.movtqb_.d};
          break;

        case Vinstr::movtql:
          inst = copy{inst.movtql_.s, inst.movtql_.d};
          break;

        case Vinstr::countbytecode:
          inst = incqm{inst.countbytecode_.base[g_bytecodesVasm.handle()],
                       inst.countbytecode_.sf};
          break;

        default:
          break;
      }
    }
  });

  printUnit(kVasmLowerLevel, "after lower for X64", unit);
}

///////////////////////////////////////////////////////////////////////////////

void optimizeX64(Vunit& unit, const Abi& abi) {
  Timer timer(Timer::vasm_optimize);

  removeTrivialNops(unit);
  optimizeExits(unit);

  lowerForX64(unit, abi);

  if (!unit.constants.empty()) {
    foldImms<x64::ImmFolder>(unit);
  }
  {
    Timer timer(Timer::vasm_copy);
    optimizeCopies(unit, abi);
  }
  if (unit.needsRegAlloc()) {
    Timer timer(Timer::vasm_xls);
    removeDeadCode(unit);
    allocateRegisters(unit, abi);
  }
  if (unit.blocks.size() > 1) {
    Timer timer(Timer::vasm_jumps);
    optimizeJmps(unit);
  }
}

void emitX64(const Vunit& unit, Vasm::AreaList& areas, AsmInfo* asmInfo) {
  static thread_local bool busy;
  always_assert(!busy);
  busy = true;
  SCOPE_EXIT { busy = false; };

  Timer timer(Timer::vasm_gen);
  auto blocks = layoutBlocks(unit);
  Vgen(unit, areas, asmInfo).emit(blocks);
}
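
// The thread-local busy flag asserts that emitX64 is never re-entered while
// a unit is still being emitted on the same thread.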

///////////////////////////////////////////////////////////////////////////////

}}