/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#include "hphp/runtime/vm/jit/back-end-x64.h"

#include "hphp/util/asm-x64.h"
#include "hphp/util/disasm.h"
#include "hphp/util/text-color.h"

#include "hphp/runtime/vm/jit/abi-x64.h"
#include "hphp/runtime/vm/jit/block.h"
#include "hphp/runtime/vm/jit/check.h"
#include "hphp/runtime/vm/jit/code-gen-helpers-x64.h"
#include "hphp/runtime/vm/jit/code-gen-x64.h"
#include "hphp/runtime/vm/jit/func-prologues-x64.h"
#include "hphp/runtime/vm/jit/cfg.h"
#include "hphp/runtime/vm/jit/mc-generator.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/reg-alloc-x64.h"
#include "hphp/runtime/vm/jit/service-requests-inline.h"
#include "hphp/runtime/vm/jit/service-requests-x64.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/unique-stubs-x64.h"
#include "hphp/runtime/vm/jit/unwind-x64.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-llvm.h"

namespace HPHP { namespace jit {

using namespace reg;

extern "C" void enterTCHelper(Cell* vm_sp,
                              ActRec* vm_fp,
                              TCA start,
                              TReqInfo* infoPtr,
                              ActRec* firstAR,
                              void* targetCacheBase);

namespace x64 {

TRACE_SET_MOD(hhir);

struct BackEnd : public jit::BackEnd {
  BackEnd() {}
  ~BackEnd() {}

  Abi abi() override {
    return x64::abi;
  }

  size_t cacheLineSize() override {
    return 64;
  }

  PhysReg rSp() override {
    return PhysReg(reg::rsp);
  }

  PhysReg rVmSp() override {
    return x64::rVmSp;
  }

  PhysReg rVmFp() override {
    return x64::rVmFp;
  }

  PhysReg rVmTl() override {
    return x64::rVmTl;
  }

  bool storesCell(const IRInstruction& inst, uint32_t srcIdx) override {
    return x64::storesCell(inst, srcIdx);
  }

  bool loadsCell(const IRInstruction& inst) override {
    return x64::loadsCell(inst.op());
  }

  /*
   * enterTCHelper does not save callee-saved registers except %rbp. This means
   * when we call it from C++, we have to tell gcc to clobber all the other
   * callee-saved registers.
   */
#define CALLEE_SAVED_BARRIER() \
  asm volatile("" : : : "rbx", "r12", "r13", "r14", "r15");

  /*
   * enterTCHelper is a handwritten assembly function that transfers control in
   * and out of the TC.
   */
  static_assert(x64::rVmSp == rbx &&
                x64::rVmFp == rbp &&
                x64::rVmTl == r12 &&
                x64::rStashedAR == r15,
                "__enterTCHelper needs to be modified to use the correct ABI");
  static_assert(REQ_BIND_CALL == 0x1,
                "Update assembly test for REQ_BIND_CALL in __enterTCHelper");

  void enterTCHelper(TCA start, TReqInfo& info) override {
    // We have to force C++ to spill anything that might be in a callee-saved
    // register (aside from rbp). enterTCHelper does not save them.
    CALLEE_SAVED_BARRIER();
    auto& regs = vmRegsUnsafe();
    jit::enterTCHelper(regs.stack.top(), regs.fp, start,
                       &info, vmFirstAR(), RDS::tl_base);
    CALLEE_SAVED_BARRIER();
  }

  void moveToAlign(CodeBlock& cb,
                   MoveToAlignFlags alignment
                   = MoveToAlignFlags::kJmpTargetAlign) override {
    size_t x64Alignment;

    switch (alignment) {
    case MoveToAlignFlags::kJmpTargetAlign:
      x64Alignment = kJmpTargetAlign;
      break;
    case MoveToAlignFlags::kNonFallthroughAlign:
      x64Alignment = jit::kNonFallthroughAlign;
      break;
    case MoveToAlignFlags::kCacheLineAlign:
      x64Alignment = kCacheLineSize;
      break;
    }
    x64::moveToAlign(cb, x64Alignment);
  }

  UniqueStubs emitUniqueStubs() override {
    return x64::emitUniqueStubs();
  }

  TCA emitServiceReqWork(CodeBlock& cb, TCA start, SRFlags flags,
                         ServiceRequest req,
                         const ServiceReqArgVec& argv) override {
    return x64::emitServiceReqWork(cb, start, flags, req, argv);
  }

  void emitInterpReq(CodeBlock& mainCode, CodeBlock& coldCode,
                     SrcKey sk) override {
    Asm a { mainCode };
    // Add a counter for the translation if requested
    if (RuntimeOption::EvalJitTransCounters) {
      x64::emitTransCounterInc(a);
    }
    a.    jmp(emitServiceReq(coldCode, REQ_INTERPRET, sk.offset()));
  }

  bool funcPrologueHasGuard(TCA prologue, const Func* func) override {
    return x64::funcPrologueHasGuard(prologue, func);
  }

  TCA funcPrologueToGuard(TCA prologue, const Func* func) override {
    return x64::funcPrologueToGuard(prologue, func);
  }

  SrcKey emitFuncPrologue(CodeBlock& mainCode, CodeBlock& coldCode, Func* func,
                          bool funcIsMagic, int nPassed, TCA& start,
                          TCA& aStart) override {
    return funcIsMagic
      ? x64::emitMagicFuncPrologue(func, nPassed, start)
      : x64::emitFuncPrologue(func, nPassed, start);
  }

  TCA emitCallArrayPrologue(Func* func, DVFuncletsVec& dvs) override {
    return x64::emitCallArrayPrologue(func, dvs);
  }

  void funcPrologueSmashGuard(TCA prologue, const Func* func) override {
    x64::funcPrologueSmashGuard(prologue, func);
  }

  void emitIncStat(CodeBlock& cb, intptr_t disp, int n) override {
    X64Assembler a { cb };

    a.    pushf ();
    //    addq $n, [%fs:disp]
    a.    fs().addq(n, baseless(disp));
    a.    popf ();
  }

  void emitTraceCall(CodeBlock& cb, Offset pcOff) override {
    x64::emitTraceCall(cb, pcOff);
  }

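  // A location is "smashable" when every byte of the instruction to be
  // patched sits on a single cache line, so the patch can be written with one
  // aligned store while other threads may be concurrently executing the code.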
  bool isSmashable(Address frontier, int nBytes, int offset = 0) override {
    assert(nBytes <= int(kCacheLineSize));
    uintptr_t iFrontier = uintptr_t(frontier) + offset;
    uintptr_t lastByte = uintptr_t(frontier) + nBytes - 1;
    return (iFrontier & ~kCacheLineMask) == (lastByte & ~kCacheLineMask);
  }

 private:
  void prepareForSmashImpl(CodeBlock& cb, int nBytes, int offset) {
    if (!isSmashable(cb.frontier(), nBytes, offset)) {
      X64Assembler a { cb };
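      // gapSize is the distance from (frontier + offset) to the next
      // cache-line boundary; padding with that many nops puts the bytes that
      // will later be smashed at the start of a fresh cache line.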
      int gapSize = (~(uintptr_t(a.frontier()) + offset) & kCacheLineMask) + 1;
      a.emitNop(gapSize);
      assert(isSmashable(a.frontier(), nBytes, offset));
    }
  }

 public:
  void prepareForSmash(CodeBlock& cb, int nBytes, int offset = 0) override {
    prepareForSmashImpl(cb, nBytes, offset);
    mcg->cgFixups().m_alignFixups.emplace(cb.frontier(),
                                          std::make_pair(nBytes, offset));
  }

  void prepareForTestAndSmash(CodeBlock& cb, int testBytes,
                              TestAndSmashFlags flags) override {
    using namespace x64;
    switch (flags) {
    case TestAndSmashFlags::kAlignJcc:
      prepareForSmash(cb, testBytes + kJmpccLen, testBytes);
      assert(isSmashable(cb.frontier() + testBytes, kJmpccLen));
      break;
    case TestAndSmashFlags::kAlignJccImmediate:
      prepareForSmash(cb,
                      testBytes + kJmpccLen,
                      testBytes + kJmpccLen - kJmpImmBytes);
      assert(isSmashable(cb.frontier() + testBytes, kJmpccLen,
                         kJmpccLen - kJmpImmBytes));
      break;
    case TestAndSmashFlags::kAlignJccAndJmp:
      // Ensure that the entire jcc and the entire jmp are smashable
      // (but we don't need them both to be in the same cache line).
      prepareForSmashImpl(cb, testBytes + kJmpccLen, testBytes);
      prepareForSmashImpl(cb, testBytes + kJmpccLen + kJmpLen,
                          testBytes + kJmpccLen);
      mcg->cgFixups().m_alignFixups.emplace(
        cb.frontier(), std::make_pair(testBytes + kJmpccLen, testBytes));
      mcg->cgFixups().m_alignFixups.emplace(
        cb.frontier(), std::make_pair(testBytes + kJmpccLen + kJmpLen,
                                      testBytes + kJmpccLen));
      assert(isSmashable(cb.frontier() + testBytes, kJmpccLen));
      assert(isSmashable(cb.frontier() + testBytes + kJmpccLen, kJmpLen));
      break;
    }
  }

  bool supportsRelocation() const override {
    return true;
  }

  typedef hphp_hash_set<void*> WideJmpSet;
  struct JmpOutOfRange : std::exception {};

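  /*
   * relocateImpl may shrink branches as it copies code. If a shrunken branch
   * later turns out to be unable to reach its target, that branch is recorded
   * in wideJmps and JmpOutOfRange is thrown; the whole relocation is then
   * retried with those branches kept in their wide form.
   */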
  size_t relocate(RelocationInfo& rel,
                  CodeBlock& destBlock,
                  TCA start, TCA end,
                  CodeGenFixups& fixups,
                  TCA* exitAddr) override {
    WideJmpSet wideJmps;
    while (true) {
      try {
        return relocateImpl(rel, destBlock, start, end,
                            fixups, exitAddr, wideJmps);
      } catch (JmpOutOfRange& j) {
      }
    }
  }

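  /*
   * Copy the instructions in [start, end) into destBlock one at a time,
   * preserving the nop padding recorded for smashable locations, re-encoding
   * rip-relative operands that point outside the range, and noting internal
   * references so they can be rewritten once the final addresses are known.
   */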
  size_t relocateImpl(RelocationInfo& rel,
                      CodeBlock& destBlock,
                      TCA start, TCA end,
                      CodeGenFixups& fixups,
                      TCA* exitAddr,
                      WideJmpSet& wideJmps) {
    TCA src = start;
    size_t range = end - src;
    bool hasInternalRefs = false;
    bool internalRefsNeedUpdating = false;
    TCA destStart = destBlock.frontier();
    size_t asm_count{0};
    TCA jmpDest = nullptr;
    TCA keepNopLow = nullptr;
    TCA keepNopHigh = nullptr;
    try {
      while (src != end) {
        assert(src < end);
        DecodedInstruction di(src);
        asm_count++;

        int destRange = 0;
        auto af = fixups.m_alignFixups.equal_range(src);
        while (af.first != af.second) {
          auto low = src + af.first->second.second;
          auto hi = src + af.first->second.first;
          assert(low < hi);
          if (!keepNopLow || keepNopLow > low) keepNopLow = low;
          if (!keepNopHigh || keepNopHigh < hi) keepNopHigh = hi;
          TCA tmp = destBlock.frontier();
          prepareForSmashImpl(destBlock,
                              af.first->second.first, af.first->second.second);
          if (destBlock.frontier() != tmp) {
            destRange += destBlock.frontier() - tmp;
            internalRefsNeedUpdating = true;
          }
          ++af.first;
        }

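        // While src is inside a region recorded for smashing
        // ([keepNopLow, keepNopHigh)), copy it byte for byte: nops must not
        // be dropped and branches must not be resized, or the smashable
        // sequence could end up straddling a cache line in the destination.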
        bool preserveAlignment = keepNopLow && keepNopHigh &&
          keepNopLow <= src && keepNopHigh > src;
        TCA target = nullptr;
        TCA dest = destBlock.frontier();
        destBlock.bytes(di.size(), src);
        DecodedInstruction d2(dest);
        if (di.hasPicOffset()) {
          if (di.isBranch(false)) {
            target = di.picAddress();
          }
          /*
           * Rip-relative offsets that point outside the range
           * being moved need to be adjusted so they continue
           * to point at the right thing
           */
          if (size_t(di.picAddress() - start) >= range) {
            bool DEBUG_ONLY success = d2.setPicAddress(di.picAddress());
            assert(success);
          } else {
            if (!preserveAlignment && d2.isBranch()) {
              if (wideJmps.count(src)) {
                if (d2.size() < kJmpLen) {
                  d2.widenBranch();
                  internalRefsNeedUpdating = true;
                }
              } else if (d2.shrinkBranch()) {
                internalRefsNeedUpdating = true;
              }
            }
            hasInternalRefs = true;
          }
        }
        if (di.hasImmediate()) {
          if (fixups.m_addressImmediates.count(src)) {
            if (size_t(di.immediate() - (uint64_t)start) < range) {
              hasInternalRefs = internalRefsNeedUpdating = true;
            }
          } else {
            if (fixups.m_addressImmediates.count((TCA)~uintptr_t(src))) {
              // Handle the weird, encoded offset used by cgLdObjMethod
              always_assert(di.immediate() == ((uintptr_t(src) << 1) | 1));
              bool DEBUG_ONLY success =
                d2.setImmediate(((uintptr_t)dest << 1) | 1);
              assert(success);
            }
            /*
             * An immediate that points into the range being moved, but which
             * isn't tagged as an addressImmediate, is most likely a bug
             * and its instruction's address needs to be put into
             * fixups.m_addressImmediates. But it could just happen by bad
             * luck, so just log it.
             */
            if (size_t(di.immediate() - (uint64_t)start) < range) {
              FTRACE(3,
                     "relocate: instruction at {} has immediate 0x{:x} "
                     "which looks like an address that needs relocating\n",
                     src, di.immediate());
            }
          }
        }

        if (src == start) {
          // For the start of the range, we only want to overwrite the "after"
          // address (since the "before" address could belong to the previous
          // tracelet, which could be being relocated to a completely different
          // address). recordRange will do that for us, so just make sure we
          // have the right address setup.
          destStart = dest;
        } else {
          rel.recordAddress(src, dest - destRange, destRange);
        }
        if (preserveAlignment && di.size() == kJmpLen &&
            di.isNop() && src + kJmpLen == end) {
          smashJmp(dest, src + kJmpLen);
          dest += kJmpLen;
        } else if (di.isNop() && !preserveAlignment) {
          internalRefsNeedUpdating = true;
        } else {
          dest += d2.size();
        }
        jmpDest = target;
        assert(dest <= destBlock.frontier());
        destBlock.setFrontier(dest);
        src += di.size();
        if (keepNopHigh && src >= keepNopHigh) {
          keepNopLow = keepNopHigh = nullptr;
        }
      }

      if (exitAddr) {
        *exitAddr = jmpDest;
      }

      rel.recordRange(start, end, destStart, destBlock.frontier());

      if (hasInternalRefs && internalRefsNeedUpdating) {
        src = start;
        bool ok = true;
        while (src != end) {
          DecodedInstruction di(src);
          TCA newPicAddress = nullptr;
          int64_t newImmediate = 0;
          if (di.hasPicOffset() &&
              size_t(di.picAddress() - start) < range) {
            newPicAddress = rel.adjustedAddressAfter(di.picAddress());
            always_assert(newPicAddress);
          }
          if (di.hasImmediate() &&
              size_t((TCA)di.immediate() - start) < range &&
              fixups.m_addressImmediates.count(src)) {
            newImmediate =
              (int64_t)rel.adjustedAddressAfter((TCA)di.immediate());
            always_assert(newImmediate);
          }
          if (newImmediate || newPicAddress) {
            TCA dest = rel.adjustedAddressAfter(src);
            DecodedInstruction d2(dest);
            if (newPicAddress) {
              if (!d2.setPicAddress(newPicAddress)) {
                always_assert(d2.isBranch() && d2.size() == 2);
                wideJmps.insert(src);
                ok = false;
              }
            }
            if (newImmediate) {
              if (!d2.setImmediate(newImmediate)) {
                always_assert(false);
              }
            }
          }
          src += di.size();
        }
        if (!ok) {
          throw JmpOutOfRange();
        }
      }
      rel.markAddressImmediates(fixups.m_addressImmediates);
    } catch (...) {
      rel.rewind(start, end);
      destBlock.setFrontier(destStart);
      throw;
    }

    return asm_count;
  }

  template <typename T>
  void fixupStateVector(StateVector<T, TcaRange>& sv, RelocationInfo& rel) {
    for (auto& ii : sv) {
      if (!ii.empty()) {
        /*
         * We have to be careful with before/after here.
         * If we relocate two consecutive regions of memory,
         * but relocate them to two different destinations, then
         * the end address of the first region is also the start
         * address of the second region; so adjustedAddressBefore(end)
         * gives us the relocated address of the end of the first
         * region, while adjustedAddressAfter(end) gives us the
         * relocated address of the start of the second region.
         */
        auto s = rel.adjustedAddressAfter(ii.begin());
        auto e = rel.adjustedAddressBefore(ii.end());
        if (e || s) {
          if (!s) s = ii.begin();
          if (!e) e = ii.end();
          ii = TcaRange(s, e);
        }
      }
    }
  }

  void adjustForRelocation(RelocationInfo& rel) override {
    for (const auto& range : rel.srcRanges()) {
      adjustForRelocation(rel, range.first, range.second);
    }
  }

  void adjustForRelocation(RelocationInfo& rel,
                           TCA srcStart, TCA srcEnd) override {
    auto start = rel.adjustedAddressAfter(srcStart);
    auto end = rel.adjustedAddressBefore(srcEnd);
    if (!start) {
      start = srcStart;
      end = srcEnd;
    } else {
      always_assert(end);
    }
    while (start != end) {
      assert(start < end);
      DecodedInstruction di(start);

      if (di.hasPicOffset()) {
        /*
         * A pointer into something that has been relocated needs to be
         * updated.
         */
        if (TCA adjusted = rel.adjustedAddressAfter(di.picAddress())) {
          di.setPicAddress(adjusted);
        }
      }

      if (di.hasImmediate()) {
        /*
         * Similarly for addressImmediates - and see comment above
         * for non-address immediates.
         */
        if (TCA adjusted = rel.adjustedAddressAfter((TCA)di.immediate())) {
          if (rel.isAddressImmediate(start)) {
            di.setImmediate((int64_t)adjusted);
          } else {
            FTRACE(3,
                   "relocate: instruction at {} has immediate 0x{:x} "
                   "which looks like an address that needs relocating\n",
                   start, di.immediate());
          }
        }
      }

      start += di.size();

      if (start == end && di.isNop() &&
          di.size() == kJmpLen &&
          rel.adjustedAddressAfter(srcEnd)) {
        smashJmp(start - di.size(), rel.adjustedAddressAfter(end));
      }
    }
  }

  /*
   * Adjusts the addresses in asmInfo and fixups to match the new
   * location of the code.
   * This will not "hook up" the relocated code in any way, so it is safe
   * to call before the relocated code is ready to run.
   */
  void adjustMetaDataForRelocation(RelocationInfo& rel,
                                   AsmInfo* asmInfo,
                                   CodeGenFixups& fixups) override {
    auto& ip = fixups.m_inProgressTailJumps;
    for (size_t i = 0; i < ip.size(); ++i) {
      IncomingBranch& ib = const_cast<IncomingBranch&>(ip[i]);
      TCA adjusted = rel.adjustedAddressAfter(ib.toSmash());
      always_assert(adjusted);
      ib.adjust(adjusted);
    }

    for (auto& fixup : fixups.m_pendingFixups) {
      /*
       * Pending fixups always point after the call instruction,
       * so use the "before" address, since there may be nops
       * before the next actual instruction.
       */
      if (TCA adjusted = rel.adjustedAddressBefore(fixup.m_tca)) {
        fixup.m_tca = adjusted;
      }
    }

    for (auto& ct : fixups.m_pendingCatchTraces) {
      /*
       * Similar to fixups - this is a return address so get
       * the address returned to.
       */
      if (CTCA adjusted = rel.adjustedAddressBefore(ct.first)) {
        ct.first = adjusted;
      }
      /*
       * But the target is an instruction, so skip over any nops
       * that might have been inserted (eg for alignment).
       */
      if (TCA adjusted = rel.adjustedAddressAfter(ct.second)) {
        ct.second = adjusted;
      }
    }

    for (auto& jt : fixups.m_pendingJmpTransIDs) {
      if (TCA adjusted = rel.adjustedAddressAfter(jt.first)) {
        jt.first = adjusted;
      }
    }

    /*
     * Most of the time we want to adjust to a corresponding "before" address
     * with the exception of the start of the range where "before" can point to
     * the end of a previous range.
     */
    if (!fixups.m_bcMap.empty()) {
      auto const aStart = fixups.m_bcMap[0].aStart;
      auto const acoldStart = fixups.m_bcMap[0].acoldStart;
      auto const afrozenStart = fixups.m_bcMap[0].afrozenStart;
      for (auto& tbc : fixups.m_bcMap) {
        if (TCA adjusted = (tbc.aStart == aStart
                              ? rel.adjustedAddressAfter(aStart)
                              : rel.adjustedAddressBefore(tbc.aStart))) {
          tbc.aStart = adjusted;
        }
        if (TCA adjusted = (tbc.acoldStart == acoldStart
                              ? rel.adjustedAddressAfter(acoldStart)
                              : rel.adjustedAddressBefore(tbc.acoldStart))) {
          tbc.acoldStart = adjusted;
        }
        if (TCA adjusted = (tbc.afrozenStart == afrozenStart
                              ? rel.adjustedAddressAfter(afrozenStart)
                              : rel.adjustedAddressBefore(tbc.afrozenStart))) {
          tbc.afrozenStart = adjusted;
        }
      }
    }

    decltype(fixups.m_addressImmediates) updatedAI;
    for (auto addrImm : fixups.m_addressImmediates) {
      if (TCA adjusted = rel.adjustedAddressAfter(addrImm)) {
        updatedAI.insert(adjusted);
      } else if (TCA odd = rel.adjustedAddressAfter((TCA)~uintptr_t(addrImm))) {
        // just for cgLdObjMethod
        updatedAI.insert((TCA)~uintptr_t(odd));
      } else {
        updatedAI.insert(addrImm);
      }
    }
    updatedAI.swap(fixups.m_addressImmediates);

    decltype(fixups.m_alignFixups) updatedAF;
    for (auto af : fixups.m_alignFixups) {
      if (TCA adjusted = rel.adjustedAddressAfter(af.first)) {
        updatedAF.emplace(adjusted, af.second);
      } else {
        updatedAF.emplace(af);
      }
    }
    updatedAF.swap(fixups.m_alignFixups);

    if (asmInfo) {
      fixupStateVector(asmInfo->asmInstRanges, rel);
      fixupStateVector(asmInfo->asmBlockRanges, rel);
      fixupStateVector(asmInfo->coldInstRanges, rel);
      fixupStateVector(asmInfo->coldBlockRanges, rel);
      fixupStateVector(asmInfo->frozenInstRanges, rel);
      fixupStateVector(asmInfo->frozenBlockRanges, rel);
    }
  }

  void adjustCodeForRelocation(RelocationInfo& rel,
                               CodeGenFixups& fixups) override {
    for (auto addr : fixups.m_reusedStubs) {
      /*
       * The stubs are terminated by a ud2. Check for it.
       */
      while (addr[0] != 0x0f || addr[1] != 0x0b) {
        DecodedInstruction di(addr);
        if (di.hasPicOffset()) {
          if (TCA adjusted = rel.adjustedAddressAfter(di.picAddress())) {
            di.setPicAddress(adjusted);
          }
        }
        addr += di.size();
      }
    }

    for (auto codePtr : fixups.m_codePointers) {
      if (TCA adjusted = rel.adjustedAddressAfter(*codePtr)) {
        *codePtr = adjusted;
      }
    }
  }

 private:
  void smashJmpOrCall(TCA addr, TCA dest, bool isCall) {
    // Unconditional rip-relative jmps can also be encoded with an EB as the
    // first byte, but that means the delta is 1 byte, and we shouldn't be
    // encoding smashable jumps that way.
    assert(kJmpLen == kCallLen);

    // XXX The LLVM check here is terrible and awful and temporary until we fix
    // llvm's smashable tail call support: t5742980. For now it just means it's
    // not safe to run multiple PHP threads when LLVM is enabled.
    always_assert(RuntimeOption::EvalJitLLVM ||
                  isSmashable(addr, x64::kJmpLen));

    auto& cb = mcg->code.blockFor(addr);
    CodeCursor cursor { cb, addr };
    X64Assembler a { cb };
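    // If the target is no more than kJmpLen bytes ahead, fill the gap with
    // nops and fall through instead of emitting a jump over (almost) nothing.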
    if (dest > addr && dest - addr <= x64::kJmpLen) {
      assert(!isCall);
      a.  emitNop(dest - addr);
    } else if (isCall) {
      a.  call   (dest);
    } else {
      a.  jmp    (dest);
    }
  }

 public:
  void smashJmp(TCA jmpAddr, TCA newDest) override {
    assert(MCGenerator::canWrite());
    FTRACE(2, "smashJmp: {} -> {}\n", jmpAddr, newDest);
    smashJmpOrCall(jmpAddr, newDest, false);
  }

  void smashCall(TCA callAddr, TCA newDest) override {
    assert(MCGenerator::canWrite());
    FTRACE(2, "smashCall: {} -> {}\n", callAddr, newDest);
    smashJmpOrCall(callAddr, newDest, true);
  }

  void smashJcc(TCA jccAddr, TCA newDest) override {
    assert(MCGenerator::canWrite());
    FTRACE(2, "smashJcc: {} -> {}\n", jccAddr, newDest);
    // Make sure the encoding is what we expect. It has to be a rip-relative
    // jcc with a 4-byte delta.
    assert(*jccAddr == 0x0F && (*(jccAddr + 1) & 0xF0) == 0x80);
    assert(isSmashable(jccAddr, x64::kJmpccLen));

    // Can't use the assembler to write out a new instruction, because we have
    // to preserve the condition code.
    auto newDelta = safe_cast<int32_t>(newDest - jccAddr - x64::kJmpccLen);
    auto deltaAddr = reinterpret_cast<int32_t*>(jccAddr
                                                + x64::kJmpccLen
                                                - x64::kJmpImmBytes);
    *deltaAddr = newDelta;
  }

  void emitSmashableJump(CodeBlock& cb, TCA dest, ConditionCode cc) override {
    X64Assembler a { cb };
    if (cc == CC_None) {
      assert(isSmashable(cb.frontier(), x64::kJmpLen));
      a.  jmp(dest);
    } else {
      assert(isSmashable(cb.frontier(), x64::kJmpccLen));
      a.  jcc(cc, dest);
    }
  }

  TCA smashableCallFromReturn(TCA retAddr) override {
    auto addr = retAddr - x64::kCallLen;
    assert(isSmashable(addr, x64::kCallLen));
    return addr;
  }

  void emitSmashableCall(CodeBlock& cb, TCA dest) override {
    X64Assembler a { cb };
    assert(isSmashable(cb.frontier(), x64::kCallLen));
    a.  call(dest);
  }

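  // Decode the target of a smashable jmp: either e9 rel32, or a 5-byte nop
  // (0f 1f 44 ...) emitted where the jump target immediately followed the
  // jump, in which case the effective target is the instruction after it.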
  TCA jmpTarget(TCA jmp) override {
    if (jmp[0] != 0xe9) {
      if (jmp[0] == 0x0f &&
          jmp[1] == 0x1f &&
          jmp[2] == 0x44) {
        // 5 byte nop
        return jmp + 5;
      }
      return nullptr;
    }
    return jmp + 5 + ((int32_t*)(jmp + 5))[-1];
  }

  TCA jccTarget(TCA jmp) override {
    if (jmp[0] != 0x0F || (jmp[1] & 0xF0) != 0x80) return nullptr;
    return jmp + 6 + ((int32_t*)(jmp + 6))[-1];
  }

  TCA callTarget(TCA call) override {
    if (call[0] != 0xE8) return nullptr;
    return call + 5 + ((int32_t*)(call + 5))[-1];
  }

  void addDbgGuard(CodeBlock& codeMain, CodeBlock& codeCold,
                   SrcKey sk, size_t dbgOff) override {
    Asm a { codeMain };

    // Emit the checks for debugger attach
    auto rtmp = rAsm;
    emitTLSLoad<ThreadInfo>(a, ThreadInfo::s_threadInfo, rtmp);
    a.   loadb  (rtmp[dbgOff], rbyte(rtmp));
    a.   testb  ((int8_t)0xff, rbyte(rtmp));

    // Branch to a special REQ_INTERPRET if attached
    auto const fallback =
      emitServiceReq(codeCold, REQ_INTERPRET, sk.offset());
    a.   jnz    (fallback);
  }

  void streamPhysReg(std::ostream& os, PhysReg reg) override {
    auto name = (reg.type() == PhysReg::GP) ? reg::regname(Reg64(reg)) :
      (reg.type() == PhysReg::SIMD) ? reg::regname(RegXMM(reg)) :
      /* (reg.type() == PhysReg::SF) ? */ reg::regname(RegSF(reg));
    os << name;
  }

  void disasmRange(std::ostream& os, int indent, bool dumpIR, TCA begin,
                   TCA end) override {
    Disasm disasm(Disasm::Options().indent(indent + 4)
                  .printEncoding(dumpIR)
                  .color(color(ANSI_COLOR_BROWN)));
    disasm.disasm(os, begin, end);
  }

  void genCodeImpl(IRUnit& unit, AsmInfo*) override;
};

std::unique_ptr<jit::BackEnd> newBackEnd() {
  return folly::make_unique<BackEnd>();
}

static size_t genBlock(CodegenState& state, Vout& v, Vout& vc, Block* block) {
  FTRACE(6, "genBlock: {}\n", block->id());
  CodeGenerator cg(state, v, vc);
  size_t hhir_count{0};
  for (IRInstruction& inst : *block) {
    hhir_count++;
    if (inst.is(EndGuards)) state.pastGuards = true;
    v.setOrigin(&inst);
    vc.setOrigin(&inst);
    cg.cgInst(&inst);
  }
  return hhir_count;
}

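// Registers made available to the vasm register allocator: everything the
// regular x64 ABI leaves unreserved, plus the scratch registers rAsm and r11,
// which are reserved in that ABI.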
auto const vasm_gp = x64::abi.gpUnreserved | RegSet(rAsm).add(r11);
auto const vasm_simd = x64::kXMMRegs;
UNUSED const Abi vasm_abi {
  .gpUnreserved = vasm_gp,
  .gpReserved = x64::abi.gp() - vasm_gp,
  .simdUnreserved = vasm_simd,
  .simdReserved = x64::abi.simd() - vasm_simd,
  .calleeSaved = x64::kCalleeSaved,
  .sf = x64::abi.sf
};

void BackEnd::genCodeImpl(IRUnit& unit, AsmInfo* asmInfo) {
  Timer _t(Timer::codeGen);
  CodeBlock& mainCodeIn = mcg->code.main();
  CodeBlock& coldCodeIn = mcg->code.cold();
  CodeBlock* frozenCode = &mcg->code.frozen();

  CodeBlock mainCode;
  CodeBlock coldCode;
  const bool useLLVM = mcg->useLLVM();
  bool relocate = false;
  if (!useLLVM &&
      RuntimeOption::EvalJitRelocationSize &&
      supportsRelocation() &&
      coldCodeIn.canEmit(RuntimeOption::EvalJitRelocationSize * 3)) {
    /*
     * This is mainly to exercise the relocator, and ensure that it's
     * not broken by new non-relocatable code. Later, it will be
     * used to do some peephole optimizations, such as reducing branch
     * sizes.
     * Allocate enough space that the relocated cold code doesn't
     * overlap the emitted cold code.
     */

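    // Shift the relocation target by a random sub-cache-line amount so the
    // copy doesn't simply inherit the original's cache-line alignment; this
    // helps exercise the relocator's alignment-preservation logic.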
    static unsigned seed = 42;
    auto off = rand_r(&seed) & (cacheLineSize() - 1);
    coldCode.init(coldCodeIn.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocCold");

    mainCode.init(coldCode.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocMain");

    relocate = true;
  } else {
    /*
     * Use separate code blocks, so that attempts to use the mcg's
     * code blocks directly will fail (eg by overwriting the same
     * memory being written through these locals).
     */
    coldCode.init(coldCodeIn.frontier(), coldCodeIn.available(),
                  coldCodeIn.name().c_str());
    mainCode.init(mainCodeIn.frontier(), mainCodeIn.available(),
                  mainCodeIn.name().c_str());
  }

  if (frozenCode == &coldCodeIn) {
    frozenCode = &coldCode;
  }

  auto frozenStart = frozenCode->frontier();
  auto coldStart DEBUG_ONLY = coldCodeIn.frontier();
  auto mainStart DEBUG_ONLY = mainCodeIn.frontier();
  size_t hhir_count{0};

  {
    mcg->code.lock();
    mcg->cgFixups().setBlocks(&mainCode, &coldCode, frozenCode);

    SCOPE_EXIT {
      mcg->cgFixups().setBlocks(nullptr, nullptr, nullptr);
      mcg->code.unlock();
    };

    if (RuntimeOption::EvalHHIRGenerateAsserts) {
      emitTraceCall(mainCode, unit.bcOff());
    }

    CodegenState state(unit, asmInfo, *frozenCode);
    auto const blocks = rpoSortCfg(unit);
    Vasm vasm;
    auto& vunit = vasm.unit();
    // create the initial set of vasm blocks, numbered the same as the hhir
    // blocks.
    for (uint32_t i = 0, n = unit.numBlocks(); i < n; ++i) {
      state.labels[i] = vunit.makeBlock(AreaIndex::Main);
    }
    // create vregs for all relevant SSATmps
    assignRegs(unit, vunit, state, blocks, this);
    vunit.entry = state.labels[unit.entry()];
    vasm.main(mainCode);
    vasm.cold(coldCode);
    vasm.frozen(*frozenCode);
    for (auto block : blocks) {
      auto& v = block->hint() == Block::Hint::Unlikely ? vasm.cold() :
                block->hint() == Block::Hint::Unused ? vasm.frozen() :
                vasm.main();
      FTRACE(6, "genBlock {} on {}\n", block->id(),
             area_names[(unsigned)v.area()]);
      auto b = state.labels[block];
      vunit.blocks[b].area = v.area();
      v.use(b);
      hhir_count += genBlock(state, v, vasm.cold(), block);
      assert(v.closed());
      assert(vasm.main().empty() || vasm.main().closed());
      assert(vasm.cold().empty() || vasm.cold().closed());
      assert(vasm.frozen().empty() || vasm.frozen().closed());
    }
    printUnit(kInitialVasmLevel, "after initial vasm generation", vunit);
    assert(check(vunit));

    if (useLLVM) {
      try {
        genCodeLLVM(vunit, vasm.areas(), sortBlocks(vunit));
      } catch (const FailedLLVMCodeGen& e) {
        FTRACE(1, "LLVM codegen failed ({}); falling back to x64 backend\n",
               e.what());
        vasm.finishX64(vasm_abi, state.asmInfo);
      }
    } else {
      vasm.finishX64(vasm_abi, state.asmInfo);
    }
  }

  auto bcMap = &mcg->cgFixups().m_bcMap;
  if (relocate && !bcMap->empty()) {
    TRACE(1, "BCMAPS before relocation\n");
    for (UNUSED auto& map : *bcMap) {
      TRACE(1, "%s %-6d %p %p %p\n", map.md5.toString().c_str(),
            map.bcStart, map.aStart, map.acoldStart, map.afrozenStart);
    }
  }

  assert(coldCodeIn.frontier() == coldStart);
  assert(mainCodeIn.frontier() == mainStart);

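  // When relocating, the code above was emitted into scratch blocks; copy it
  // into the real main/cold blocks now and adjust every recorded reference
  // (metadata, fixups, and code pointers) to the new addresses.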
  if (relocate) {
    if (asmInfo) {
      printUnit(kRelocationLevel, unit, " before relocation ", asmInfo);
    }

    auto& be = mcg->backEnd();
    RelocationInfo rel;
    size_t asm_count{0};
    asm_count += be.relocate(rel, mainCodeIn,
                             mainCode.base(), mainCode.frontier(),
                             mcg->cgFixups(), nullptr);

    asm_count += be.relocate(rel, coldCodeIn,
                             coldCode.base(), coldCode.frontier(),
                             mcg->cgFixups(), nullptr);
    TRACE(1, "hhir-inst-count %ld asm %ld\n", hhir_count, asm_count);

    if (frozenCode != &coldCode) {
      rel.recordRange(frozenStart, frozenCode->frontier(),
                      frozenStart, frozenCode->frontier());
    }
    be.adjustForRelocation(rel);
    be.adjustMetaDataForRelocation(rel, asmInfo, mcg->cgFixups());
    be.adjustCodeForRelocation(rel, mcg->cgFixups());

    if (asmInfo) {
      static int64_t mainDeltaTot = 0, coldDeltaTot = 0;
      int64_t mainDelta =
        (mainCodeIn.frontier() - mainStart) -
        (mainCode.frontier() - mainCode.base());
      int64_t coldDelta =
        (coldCodeIn.frontier() - coldStart) -
        (coldCode.frontier() - coldCode.base());

      mainDeltaTot += mainDelta;
      HPHP::Trace::traceRelease("main delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                mainDelta, mainDeltaTot);
      coldDeltaTot += coldDelta;
      HPHP::Trace::traceRelease("cold delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                coldDelta, coldDeltaTot);
    }
#ifndef NDEBUG
    auto& ip = mcg->cgFixups().m_inProgressTailJumps;
    for (size_t i = 0; i < ip.size(); ++i) {
      const auto& ib = ip[i];
      assert(!mainCode.contains(ib.toSmash()));
      assert(!coldCode.contains(ib.toSmash()));
    }
    memset(mainCode.base(), 0xcc, mainCode.frontier() - mainCode.base());
    memset(coldCode.base(), 0xcc, coldCode.frontier() - coldCode.base());
#endif
  } else {
    coldCodeIn.skip(coldCode.frontier() - coldCodeIn.frontier());
    mainCodeIn.skip(mainCode.frontier() - mainCodeIn.frontier());
  }

  if (asmInfo) {
    printUnit(kCodeGenLevel, unit, " after code gen ", asmInfo);