1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 */

17 /*
18 * HHVM's ARM64 backend works with an early-truncation policy.
19 * That means:
21 * A Vreg8 is an extended W-register with a u8 value.
22 * A Vreg16 is an extended W-register with a u16 value.
23 * A Vreg32 is a W-register with a u32 value.
24 * A Vreg64 is an X-register with a u64 value.
26 * This allows the backend to omit truncation instructions for sub-32-bit
27 * operations. E.g. a testb{Vreg8 s0, Vreg8 s1} would otherwise have to
28 * truncate s0 and s1 before emitting a tst instruction. With the
29 * early-truncation policy, the testb{} emitter can rely on the
30 * fact that s0 and s1 are already truncated and can emit the
31 * tst instruction without preceding uxtb's.
33 * Conversely, any arithmetic instruction has to sign-extend a
34 * Vreg8 before operating on it. Vasm is light on these instructions;
35 * currently only csinc[bw]{} and cmp[bw][i]{} require it.
37 * Early-truncation also has consequences for the extension/truncation
38 * vasm instructions. The following list shows how to use them:
40 * movzbw: Vreg8 -> Vreg16: mov w0, w0 #nop if s==d
41 * movzbl: Vreg8 -> Vreg32: mov w0, w0 #nop if s==d
42 * movzbq: Vreg8 -> Vreg64: uxtb x0, x0
43 * movzwl: Vreg16 -> Vreg32: mov w0, w0 #nop if s==d
44 * movzwq: Vreg16 -> Vreg64: uxth x0, x0
45 * movzlq: Vreg32 -> Vreg64: uxtw x0, x0
46 * movtqb: Vreg64 -> Vreg8: uxtb w0, w0
47 * movtql: Vreg64 -> Vreg32: uxtw w0, w0
49 * Early-truncation also implies that instructions have to truncate
50 * after performing the actual operation if they cannot guarantee that
51 * the resulting VregN type matches. E.g. the emitter for the vasm
52 * instruction andbi{Immed imm, Vreg8 s, Vreg8 d} has to truncate the
53 * result to guarantee that register d indeed holds a u8 value.
55 * Note that the early-truncation policy allows aarch64-specific
56 * optimizations which are not relevant on other architectures.
57 * E.g. x86_64 does not need this policy, as the ISA allows
58 * direct register access for Vreg8, Vreg16, Vreg32 and Vreg64
59 * (e.g. AL, AX, EAX, RAX).
61 * The early-truncation policy relies on the following
62 * requirements of the Vreg type-system:
64 * * All VregNs are created for values of up to N bits
65 * * All conversions between VregNs are done via movz/movt vasm instructions
66 */
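/*
 * Illustrative sketch (not part of the original source): two emitters from
 * further down in this file show the policy in action.  testb{} can rely on
 * its sources already being truncated u8 values and emit a plain 32-bit tst,
 * while movzbq{} has to extend explicitly because the destination is an
 * X-register:
 *
 *   void Vgen::emit(const testb& i)  { a->Tst(W(i.s1), W(i.s0)); }
 *   void Vgen::emit(const movzbq& i) { a->Uxtb(X(i.d), W(i.s).X()); }
 */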
68 #include "hphp/runtime/vm/jit/vasm-emit.h"
70 #include "hphp/runtime/vm/jit/abi-arm.h"
71 #include "hphp/runtime/vm/jit/ir-instruction.h"
72 #include "hphp/runtime/vm/jit/print.h"
73 #include "hphp/runtime/vm/jit/service-requests.h"
74 #include "hphp/runtime/vm/jit/smashable-instr-arm.h"
75 #include "hphp/runtime/vm/jit/timer.h"
76 #include "hphp/runtime/vm/jit/vasm-gen.h"
77 #include "hphp/runtime/vm/jit/vasm.h"
78 #include "hphp/runtime/vm/jit/vasm-instr.h"
79 #include "hphp/runtime/vm/jit/vasm-internal.h"
80 #include "hphp/runtime/vm/jit/vasm-lower.h"
81 #include "hphp/runtime/vm/jit/vasm-print.h"
82 #include "hphp/runtime/vm/jit/vasm-reg.h"
83 #include "hphp/runtime/vm/jit/vasm-unit.h"
84 #include "hphp/runtime/vm/jit/vasm-util.h"
85 #include "hphp/runtime/vm/jit/vasm-visit.h"
87 #include "hphp/vixl/a64/macro-assembler-a64.h"
89 TRACE_SET_MOD(vasm);
91 namespace HPHP { namespace jit {
92 ///////////////////////////////////////////////////////////////////////////////
94 using namespace arm;
95 using namespace vixl;
97 namespace arm { struct ImmFolder; }
99 namespace {
100 ///////////////////////////////////////////////////////////////////////////////
102 static_assert(folly::kIsLittleEndian,
103 "Code contains little-endian specific optimizations.");
105 vixl::Register X(Vreg64 r) {
106 PhysReg pr(r.asReg());
107 return x2a(pr);
110 vixl::Register W(Vreg64 r) {
111 PhysReg pr(r.asReg());
112 return x2a(pr).W();
115 vixl::Register W(Vreg32 r) {
116 PhysReg pr(r.asReg());
117 return x2a(pr).W();
120 vixl::Register W(Vreg16 r) {
121 PhysReg pr(r.asReg());
122 return x2a(pr).W();
125 vixl::Register W(Vreg8 r) {
126 PhysReg pr(r.asReg());
127 return x2a(pr).W();
130 vixl::FPRegister D(Vreg r) {
131 return x2f(r);
134 vixl::VRegister V(Vreg r) {
135 return x2v(r);
138 uint8_t Log2(uint8_t value) {
139 switch (value) {
140 case 1:
141 return 0;
142 case 2:
143 return 1;
144 case 4:
145 return 2;
146 case 8:
147 return 3;
148 default:
149 always_assert(false);
153 vixl::MemOperand M(Vptr p) {
154 assertx(p.base.isValid());
155 if (p.index.isValid()) {
156 assertx(p.disp == 0);
157 return MemOperand(X(p.base), X(p.index), LSL, Log2(p.scale));
159 return MemOperand(X(p.base), p.disp);
162 vixl::Condition C(ConditionCode cc) {
163 return arm::convertCC(cc);
167 * Uses the flags from the Vinstr which defs SF to determine
168 * whether or not the Vixl assembler should emit code which
169 * sets the status flags.
171 vixl::FlagsUpdate UF(Vflags flags) {
172 return flags ? SetFlags : LeaveFlags;
176 * There are numerous ARM instructions that don't set status flags, and
177 * therefore those flags must be set synthetically in the emitters. This
178 * assertion is applied in emitters that cannot set every status flag; it
179 * checks that the flags the implementation does set cover all of the flags
180 * required by the Vinstr which defs SF. The flags field of the Vinstr
181 * determines which bits are required; those required bits are compared
182 * against the bits which are actually set by the implementation.
184 template<class Inst> void checkSF(const Inst& i, StatusFlags s) {
185 Vflags required = i.fl;
186 Vflags set = static_cast<Vflags>(s);
187 always_assert_flog((required & set) == required,
188 "should def SF but does not: {}\n",
189 vinst_names[Vinstr(i).op]);
192 template<class Inst> void checkSF(const Inst& i) {
193 checkSF(i, StatusFlags::None);
197 * Returns true if the queried flag(s) is in the set of required flags.
199 bool flagRequired(Vflags flags, StatusFlags flag) {
200 return (flags & static_cast<Vflags>(flag));
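// Illustrative sketch (not a helper in the original source): the usual shape
// of an emitter that can only produce N and Z synthetically, condensed from
// the shift emitters later in this file.  Bic against xzr is used purely for
// its flag-setting side effect.
//
//   if (i.fl) {
//     checkSF(i, StatusFlags::NotV);                  // V cannot be produced
//     a->Bic(vixl::xzr, X(i.d), vixl::xzr, SetFlags); // set N and Z from i.d
//   }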
203 ///////////////////////////////////////////////////////////////////////////////
205 struct Vgen {
206 explicit Vgen(Venv& env)
207 : env(env)
208 , assem(*env.cb)
209 , a(&assem)
210 , base(a->frontier())
211 , current(env.current)
212 , next(env.next)
213 , jmps(env.jmps)
214 , jccs(env.jccs)
215 , catches(env.catches)
217 ~Vgen() {
218 env.cb->sync(base);
221 static void emitVeneers(Venv& env);
222 static void handleLiterals(Venv& env);
223 static void patch(Venv& env);
225 static void pad(CodeBlock& cb) {
226 vixl::MacroAssembler a { cb };
227 auto const begin = cb.frontier();
228 while (cb.available() >= 4) a.Brk(1);
229 assertx(cb.available() == 0);
230 cb.sync(begin);
233 /////////////////////////////////////////////////////////////////////////////
235 template<class Inst> void emit(const Inst& i) {
236 always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
237 vinst_names[Vinstr(i).op], size_t(current));
240 // intrinsics
241 void emit(const copy& i);
242 void emit(const copy2& i);
243 void emit(const debugtrap& /*i*/) { a->Brk(0); }
244 void emit(const fallthru& /*i*/);
245 void emit(const ldimmb& i);
246 void emit(const ldimml& i);
247 void emit(const ldimmq& i);
248 void emit(const ldimmw& i);
249 void emit(const ldundefq& /*i*/) {}
250 void emit(const load& i);
251 void emit(const store& i);
252 void emit(const mcprep& i);
254 // native function abi
255 void emit(const call& i);
256 void emit(const callr& i) { a->Blr(X(i.target)); }
257 void emit(const calls& i);
258 void emit(const ret& /*i*/) { a->Ret(); }
260 // stub function abi
261 void emit(const callstub& i);
262 void emit(const callfaststub& i);
264 // php function abi
265 void emit(const callphp& i) {
266 emit(call{i.target, i.args});
267 setCallFuncId(env, a->frontier());
269 void emit(const callphpr& i) {
270 emit(callr{i.target, i.args});
271 setCallFuncId(env, a->frontier());
273 void emit(const contenter& i);
274 void emit(const phpret& i);
276 // vm entry abi
277 void emit(const inittc& /*i*/) {}
278 void emit(const leavetc& i);
280 // exceptions
281 void emit(const landingpad& /*i*/) {}
282 void emit(const nothrow& i);
283 void emit(const syncpoint& i);
284 void emit(const unwind& i);
286 // instructions
287 void emit(const absdbl& i) { a->Fabs(D(i.d), D(i.s)); }
288 void emit(const addl& i) { a->Add(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
289 void emit(const addli& i) { a->Add(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
290 void emit(const addq& i) { a->Add(X(i.d), X(i.s1), X(i.s0), UF(i.fl));}
291 void emit(const addqi& i) { a->Add(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
292 void emit(const addsd& i) { a->Fadd(D(i.d), D(i.s1), D(i.s0)); }
293 void emit(const andb& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
294 void emit(const andbi& i) { a->And(W(i.d), W(i.s1), i.s0.ub(), UF(i.fl)); }
295 void emit(const andw& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
296 void emit(const andwi& i) { a->And(W(i.d), W(i.s1), i.s0.uw(), UF(i.fl)); }
297 void emit(const andl& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
298 void emit(const andli& i) { a->And(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
299 void emit(const andq& i) { a->And(X(i.d), X(i.s1), X(i.s0), UF(i.fl)); }
300 void emit(const andqi& i) { a->And(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
301 void emit(const andqi64& i) { a->And(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
302 void emit(const cmovb& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
303 void emit(const cmovw& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
304 void emit(const cmovl& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
305 void emit(const cmovq& i) { a->Csel(X(i.d), X(i.t), X(i.f), C(i.cc)); }
306 // note: cmp{bw}[i] are emitted only for narrow comparisons and _do not_ sign
307 // extend their arguments--these instructions are lowered to cmp{lq}[i] if
308 // the comparison is not narrow or not equality/inequality
309 void emit(const cmpb& i) { a->Cmp(W(i.s1), W(i.s0)); }
310 void emit(const cmpbi& i) { a->Cmp(W(i.s1), static_cast<uint8_t>(i.s0.b())); }
311 void emit(const cmpw& i) { a->Cmp(W(i.s1), W(i.s0)); }
312 void emit(const cmpwi& i) { a->Cmp(W(i.s1), static_cast<uint16_t>(i.s0.w())); }
313 void emit(const cmpl& i) { a->Cmp(W(i.s1), W(i.s0)); }
314 void emit(const cmpli& i) { a->Cmp(W(i.s1), i.s0.l()); }
315 void emit(const cmpq& i) { a->Cmp(X(i.s1), X(i.s0)); }
316 void emit(const cmpqi& i) { a->Cmp(X(i.s1), i.s0.q()); }
317 void emit(const cmpsd& i);
318 // TODO(CDE): csinc[bw]{} should a) sign extend and b) set SF for overflow
319 void emit(const csincb& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
320 void emit(const csincw& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
321 void emit(const csincl& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
322 void emit(const csincq& i) { a->Csinc(X(i.d), X(i.t), X(i.f), C(i.cc)); }
323 void emit(const cvtsi2sd& i) { a->Scvtf(D(i.d), X(i.s)); }
324 void emit(const decl& i) { a->Sub(W(i.d), W(i.s), 1, UF(i.fl)); }
325 void emit(const decq& i) { a->Sub(X(i.d), X(i.s), 1, UF(i.fl)); }
326 void emit(const decqmlock& i);
327 void emit(const divint& i) { a->Sdiv(X(i.d), X(i.s0), X(i.s1)); }
328 void emit(const divsd& i) { a->Fdiv(D(i.d), D(i.s1), D(i.s0)); }
329 void emit(const imul& i);
330 void emit(const incl& i) { a->Add(W(i.d), W(i.s), 1, UF(i.fl)); }
331 void emit(const incq& i) { a->Add(X(i.d), X(i.s), 1, UF(i.fl)); }
332 void emit(const incw& i) { a->Add(W(i.d), W(i.s), 1, UF(i.fl)); }
333 void emit(const jcc& i);
334 void emit(const jcci& i);
335 void emit(const jmp& i);
336 void emit(const jmpi& i);
337 void emit(const jmpr& i) { a->Br(X(i.target)); }
338 void emit(const lea& i);
339 void emit(const leap& i);
340 void emit(const leav& i);
341 void emit(const lead& i);
342 void emit(const loadb& i) { a->Ldrb(W(i.d), M(i.s)); }
343 void emit(const loadl& i) { a->Ldr(W(i.d), M(i.s)); }
344 void emit(const loadsd& i) { a->Ldr(D(i.d), M(i.s)); }
345 void emit(const loadtqb& i) { a->Ldrb(W(i.d), M(i.s)); }
346 void emit(const loadtql& i) { a->Ldr(W(i.d), M(i.s)); }
347 void emit(const loadups& i);
348 void emit(const loadw& i) { a->Ldrh(W(i.d), M(i.s)); }
349 void emit(const loadzbl& i) { a->Ldrb(W(i.d), M(i.s)); }
350 void emit(const loadzbq& i) { a->Ldrb(W(i.d), M(i.s)); }
351 void emit(const loadsbq& i) { a->Ldrsb(X(i.d), M(i.s)); }
352 void emit(const loadsbl& i) { a->Ldrsb(W(i.d), M(i.s)); }
353 void emit(const loadzwq& i) { a->Ldrh(W(i.d), M(i.s)); }
354 void emit(const loadzlq& i) { a->Ldr(W(i.d), M(i.s)); }
355 void emit(const movb& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
356 void emit(const movw& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
357 void emit(const movl& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
358 void emit(const movsbl& i) { a->Sxtb(W(i.d), W(i.s)); }
359 void emit(const movsbq& i) { a->Sxtb(X(i.d), W(i.s).X()); }
360 void emit(const movswl& i) { a->Sxth(W(i.d), W(i.s)); }
361 void emit(const movtqb& i) { a->Uxtb(W(i.d), W(i.s)); }
362 void emit(const movtqw& i) { a->Uxth(W(i.d), W(i.s)); }
363 void emit(const movtql& i) { a->Uxtw(W(i.d), W(i.s)); }
364 void emit(const movzbq& i) { a->Uxtb(X(i.d), W(i.s).X()); }
365 void emit(const movzwq& i) { a->Uxth(X(i.d), W(i.s).X()); }
366 void emit(const movzlq& i) { a->Uxtw(X(i.d), W(i.s).X()); }
367 void emit(const mulsd& i) { a->Fmul(D(i.d), D(i.s1), D(i.s0)); }
368 void emit(const neg& i) { a->Neg(X(i.d), X(i.s), UF(i.fl)); }
369 void emit(const nop& /*i*/) { a->Nop(); }
370 void emit(const notb& i) { a->Mvn(W(i.d), W(i.s)); }
371 void emit(const not& i) { a->Mvn(X(i.d), X(i.s)); }
372 void emit(const orbi& i);
373 void emit(const orq& i);
374 void emit(const orwi& i);
375 void emit(const orli& i);
376 void emit(const orqi& i);
377 void emit(const pop& i);
378 void emit(const popp& i);
379 void emit(const push& i);
380 void emit(const pushp& i);
381 void emit(const roundsd& i);
382 void emit(const sar& i);
383 void emit(const sarqi& i);
384 void emit(const setcc& i) { a->Cset(W(i.d), C(i.cc)); }
385 void emit(const shl& i);
386 void emit(const shlli& i);
387 void emit(const shlqi& i);
388 void emit(const shrli& i);
389 void emit(const shrqi& i);
390 void emit(const sqrtsd& i) { a->Fsqrt(D(i.d), D(i.s)); }
391 void emit(const srem& i);
392 void emit(const storeb& i) { a->Strb(W(i.s), M(i.m)); }
393 void emit(const storel& i) { a->Str(W(i.s), M(i.m)); }
394 void emit(const storesd& i) { emit(store{i.s, i.m}); }
395 void emit(const storeups& i);
396 void emit(const storew& i) { a->Strh(W(i.s), M(i.m)); }
397 void emit(const subl& i) { a->Sub(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
398 void emit(const subli& i) { a->Sub(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
399 void emit(const subq& i) { a->Sub(X(i.d), X(i.s1), X(i.s0), UF(i.fl)); }
400 void emit(const subqi& i) { a->Sub(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
401 void emit(const subsd& i) { a->Fsub(D(i.d), D(i.s1), D(i.s0)); }
402 void emit(const testb& i){ a->Tst(W(i.s1), W(i.s0)); }
403 void emit(const testbi& i){ a->Tst(W(i.s1), i.s0.ub()); }
404 void emit(const testw& i){ a->Tst(W(i.s1), W(i.s0)); }
405 void emit(const testwi& i){ a->Tst(W(i.s1), i.s0.uw()); }
406 void emit(const testl& i) { a->Tst(W(i.s1), W(i.s0)); }
407 void emit(const testli& i) { a->Tst(W(i.s1), i.s0.l()); }
408 void emit(const testq& i) { a->Tst(X(i.s1), X(i.s0)); }
409 void emit(const testqi& i) { a->Tst(X(i.s1), i.s0.q()); }
410 void emit(const trap& /*i*/);
411 void emit(const ucomisd& i) { a->Fcmp(D(i.s0), D(i.s1)); }
412 void emit(const unpcklpd&);
413 void emit(const xorb& i);
414 void emit(const xorbi& i);
415 void emit(const xorw& i);
416 void emit(const xorwi& i);
417 void emit(const xorl& i);
418 void emit(const xorq& i);
419 void emit(const xorqi& i);
421 // arm intrinsics
422 void emit(const fcvtzs& i) { a->Fcvtzs(X(i.d), D(i.s)); }
423 void emit(const mrs& i) { a->Mrs(X(i.r), vixl::SystemRegister(i.s.l())); }
424 void emit(const msr& i) { a->Msr(vixl::SystemRegister(i.s.l()), X(i.r)); }
425 void emit(const ubfmli& i) { a->ubfm(W(i.d), W(i.s), i.mr.w(), i.ms.w()); }
427 void emit_nop() { a->Nop(); }
429 private:
430 CodeBlock& frozen() { return env.text.frozen().code; }
431 static void recordAddressImmediate(Venv& env, TCA addr) {
432 env.meta.addressImmediates.insert(addr);
434 void recordAddressImmediate() {
435 env.meta.addressImmediates.insert(env.cb->frontier());
438 private:
439 Venv& env;
440 vixl::MacroAssembler assem;
441 vixl::MacroAssembler* a;
442 Address base;
444 const Vlabel current;
445 const Vlabel next;
446 jit::vector<Venv::LabelPatch>& jmps;
447 jit::vector<Venv::LabelPatch>& jccs;
448 jit::vector<Venv::LabelPatch>& catches;
451 ///////////////////////////////////////////////////////////////////////////////
453 static CodeBlock* getBlock(Venv& env, CodeAddress a) {
454 for (auto const& area : env.text.areas()) {
455 if (area.code.contains(a)) {
456 return &area.code;
459 return nullptr;
462 static CodeAddress toReal(Venv& env, CodeAddress a) {
463 CodeBlock* b = getBlock(env, a);
464 return (b == nullptr) ? a : b->toDestAddress(a);
467 void Vgen::emitVeneers(Venv& env) {
468 auto& meta = env.meta;
469 decltype(env.meta.veneers) notEmitted;
471 for (auto const& veneer : meta.veneers) {
472 auto cb = getBlock(env, veneer.source);
473 if (!cb) {
474 // If we can't find the code block, it must have been emitted by a Vunit
475 // wrapping this one (bindjmp emits a Vunit within a Vunit).
476 notEmitted.push_back(veneer);
477 continue;
479 auto const vaddr = cb->frontier();
481 FTRACE(1, "emitVeneers: source = {}, target = {}, veneer at {}\n",
482 veneer.source, veneer.target, vaddr);
484 // Emit the veneer code: LDR + BR.
485 meta.veneerAddrs.insert(vaddr);
486 MacroAssembler av{*cb};
487 vixl::Label target_data;
488 meta.addressImmediates.insert(vaddr);
489 poolLiteral(*cb, meta, (uint64_t)makeTarget32(veneer.target), 32, true);
490 av.bind(&target_data);
491 av.Ldr(rAsm_w, &target_data);
492 av.Br(rAsm);
494 // Update the veneer source instruction to jump/call the veneer.
495 auto const realSource = toReal(env, veneer.source);
496 CodeBlock tmpBlock;
497 tmpBlock.init(realSource, kInstructionSize, "emitVeneers");
498 MacroAssembler at{tmpBlock};
499 int64_t offset = vaddr - veneer.source;
500 auto sourceInst = Instruction::Cast(realSource);
502 if (sourceInst->Mask(UnconditionalBranchMask) == B) {
503 always_assert(is_int28(offset));
504 at.b(offset >> kInstructionSizeLog2);
506 } else if (sourceInst->Mask(UnconditionalBranchMask) == BL) {
507 always_assert(is_int28(offset));
508 at.bl(offset >> kInstructionSizeLog2);
510 } else if (sourceInst->IsCondBranchImm()) {
511 auto const cond = static_cast<Condition>(sourceInst->ConditionBranch());
512 if (is_int21(offset)) {
513 at.b(offset >> kInstructionSizeLog2, cond);
514 } else {
515 // The offset doesn't fit in a conditional jump. Hopefully it still fits
516 // in an unconditional jump, in which case we add an appendix to the
517 // veneer.
518 offset += 2 * kInstructionSize;
519 always_assert(is_int28(offset));
520 // Add an appendix to the veneer, and jump to it instead. The full
521 // veneer in this case looks like:
522 // VENEER:
523 // LDR RX, LITERAL_ADDR
524 // BR RX
525 // APPENDIX:
526 // B.CC VENEER
527 // B NEXT
528 // And the conditional jump into the veneer is turned into a jump to the
529 // appendix:
530 // B APPENDIX
531 // NEXT:
533 // Turn the original conditional branch into an unconditional one.
534 at.b(offset >> kInstructionSizeLog2);
536 // Emit appendix.
537 auto const appendix = cb->frontier();
538 av.b(-2 /* veneer starts 2 instructions before the appendix */, cond);
539 const int64_t nextOffset = (veneer.source + kInstructionSize) - // NEXT
540 (vaddr + 3 * kInstructionSize); // addr of "B NEXT"
541 always_assert(is_int28(nextOffset));
542 av.b(nextOffset >> kInstructionSizeLog2);
544 // Replace veneer.source with appendix in the relevant metadata.
545 meta.smashableLocations.erase(veneer.source);
546 meta.smashableLocations.insert(appendix);
547 for (auto& tj : meta.inProgressTailJumps) {
548 if (tj.toSmash() == veneer.source) tj.adjust(appendix);
550 for (auto& stub : env.stubs) {
551 if (stub.jcc == veneer.source) stub.jcc = appendix;
554 } else {
555 always_assert_flog(0, "emitVeneers: invalid source instruction at source"
556 " {} (realSource = {})",
557 veneer.source, realSource);
561 env.meta.veneers.swap(notEmitted);
564 void Vgen::handleLiterals(Venv& env) {
565 decltype(env.meta.literalsToPool) notEmitted;
566 for (auto const& pl : env.meta.literalsToPool) {
567 auto const cb = getBlock(env, pl.patchAddress);
568 if (!cb) {
569 // If we can't find the code block it must have been emitted by a Vunit
570 // wrapping this one. (bindjmp emits a Vunit within a Vunit)
571 notEmitted.push_back(pl);
572 continue;
575 // Emit the literal.
576 auto literalAddress = cb->frontier();
577 if (pl.width == 32) {
578 cb->dword(static_cast<uint32_t>(pl.value));
579 } else if (pl.width == 64) {
580 if (pl.smashable) {
581 // Although the region is actually dead, we mark it as live, so that
582 // the relocator can remove the padding.
583 align(*cb, &env.meta, Alignment::QuadWordSmashable, AlignContext::Live);
584 literalAddress = cb->frontier();
586 cb->qword(pl.value);
587 } else {
588 not_reached();
591 // Patch the LDR.
592 auto const patchAddressActual =
593 Instruction::Cast(toReal(env, pl.patchAddress));
594 assertx(patchAddressActual->IsLoadLiteral());
595 patchAddressActual->SetImmPCOffsetTarget(
596 Instruction::Cast(literalAddress),
597 Instruction::Cast(pl.patchAddress));
600 if (env.meta.fallthru) {
601 auto const fallthru = *env.meta.fallthru;
602 auto const cb = getBlock(env, fallthru);
603 if (!cb) {
604 always_assert_flog(false,
605 "Fallthrus shouldn't be used in nested Vunits.");
607 auto const blockEndAddr = cb->frontier();
608 auto const startAddr = cb->toDestAddress(fallthru);
609 CodeBlock tmp;
610 tmp.init(startAddr, kInstructionSize, "Tmp");
611 // Write the jmp.
612 Assembler a { tmp };
613 recordAddressImmediate(env, fallthru);
614 a.b((blockEndAddr - fallthru) >> kInstructionSizeLog2);
617 env.meta.literalsToPool.swap(notEmitted);
620 void Vgen::patch(Venv& env) {
621 // Patch the 32 bit target of the LDR
622 auto patch = [&env](TCA instr, TCA target) {
623 // The LDR loading the address to branch to.
624 auto ldr = Instruction::Cast(instr);
625 auto const DEBUG_ONLY br = ldr->NextInstruction();
626 assertx(ldr->Mask(LoadLiteralMask) == LDR_w_lit &&
627 br->Mask(UnconditionalBranchToRegisterMask) == BR &&
628 ldr->Rd() == br->Rn());
629 // The address the LDR loads.
630 auto targetAddr = ldr->LiteralAddress();
631 // Patch the 32 bit target following the LDR and BR
632 patchTarget32(targetAddr, target);
635 for (auto const& p : env.jmps) {
636 auto addr = toReal(env, p.instr);
637 auto const target = env.addrs[p.target];
638 assertx(target);
639 if (env.meta.smashableLocations.count(p.instr)) {
640 assertx(possiblySmashableJmp(addr));
641 // Update `addr' to point to the veneer.
642 addr = TCA(vixl::Instruction::Cast(addr)->ImmPCOffsetTarget());
644 // Patch the address we are jumping to.
645 patch(addr, target);
647 for (auto const& p : env.jccs) {
648 auto addr = toReal(env, p.instr);
649 auto const target = env.addrs[p.target];
650 assertx(target);
651 if (env.meta.smashableLocations.count(p.instr)) {
652 assertx(possiblySmashableJcc(addr));
653 // Update `addr' to point to the veneer.
654 addr = TCA(vixl::Instruction::Cast(addr)->ImmPCOffsetTarget());
655 } else {
656 assertx(Instruction::Cast(addr)->IsCondBranchImm());
657 // If the jcc starts with a conditional jump, patch the next instruction
658 // (which should start with a LDR).
659 addr += kInstructionSize;
661 patch(addr, target);
663 for (auto const& p : env.leas) {
664 (void)p;
665 not_implemented();
669 ///////////////////////////////////////////////////////////////////////////////
671 void Vgen::emit(const copy& i) {
672 if (i.s == i.d) return;
673 if (i.s.isGP() && i.d.isGP()) {
674 a->Mov(X(i.d), X(i.s));
675 } else if (i.s.isSIMD() && i.d.isGP()) {
676 a->Fmov(X(i.d), D(i.s));
677 } else if (i.s.isGP() && i.d.isSIMD()) {
678 a->Fmov(D(i.d), X(i.s));
679 } else {
680 assertx(i.s.isSIMD() && i.d.isSIMD());
681 a->mov(V(i.d), V(i.s));
685 void Vgen::emit(const copy2& i) {
686 assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
687 auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
688 assertx(d0 != d1);
689 if (d0 == s1) {
690 if (d1 == s0) {
691 a->Eor(X(d0), X(d0), X(s0));
692 a->Eor(X(s0), X(d0), X(s0));
693 a->Eor(X(d0), X(d0), X(s0));
694 } else {
695 // could do this in a simplify pass
696 if (s1 != d1) a->Mov(X(s1), X(d1)); // save s1 first; d1 != s0
697 if (s0 != d0) a->Mov(X(s0), X(d0));
699 } else {
700 // could do this in a simplify pass
701 if (s0 != d0) a->Mov(X(s0), X(d0));
702 if (s1 != d1) a->Mov(X(s1), X(d1));
706 void emitSimdImmInt(vixl::MacroAssembler* a, uint64_t val, Vreg d) {
707 // Assembler::fmov emits an ldr from a literal pool if IsImmFP64 is false.
708 // In that case, emit the raw bits into a GPR first and then move them
709 // unmodified into the destination SIMD register.
710 union { double dval; uint64_t ival; };
711 ival = val;
712 if (vixl::Assembler::IsImmFP64(dval)) {
713 a->Fmov(D(d), dval);
714 } else if (ival == 0) {
715 a->Fmov(D(d), vixl::xzr);
716 } else {
717 a->Mov(rAsm, ival);
718 a->Fmov(D(d), rAsm);
721 void Vgen::emit(const fallthru& /*i*/) {
722 always_assert(!env.meta.fallthru);
723 env.meta.fallthru = a->frontier();
724 a->nop();
727 #define Y(vasm_opc, simd_w, vr_w, gpr_w, imm) \
728 void Vgen::emit(const vasm_opc& i) { \
729 if (i.d.isSIMD()) { \
730 emitSimdImmInt(a, static_cast<uint##vr_w##_t>(i.s.simd_w()), i.d); \
731 } else { \
732 Vreg##vr_w d = i.d; \
733 a->Mov(gpr_w(d), imm); \
737 Y(ldimmb, ub, 8, W, i.s.ub())
738 Y(ldimmw, uw, 16, W, i.s.uw())
739 Y(ldimml, l, 32, W, i.s.l())
740 Y(ldimmq, q, 64, X, i.s.q())
742 #undef Y
744 void Vgen::emit(const load& i) {
745 if (i.d.isGP()) {
746 a->Ldr(X(i.d), M(i.s));
747 } else {
748 a->Ldr(D(i.d), M(i.s));
752 void Vgen::emit(const store& i) {
753 if (i.s.isGP()) {
754 if (i.s == rsp()) {
755 a->Mov(rAsm, X(i.s));
756 a->Str(rAsm, M(i.d));
757 } else {
758 a->Str(X(i.s), M(i.d));
760 } else {
761 a->Str(D(i.s), M(i.d));
765 ///////////////////////////////////////////////////////////////////////////////
767 void Vgen::emit(const mcprep& i) {
769 * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
770 * address of the movq) so that we can find the movq from the handler.
772 * We set the low bit for two reasons: the Class* will never be a valid
773 * Class*, so we'll always miss the inline check before it's smashed, and
774 * MethodCache::handleStaticCall can tell it hasn't been smashed yet.
777 align(*env.cb, &env.meta, Alignment::SmashMovq, AlignContext::Live);
778 auto const imm = reinterpret_cast<uint64_t>(a->frontier());
779 emitSmashableMovq(*env.cb, env.meta, (imm << 1) | 1, r64(i.d));
781 env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
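// Illustrative sketch (not from this file) of how a handler can recover the
// movq from the cached value; the actual consumer is
// MethodCache::handleStaticCall:
//
//   if (value & 1) {                                       // low bit: not yet smashed
//     auto const movq = reinterpret_cast<TCA>(value >> 1); // address of the movq
//     // ... resolve the call and smash the movq ...
//   }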
784 ///////////////////////////////////////////////////////////////////////////////
786 void Vgen::emit(const call& i) {
787 recordAddressImmediate();
788 a->Mov(rAsm, i.target);
789 a->Blr(rAsm);
790 if (i.watch) {
791 *i.watch = a->frontier();
792 env.meta.watchpoints.push_back(i.watch);
796 void Vgen::emit(const calls& i) {
797 emitSmashableCall(*env.cb, env.meta, i.target);
800 ///////////////////////////////////////////////////////////////////////////////
802 void Vgen::emit(const callstub& i) {
803 emit(call{i.target, i.args});
806 void Vgen::emit(const callfaststub& i) {
807 emit(call{i.target, i.args});
808 emit(syncpoint{i.fix});
811 ///////////////////////////////////////////////////////////////////////////////
813 void Vgen::emit(const phpret& i) {
814 // prefer load-pair instruction
815 if (!i.noframe) {
816 a->ldp(X(rvmfp()), X(rlr()), X(i.fp)[AROFF(m_sfp)]);
817 } else {
818 a->Ldr(X(rlr()), X(i.fp)[AROFF(m_savedRip)]);
820 emit(ret{});
823 void Vgen::emit(const contenter& i) {
824 vixl::Label stub, end;
826 // Jump past the stub below.
827 recordAddressImmediate();
828 a->B(&end);
830 // We call into this stub from the end below. Take that LR and store it in
831 // m_savedRip. Then jump to the target.
832 a->bind(&stub);
833 a->Str(X(rlr()), M(i.fp[AROFF(m_savedRip)]));
834 a->Br(X(i.target));
836 // Call to stub above and then unwind.
837 a->bind(&end);
838 recordAddressImmediate();
839 a->Bl(&stub);
840 emit(unwind{{i.targets[0], i.targets[1]}});
843 ///////////////////////////////////////////////////////////////////////////////
845 void Vgen::emit(const leavetc& /*i*/) {
846 // The LR was preserved on the stack by resumetc. Pop it while preserving
847 // SP alignment and return.
848 a->Ldp(rAsm, X(rlr()), MemOperand(sp, 16, PostIndex));
849 a->Ret();
852 ///////////////////////////////////////////////////////////////////////////////
854 void Vgen::emit(const nothrow& /*i*/) {
855 env.meta.catches.emplace_back(a->frontier(), nullptr);
856 env.record_inline_stack(a->frontier());
859 void Vgen::emit(const syncpoint& i) {
860 FTRACE(5, "IR recordSyncPoint: {} {}\n", a->frontier(), i.fix.show());
861 env.meta.fixups.emplace_back(a->frontier(), i.fix);
862 env.record_inline_stack(a->frontier());
865 void Vgen::emit(const unwind& i) {
866 catches.push_back({a->frontier(), i.targets[1]});
867 env.record_inline_stack(a->frontier());
868 emit(jmp{i.targets[0]});
871 ///////////////////////////////////////////////////////////////////////////////
874 * Flags
875 * SF should be set to MSB of the result
876 * CF, OF should be set to (1, 1) if the result is truncated, (0, 0) otherwise
877 * ZF, AF, PF are undefined
879 * In the following implementation,
880 * N, Z, V are updated according to result
881 * C is cleared (FIXME)
883 void Vgen::emit(const imul& i) {
885 // Do the multiplication
886 a->Mul(X(i.d), X(i.s0), X(i.s1));
888 // If we have to set any flags, then always set N and Z since it's cheap.
889 // Only set V when absolutely necessary. C is not supported.
890 if (i.fl) {
891 vixl::Label after;
893 checkSF(i, StatusFlags::NotC);
895 if (flagRequired(i.fl, StatusFlags::V)) {
896 vixl::Label checkSign;
897 vixl::Label Overflow;
899 // Do the multiplication for the upper 64 bits of a 128 bit result.
900 // If the result is not all zeroes or all ones, then we have overflow.
901 // If the result is all zeroes or all ones, and the sign is the same
902 // for both hi and lo, then there is no overflow.
903 a->smulh(rAsm, X(i.s0), X(i.s1));
905 // If hi is all 0's or 1's, then check the sign, else overflow
906 // (fallthrough).
907 recordAddressImmediate();
908 a->Cbz(rAsm, &checkSign);
909 a->Cmp(rAsm, -1);
910 recordAddressImmediate();
911 a->B(&checkSign, vixl::eq);
913 // Overflow, so conditionally set the N and Z bits and then OR in the V bit.
914 a->Bind(&Overflow);
915 a->Bic(vixl::xzr, X(i.d), vixl::xzr, SetFlags);
916 a->Mrs(rAsm, NZCV);
917 a->Orr(rAsm, rAsm, 1<<28);
918 a->Msr(NZCV, rAsm);
919 recordAddressImmediate();
920 a->B(&after);
922 // Check the signs of hi and lo.
923 a->Bind(&checkSign);
924 a->Eor(rAsm, rAsm, X(i.d));
925 recordAddressImmediate();
926 a->Tbnz(rAsm, 63, &Overflow);
929 // No overflow, so conditionally set only the N and Z bits.
930 a->Bic(vixl::xzr, X(i.d), vixl::xzr, SetFlags);
932 a->bind(&after);
936 void Vgen::emit(const decqmlock& i) {
937 auto adr = M(i.m);
938 /* Use VIXL's macroassembler scratch regs. */
939 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg);
940 if (RuntimeOption::EvalJitArmLse) {
941 a->Mov(rVixlScratch0, -1);
942 a->ldaddal(rVixlScratch0, rVixlScratch0, adr);
943 a->Sub(rAsm, rVixlScratch0, 1, SetFlags);
944 } else {
945 vixl::Label again;
946 a->bind(&again);
947 a->ldxr(rAsm, adr);
948 a->Sub(rAsm, rAsm, 1, SetFlags);
949 a->stxr(rVixlScratch0, rAsm, adr);
950 recordAddressImmediate();
951 a->Cbnz(rVixlScratch0, &again);
953 /* Restore VIXL's scratch regs. */
954 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1);
957 void Vgen::emit(const jcc& i) {
958 if (i.targets[1] != i.targets[0]) {
959 if (next == i.targets[1]) {
960 return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
962 auto taken = i.targets[1];
963 jccs.push_back({a->frontier(), taken});
964 vixl::Label skip, data;
966 // Emit a "far JCC" sequence for easy patching later. Static relocation
967 // might be able to simplify this later (see optimizeFarJcc()).
968 recordAddressImmediate();
969 a->B(&skip, vixl::InvertCondition(C(i.cc)));
970 recordAddressImmediate();
971 poolLiteral(*env.cb, env.meta, (uint64_t)makeTarget32(a->frontier()),
972 32, false);
973 a->bind(&data); // This will be remapped during the handleLiterals phase.
974 a->Ldr(rAsm_w, &data);
975 a->Br(rAsm);
976 a->bind(&skip);
978 emit(jmp{i.targets[0]});
981 void Vgen::emit(const jcci& i) {
982 vixl::Label skip;
984 recordAddressImmediate();
985 a->B(&skip, vixl::InvertCondition(C(i.cc)));
986 emit(jmpi{i.taken});
987 a->bind(&skip);
988 emit(jmp{i.target});
991 void Vgen::emit(const jmp& i) {
992 if (next == i.target) return;
993 jmps.push_back({a->frontier(), i.target});
994 vixl::Label data;
996 // Emit a "far JMP" sequence for easy patching later. Static relocation
997 // might be able to simplify this (see optimizeFarJmp()).
998 recordAddressImmediate();
999 poolLiteral(*env.cb, env.meta, (uint64_t)a->frontier(), 32, false);
1000 a->bind(&data); // This will be remapped during the handleLiterals phase.
1001 a->Ldr(rAsm_w, &data);
1002 a->Br(rAsm);
1005 void Vgen::emit(const jmpi& i) {
1006 vixl::Label data;
1008 // If target can be addressed by pc relative offset (signed 26 bits), emit
1009 // PC relative jump. Else, emit target address into code and load from there.
1010 auto diff = (i.target - a->frontier()) >> vixl::kInstructionSizeLog2;
1011 if (vixl::is_int26(diff)) {
1012 recordAddressImmediate();
1013 a->b(diff);
1014 } else {
1015 // Cannot use simple a->Mov() since such a sequence cannot be
1016 // adjusted while live following a relocation.
1017 recordAddressImmediate();
1018 poolLiteral(*env.cb, env.meta, (uint64_t)i.target, 32, false);
1019 a->bind(&data); // This will be remapped during the handleLiterals phase.
1020 a->Ldr(rAsm_w, &data);
1021 a->Br(rAsm);
1025 void Vgen::emit(const lea& i) {
1026 auto p = i.s;
1027 assertx(p.base.isValid());
1028 if (p.index.isValid()) {
1029 assertx(p.disp == 0);
1030 a->Add(X(i.d), X(p.base), Operand(X(p.index), LSL, Log2(p.scale)));
1031 } else {
1032 a->Add(X(i.d), X(p.base), p.disp);
1036 void Vgen::emit(const leav& i) {
1037 auto const addr = a->frontier();
1038 emit(leap{reg::rip[0xdeadbeef], i.d});
1039 env.leas.push_back({addr, i.s});
1042 void Vgen::emit(const leap& i) {
1043 vixl::Label imm_data;
1044 vixl::Label after_data;
1046 // Cannot use simple a->Mov() since such a sequence cannot be
1047 // adjusted while live following a relocation.
1048 recordAddressImmediate();
1049 poolLiteral(*env.cb, env.meta, (uint64_t)makeTarget32(i.s.r.disp),
1050 32, false);
1051 a->bind(&imm_data); // This will be remapped during the handleLiterals phase.
1052 a->Ldr(W(i.d), &imm_data);
1055 void Vgen::emit(const lead& i) {
1056 recordAddressImmediate();
1057 a->Mov(X(i.d), i.s.get());
1060 #define Y(vasm_opc, arm_opc, src_dst, m) \
1061 void Vgen::emit(const vasm_opc& i) { \
1062 assertx(i.m.base.isValid()); \
1063 a->Mov(rAsm, X(i.m.base)); \
1064 if (i.m.index.isValid()) { \
1065 a->Add(rAsm, rAsm, Operand(X(i.m.index), LSL, Log2(i.m.scale))); \
1067 if (i.m.disp != 0) { \
1068 a->Add(rAsm, rAsm, i.m.disp); \
1070 a->arm_opc(V(i.src_dst), MemOperand(rAsm)); \
1073 Y(loadups, ld1, d, s)
1074 Y(storeups, st1, s, m)
1076 #undef Y
1079 * Flags
1080 * SF, ZF, PF should be updated according to result
1081 * CF, OF should be cleared
1082 * AF is undefined
1084 * In the following implementation,
1085 * N, Z are updated according to result
1086 * C, V are cleared
1088 #define Y(vasm_opc, arm_opc, gpr_w, s0, zr) \
1089 void Vgen::emit(const vasm_opc& i) { \
1090 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), s0); \
1091 if (i.fl) { \
1092 a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
1096 Y(orbi, Orr, W, i.s0.ub(), wzr);
1097 Y(orwi, Orr, W, i.s0.uw(), xzr);
1098 Y(orli, Orr, W, i.s0.l(), xzr);
1099 Y(orqi, Orr, X, i.s0.q(), xzr);
1100 Y(orq, Orr, X, X(i.s0), xzr);
1101 Y(xorb, Eor, W, W(i.s0), wzr);
1102 Y(xorbi, Eor, W, i.s0.ub(), wzr);
1103 Y(xorw, Eor, W, W(i.s0), wzr);
1104 Y(xorwi, Eor, W, i.s0.uw(), wzr);
1105 Y(xorl, Eor, W, W(i.s0), wzr);
1106 Y(xorq, Eor, X, X(i.s0), xzr);
1107 Y(xorqi, Eor, X, i.s0.q(), xzr);
1109 #undef Y
1111 void Vgen::emit(const pop& i) {
1112 // SP access must be 8 byte aligned. Use rAsm instead.
1113 a->Mov(rAsm, sp);
1114 a->Ldr(X(i.d), MemOperand(rAsm, 8, PostIndex));
1115 a->Mov(sp, rAsm);
1118 void Vgen::emit(const push& i) {
1119 // SP access must be 8 byte aligned. Use rAsm instead.
1120 a->Mov(rAsm, sp);
1121 a->Str(X(i.s), MemOperand(rAsm, -8, PreIndex));
1122 a->Mov(sp, rAsm);
1125 void Vgen::emit(const roundsd& i) {
1126 switch (i.dir) {
1127 case RoundDirection::nearest: {
1128 a->frintn(D(i.d), D(i.s));
1129 break;
1132 case RoundDirection::floor: {
1133 a->frintm(D(i.d), D(i.s));
1134 break;
1137 case RoundDirection:: ceil: {
1138 a->frintp(D(i.d), D(i.s));
1139 break;
1142 default: {
1143 assertx(i.dir == RoundDirection::truncate);
1144 a->frintz(D(i.d), D(i.s));
1149 void Vgen::emit(const srem& i) {
1150 a->Sdiv(rAsm, X(i.s0), X(i.s1));          // rAsm = s0 / s1
1151 a->Msub(X(i.d), rAsm, X(i.s1), X(i.s0));  // d = s0 - rAsm * s1
1154 void Vgen::emit(const trap& i) {
1155 env.meta.trapReasons.emplace_back(a->frontier(), i.reason);
1156 a->Brk(1);
1159 void Vgen::emit(const unpcklpd& i) {
1160 // i.d and i.s1 can be same, i.s0 is unique.
1161 if (i.d != i.s1) a->fmov(D(i.d), D(i.s1));
1162 a->fmov(rAsm, D(i.s0));
1163 a->fmov(D(i.d), 1, rAsm);
1166 ///////////////////////////////////////////////////////////////////////////////
1168 void Vgen::emit(const cmpsd& i) {
1170 * cmpsd doesn't update SF, so preserve the current flags in a temp.
1171 * Use one of the macroassembler scratch regs.
1173 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg);
1174 a->Mrs(rVixlScratch0, NZCV);
1176 a->Fcmp(D(i.s0), D(i.s1));
1177 switch (i.pred) {
1178 case ComparisonPred::eq_ord:
1179 a->Csetm(rAsm, C(jit::CC_E));
1180 break;
1181 case ComparisonPred::ne_unord:
1182 a->Csetm(rAsm, C(jit::CC_NE));
1183 break;
1184 default:
1185 always_assert(false);
1187 a->Fmov(D(i.d), rAsm);
1189 /* Copy the flags back to the system register. */
1190 a->Msr(NZCV, rVixlScratch0);
1191 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1);
1195 ///////////////////////////////////////////////////////////////////////////////
1198 * For the shifts:
1200 * C is set through inspection
1201 * N, Z are updated according to result
1202 * V is cleared (FIXME)
1203 * PF, AF are not available
1205 * Only set the flags if there are any required flags (i.fl).
1206 * Setting the C flag is particularly expensive, so when setting
1207 * flags check this flag specifically.
1209 #define Y(vasm_opc, arm_opc, gpr_w, zr) \
1210 void Vgen::emit(const vasm_opc& i) { \
1211 if (!i.fl) { \
1212 /* Just perform the shift. */ \
1213 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
1214 } else { \
1215 checkSF(i, StatusFlags::NotV); \
1216 if (!flagRequired(i.fl, StatusFlags::C)) { \
1217 /* Perform the shift and set N and Z. */ \
1218 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
1219 a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
1220 } else { \
1221 /* Use VIXL's macroassembler scratch regs. */ \
1222 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
1223 /* Perform the shift using temp and set N and Z. */ \
1224 a->arm_opc(rVixlScratch0, gpr_w(i.s1), gpr_w(i.s0)); \
1225 a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
1226 /* Read the flags into a temp. */ \
1227 a->Mrs(rAsm, NZCV); \
1228 /* Reshift right leaving the last bit as bit 0. */ \
1229 a->Sub(rVixlScratch1, gpr_w(i.s0), 1); \
1230 a->Lsr(rVixlScratch1, gpr_w(i.s1), rVixlScratch1); \
1231 /* Negate the bits, including bit 0 to match X64. */ \
1232 a->Mvn(rVixlScratch1, rVixlScratch1); \
1233 /* Copy bit zero into bit 29 of the flags. */ \
1234 a->bfm(rAsm, rVixlScratch1, 35, 0); \
1235 /* Copy the flags back to the system register. */ \
1236 a->Msr(NZCV, rAsm); \
1237 /* Copy the result to the destination. */ \
1238 a->Mov(gpr_w(i.d), rVixlScratch0); \
1239 /* Restore VIXL's scratch regs. */ \
1240 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
1245 Y(sar, Asr, X, xzr)
1247 #undef Y
1249 #define Y(vasm_opc, arm_opc, gpr_w, sz, zr) \
1250 void Vgen::emit(const vasm_opc& i) { \
1251 if (!i.fl) { \
1252 /* Just perform the shift. */ \
1253 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
1254 } else { \
1255 checkSF(i, StatusFlags::NotV); \
1256 if (!flagRequired(i.fl, StatusFlags::C)) { \
1257 /* Perform the shift and set N and Z. */ \
1258 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
1259 a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
1260 } else { \
1261 /* Use VIXL's macroassembler scratch regs. */ \
1262 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
1263 /* Perform the shift using temp and set N and Z. */ \
1264 a->arm_opc(rVixlScratch0, gpr_w(i.s1), gpr_w(i.s0)); \
1265 a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
1266 /* Read the flags into a temp. */ \
1267 a->Mrs(rAsm, NZCV); \
1268 /* Reshift right leaving the last bit as bit 0. */ \
1269 a->Mov(rVixlScratch1, sz); \
1270 a->Sub(rVixlScratch1, rVixlScratch1, gpr_w(i.s0)); \
1271 a->Lsr(rVixlScratch1, gpr_w(i.s1), rVixlScratch1); \
1272 /* Negate the bits, including bit 0 to match X64. */ \
1273 a->Mvn(rVixlScratch1, rVixlScratch1); \
1274 /* Copy bit zero into bit 29 of the flags. */ \
1275 a->bfm(rAsm, rVixlScratch1, 35, 0); \
1276 /* Copy the flags back to the system register. */ \
1277 a->Msr(NZCV, rAsm); \
1278 /* Copy the result to the destination. */ \
1279 a->Mov(gpr_w(i.d), rVixlScratch0); \
1280 /* Restore VIXL's scratch regs. */ \
1281 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
1286 Y(shl, Lsl, X, 64, xzr)
1288 #undef Y
1290 #define Y(vasm_opc, arm_opc, gpr_w, zr) \
1291 void Vgen::emit(const vasm_opc& i) { \
1292 if (!i.fl) { \
1293 /* Just perform the shift. */ \
1294 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
1295 } else { \
1296 checkSF(i, StatusFlags::NotV); \
1297 if (!flagRequired(i.fl, StatusFlags::C)) { \
1298 /* Perform the shift and set N and Z. */ \
1299 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
1300 a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
1301 } else { \
1302 /* Use VIXL's macroassembler scratch regs. */ \
1303 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
1304 /* Perform the shift using temp and set N and Z. */ \
1305 a->arm_opc(rVixlScratch0, gpr_w(i.s1), i.s0.l()); \
1306 a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
1307 /* Read the flags into a temp. */ \
1308 a->Mrs(rAsm, NZCV); \
1309 /* Reshift right leaving the last bit as bit 0. */ \
1310 a->Lsr(rVixlScratch1, gpr_w(i.s1), i.s0.l() - 1); \
1311 /* Negate the bits, including bit 0 to match X64. */ \
1312 a->Mvn(rVixlScratch1, rVixlScratch1); \
1313 /* Copy bit zero into bit 29 of the flags. */ \
1314 a->bfm(rAsm, rVixlScratch1, 35, 0); \
1315 /* Copy the flags back to the system register. */ \
1316 a->Msr(NZCV, rAsm); \
1317 /* Copy the result to the destination. */ \
1318 a->Mov(gpr_w(i.d), rVixlScratch0); \
1319 /* Restore VIXL's scratch regs. */ \
1320 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
1325 Y(sarqi, Asr, X, xzr)
1326 Y(shrli, Lsr, W, wzr)
1327 Y(shrqi, Lsr, X, xzr)
1329 #undef Y
1331 #define Y(vasm_opc, arm_opc, gpr_w, sz, zr) \
1332 void Vgen::emit(const vasm_opc& i) { \
1333 if (!i.fl) { \
1334 /* Just perform the shift. */ \
1335 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
1336 } else { \
1337 checkSF(i, StatusFlags::NotV); \
1338 if (!flagRequired(i.fl, StatusFlags::C)) { \
1339 /* Perform the shift and set N and Z. */ \
1340 a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
1341 a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
1342 } else { \
1343 /* Use VIXL's macroassembler scratch regs. */ \
1344 a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
1345 /* Perform the shift using temp and set N and Z. */ \
1346 a->arm_opc(rVixlScratch0, gpr_w(i.s1), i.s0.l()); \
1347 a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
1348 /* Read the flags into a temp. */ \
1349 a->Mrs(rAsm, NZCV); \
1350 /* Reshift right leaving the last bit as bit 0. */ \
1351 a->Lsr(rVixlScratch1, gpr_w(i.s1), sz - i.s0.l()); \
1352 /* Negate the bits, including bit 0 to match X64. */ \
1353 a->Mvn(rVixlScratch1, rVixlScratch1); \
1354 /* Copy bit zero into bit 29 of the flags. */ \
1355 a->bfm(rAsm, rVixlScratch1, 35, 0); \
1356 /* Copy the flags back to the system register. */ \
1357 a->Msr(NZCV, rAsm); \
1358 /* Copy the result to the destination. */ \
1359 a->Mov(gpr_w(i.d), rVixlScratch0); \
1360 /* Restore VIXL's scratch regs. */ \
1361 a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
1366 Y(shlli, Lsl, W, 32, wzr)
1367 Y(shlqi, Lsl, X, 64, xzr)
1369 #undef Y
1371 ///////////////////////////////////////////////////////////////////////////////
1373 void Vgen::emit(const popp& i) {
1374 a->Ldp(X(i.d0), X(i.d1), MemOperand(sp, 16, PostIndex));
1377 void Vgen::emit(const pushp& i) {
1378 a->Stp(X(i.s1), X(i.s0), MemOperand(sp, -16, PreIndex));
1381 ///////////////////////////////////////////////////////////////////////////////
1383 template<typename Lower>
1384 void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
1385 vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
1388 template <typename Inst>
1389 void lower(const VLS& /*env*/, Inst& /*inst*/, Vlabel /*b*/, size_t /*i*/) {}
1391 ///////////////////////////////////////////////////////////////////////////////
1394 * TODO: Using load size (ldr[bh]?), apply scaled address if 'disp' is unsigned
1396 void lowerVptr(Vptr& p, Vout& v) {
1397 enum {
1398 BASE = 1,
1399 INDEX = 2,
1400 DISP = 4
1403 uint8_t mode = (((p.base.isValid() & 0x1) << 0) |
1404 ((p.index.isValid() & 0x1) << 1) |
1405 (((p.disp != 0) & 0x1) << 2));
1406 switch (mode) {
1407 case BASE:
1408 case BASE | INDEX:
1409 // ldr/str allow [base] and [base, index], nothing to lower.
1410 break;
1412 case INDEX:
1413 // Not supported, convert to [base].
1414 if (p.scale > 1) {
1415 auto t = v.makeReg();
1416 v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
1417 p.base = t;
1418 } else {
1419 p.base = p.index;
1421 p.index = Vreg{};
1422 p.scale = 1;
1423 break;
1425 case BASE | DISP: {
1426 // ldr/str allow [base, #imm], where #imm is [-256 .. 255].
1427 if (p.disp >= -256 && p.disp <= 255)
1428 break;
1430 // #imm is out of range, convert to [base, index]
1431 auto index = v.makeReg();
1432 v << ldimmq{Immed64(p.disp), index};
1433 p.index = index;
1434 p.scale = 1;
1435 p.disp = 0;
1436 break;
1439 case DISP: {
1440 // Not supported, convert to [base].
1441 auto base = v.makeReg();
1442 v << ldimmq{Immed64(p.disp), base};
1443 p.base = base;
1444 p.index = Vreg{};
1445 p.scale = 1;
1446 p.disp = 0;
1447 break;
1450 case INDEX | DISP:
1451 // Not supported, convert to [base, #imm] or [base, index].
1452 if (p.scale > 1) {
1453 auto t = v.makeReg();
1454 v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
1455 p.base = t;
1456 } else {
1457 p.base = p.index;
1459 if (p.disp >= -256 && p.disp <= 255) {
1460 p.index = Vreg{};
1461 p.scale = 1;
1462 } else {
1463 auto index = v.makeReg();
1464 v << ldimmq{Immed64(p.disp), index};
1465 p.index = index;
1466 p.scale = 1;
1467 p.disp = 0;
1469 break;
1471 case BASE | INDEX | DISP: {
1472 // Not supported, convert to [base, index].
1473 auto index = v.makeReg();
1474 if (p.scale > 1) {
1475 auto t = v.makeReg();
1476 v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
1477 v << addqi{p.disp, t, index, v.makeReg()};
1478 } else {
1479 v << addqi{p.disp, p.index, index, v.makeReg()};
1481 p.index = index;
1482 p.scale = 1;
1483 p.disp = 0;
1484 break;
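// Example (illustrative, hypothetical Vregs): a fully general Vptr such as
// base[index * 8 + disp] is rewritten by the case above into
//
//   shlqi{3, index, t, v.makeReg()};      // t = index << 3
//   addqi{disp, t, index2, v.makeReg()};  // index2 = t + disp
//
// leaving the Vptr as [base, index2] with scale 1 and disp 0, a form that
// ldr/str can address directly.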
1489 #define Y(vasm_opc, m) \
1490 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1491 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1492 lowerVptr(i.m, v); \
1493 v << i; \
1494 }); \
1497 Y(decqmlock, m)
1498 Y(lea, s)
1499 Y(load, s)
1500 Y(loadb, s)
1501 Y(loadl, s)
1502 Y(loadsd, s)
1503 Y(loadtqb, s)
1504 Y(loadtql, s)
1505 Y(loadups, s)
1506 Y(loadw, s)
1507 Y(loadzbl, s)
1508 Y(loadzbq, s)
1509 Y(loadzlq, s)
1510 Y(store, d)
1511 Y(storeb, m)
1512 Y(storel, m)
1513 Y(storesd, m)
1514 Y(storeups, m)
1515 Y(storew, m)
1517 #undef Y
1519 #define Y(vasm_opc, lower_opc, load_opc, store_opc, arg, m) \
1520 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1521 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1522 lowerVptr(i.m, v); \
1523 auto r0 = v.makeReg(), r1 = v.makeReg(); \
1524 v << load_opc{i.m, r0}; \
1525 v << lower_opc{arg, r0, r1, i.sf, i.fl}; \
1526 v << store_opc{r1, i.m}; \
1527 }); \
1530 Y(addlim, addli, loadl, storel, i.s0, m)
1531 Y(addlm, addl, loadl, storel, i.s0, m)
1532 Y(addwm, addl, loadw, storew, Reg32(i.s0), m)
1533 Y(addqim, addqi, load, store, i.s0, m)
1534 Y(andbim, andbi, loadb, storeb, i.s, m)
1535 Y(orbim, orqi, loadb, storeb, i.s0, m)
1536 Y(orqim, orqi, load, store, i.s0, m)
1537 Y(orwim, orqi, loadw, storew, i.s0, m)
1538 Y(orlim, orqi, loadl, storel, i.s0, m)
1540 #undef Y
1542 #define Y(vasm_opc, lower_opc, movs_opc) \
1543 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1544 if (!i.fl || (i.fl & static_cast<Vflags>(StatusFlags::NV))) { \
1545 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1546 auto r0 = v.makeReg(), r1 = v.makeReg(); \
1547 v << movs_opc{i.s0, r0}; \
1548 v << movs_opc{i.s1, r1}; \
1549 v << lower_opc{r0, r1, i.sf, i.fl}; \
1550 }); \
1554 Y(cmpb, cmpl, movsbl)
1555 Y(cmpw, cmpl, movswl)
1557 #undef Y
1559 #define Y(vasm_opc, lower_opc, movs_opc) \
1560 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1561 if (!i.fl || (i.fl & static_cast<Vflags>(StatusFlags::NV))) { \
1562 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1563 auto r = v.makeReg(); \
1564 v << movs_opc{i.s1, r}; \
1565 v << lower_opc{i.s0, r, i.sf, i.fl}; \
1566 }); \
1570 Y(cmpbi, cmpli, movsbl)
1571 Y(cmpwi, cmpli, movswl)
1573 #undef Y
1575 #define Y(vasm_opc, lower_opc, load_opc) \
1576 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1577 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1578 lowerVptr(i.s1, v); \
1579 auto r = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
1580 v << load_opc{i.s1, r}; \
1581 v << lower_opc{i.s0, r, i.sf, i.fl}; \
1582 }); \
1585 Y(cmpbim, cmpbi, loadb)
1586 Y(cmplim, cmpli, loadl)
1587 Y(cmpbm, cmpb, loadb)
1588 Y(cmpwm, cmpw, loadb)
1589 Y(cmplm, cmpl, loadl)
1590 Y(cmpqim, cmpqi, load)
1591 Y(cmpqm, cmpq, load)
1592 Y(cmpwim, cmpwi, loadw)
1593 Y(testbim, testli, loadb)
1594 Y(testlim, testli, loadl)
1595 Y(testqim, testqi, load)
1596 Y(testbm, testb, loadb)
1597 Y(testwm, testw, loadw)
1598 Y(testlm, testl, loadl)
1599 Y(testqm, testq, load)
1600 Y(testwim, testli, loadw)
1602 #undef Y
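// For example (illustrative), cmplim{imm, m, sf} is expanded by the macro
// above into roughly the following, after lowerVptr() has rewritten m:
//
//   loadl{m, r};               // r = 32-bit load from m
//   cmpli{imm, r, sf, fl};     // compare r against the immediate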
1604 void lower(const VLS& e, cvtsi2sdm& i, Vlabel b, size_t z) {
1605 lower_impl(e.unit, b, z, [&] (Vout& v) {
1606 lowerVptr(i.s, v);
1607 auto r = v.makeReg();
1608 v << load{i.s, r};
1609 v << cvtsi2sd{r, i.d};
1613 #define Y(vasm_opc, lower_opc, load_opc, store_opc, m) \
1614 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1615 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1616 lowerVptr(i.m, v); \
1617 auto r0 = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
1618 auto r1 = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
1619 v << load_opc{i.m, r0}; \
1620 v << lower_opc{r0, r1, i.sf, i.fl}; \
1621 v << store_opc{r1, i.m}; \
1622 }); \
1625 Y(declm, decl, loadl, storel, m)
1626 Y(decqm, decq, load, store, m)
1627 Y(inclm, incl, loadl, storel, m)
1628 Y(incqm, incq, load, store, m)
1629 Y(incwm, incw, loadw, storew, m)
1631 #undef Y
1633 void lower(const VLS& e, cvttsd2siq& i, Vlabel b, size_t idx) {
1634 lower_impl(e.unit, b, idx, [&] (Vout& v) {
1635 // Clear FPSR IOC flag.
1636 auto const tmp1 = v.makeReg();
1637 auto const tmp2 = v.makeReg();
1638 v << mrs{FPSR, tmp1};
1639 v << andqi{~0x01, tmp1, tmp2, v.makeReg()};
1640 v << msr{tmp2, FPSR};
1642 // Load error value.
1643 auto const err = v.makeReg();
1644 v << ldimmq{0x8000000000000000, err};
1646 // Do ARM64's double to signed int64 conversion.
1647 auto const res = v.makeReg();
1648 v << fcvtzs{i.s, res};
1650 // Check if there was a conversion error.
1651 auto const fpsr = v.makeReg();
1652 auto const sf = v.makeReg();
1653 v << mrs{FPSR, fpsr};
1654 v << testqi{1, fpsr, sf};
1656 // Move converted value or error.
1657 v << cmovq{CC_NZ, sf, res, err, i.d};
1661 void lower(const VLS& e, callm& i, Vlabel b, size_t z) {
1662 lower_impl(e.unit, b, z, [&] (Vout& v) {
1663 lowerVptr(i.target, v);
1665 auto const scratch = v.makeReg();
1667 // Load the target from memory and then call it.
1668 v << load{i.target, scratch};
1669 v << callr{scratch, i.args};
1673 void lower(const VLS& e, jmpm& i, Vlabel b, size_t z) {
1674 lower_impl(e.unit, b, z, [&] (Vout& v) {
1675 lowerVptr(i.target, v);
1677 auto const scratch = v.makeReg();
1679 v << load{i.target, scratch};
1680 v << jmpr{scratch, i.args};
1684 ///////////////////////////////////////////////////////////////////////////////
1686 void lower(const VLS& e, stublogue& /*i*/, Vlabel b, size_t z) {
1687 lower_impl(e.unit, b, z, [&] (Vout& v) {
1688 // Push both the LR and FP regardless of i.saveframe to align SP.
1689 v << pushp{rlr(), rvmfp()};
1693 void lower(const VLS& e, unstublogue& /*i*/, Vlabel b, size_t z) {
1694 lower_impl(e.unit, b, z, [&] (Vout& v) {
1695 // Pop LR and remove FP from the stack.
1696 v << popp{PhysReg(rAsm), rlr()};
1700 void lower(const VLS& e, stubret& i, Vlabel b, size_t z) {
1701 lower_impl(e.unit, b, z, [&] (Vout& v) {
1702 // Pop LR and (optionally) FP.
1703 if (i.saveframe) {
1704 v << popp{rvmfp(), rlr()};
1705 } else {
1706 v << popp{PhysReg(rAsm), rlr()};
1709 v << ret{i.args};
1713 void lower(const VLS& e, tailcallstub& i, Vlabel b, size_t z) {
1714 lower_impl(e.unit, b, z, [&] (Vout& v) {
1715 // Restore LR from native stack and adjust SP.
1716 v << popp{PhysReg(rAsm), rlr()};
1718 // Then directly jump to the target.
1719 v << jmpi{i.target, i.args};
1723 void lower(const VLS& e, tailcallstubr& i, Vlabel b, size_t z) {
1724 lower_impl(e.unit, b, z, [&] (Vout& v) {
1725 // Restore LR from native stack and adjust SP.
1726 v << popp{PhysReg(rAsm), rlr()};
1728 v << jmpr{i.target, i.args};
1732 void lower(const VLS& e, stubunwind& i, Vlabel b, size_t z) {
1733 lower_impl(e.unit, b, z, [&] (Vout& v) {
1734 // Pop the call frame.
1735 v << popp{PhysReg(rAsm), i.d};
1739 void lower(const VLS& e, stubtophp& /*i*/, Vlabel b, size_t z) {
1740 lower_impl(e.unit, b, z, [&] (Vout& v) {
1741 // Pop the call frame
1742 v << lea{rsp()[16], rsp()};
1746 void lower(const VLS& e, loadstubret& i, Vlabel b, size_t z) {
1747 lower_impl(e.unit, b, z, [&] (Vout& v) {
1748 // Load the LR to the destination.
1749 v << load{rsp()[AROFF(m_savedRip)], i.d};
1753 ///////////////////////////////////////////////////////////////////////////////
1755 void lower(const VLS& e, phplogue& i, Vlabel b, size_t z) {
1756 lower_impl(e.unit, b, z, [&] (Vout& v) {
1757 v << store{rlr(), i.fp[AROFF(m_savedRip)]};
1761 ///////////////////////////////////////////////////////////////////////////////
1763 void lower(const VLS& e, resumetc& i, Vlabel b, size_t z) {
1764 lower_impl(e.unit, b, z, [&] (Vout& v) {
1765 // Call the translation target.
1766 v << callr{i.target, i.args};
1768 // After returning to the translation, jump directly to the exit.
1769 v << jmpi{i.exittc};
1773 ///////////////////////////////////////////////////////////////////////////////
1775 void lower(const VLS& e, popm& i, Vlabel b, size_t z) {
1776 lower_impl(e.unit, b, z, [&] (Vout& v) {
1777 auto r = v.makeReg();
1778 v << pop{r};
1779 lowerVptr(i.d, v);
1780 v << store{r, i.d};
1784 void lower(const VLS& e, poppm& i, Vlabel b, size_t z) {
1785 lower_impl(e.unit, b, z, [&] (Vout& v) {
1786 auto r0 = v.makeReg();
1787 auto r1 = v.makeReg();
1788 v << popp{r0, r1};
1789 lowerVptr(i.d0, v);
1790 lowerVptr(i.d1, v);
1791 v << store{r0, i.d0};
1792 v << store{r1, i.d1};
1796 void lower(const VLS& e, pushm& i, Vlabel b, size_t z) {
1797 lower_impl(e.unit, b, z, [&] (Vout& v) {
1798 auto r = v.makeReg();
1799 lowerVptr(i.s, v);
1800 v << load{i.s, r};
1801 v << push{r};
1805 void lower(const VLS& e, pushpm& i, Vlabel b, size_t z) {
1806 lower_impl(e.unit, b, z, [&] (Vout& v) {
1807 auto r0 = v.makeReg();
1808 auto r1 = v.makeReg();
1809 lowerVptr(i.s0, v);
1810 lowerVptr(i.s1, v);
1811 v << load{i.s0, r0};
1812 v << load{i.s1, r1};
1813 v << pushp{r0, r1};
1817 template<typename movz>
1818 void lower_movz(const VLS& e, movz& i, Vlabel b, size_t z) {
1819 lower_impl(e.unit, b, z, [&] (Vout& v) {
1820 v << copy{i.s, i.d};
1824 void lower(const VLS& e, movzbw& i, Vlabel b, size_t z) {
1825 lower_movz(e, i, b, z);
1828 void lower(const VLS& e, movzbl& i, Vlabel b, size_t z) {
1829 lower_movz(e, i, b, z);
1832 void lower(const VLS& e, movzwl& i, Vlabel b, size_t z) {
1833 lower_movz(e, i, b, z);
1836 void lower(const VLS& e, movtdb& i, Vlabel b, size_t z) {
1837 lower_impl(e.unit, b, z, [&] (Vout& v) {
1838 auto d = v.makeReg();
1839 v << copy{i.s, d};
1840 v << movtqb{d, i.d};
1844 void lower(const VLS& e, movtdq& i, Vlabel b, size_t z) {
1845 lower_impl(e.unit, b, z, [&] (Vout& v) {
1846 v << copy{i.s, i.d};
1850 #define Y(vasm_opc, lower_opc, load_opc, imm, zr, sz) \
1851 void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
1852 lower_impl(e.unit, b, z, [&] (Vout& v) { \
1853 lowerVptr(i.m, v); \
1854 if (imm.sz() == 0u) { \
1855 v << lower_opc{PhysReg(vixl::zr), i.m}; \
1856 } else { \
1857 auto r = v.makeReg(); \
1858 v << load_opc{imm, r}; \
1859 v << lower_opc{r, i.m}; \
1861 }); \
1864 Y(storebi, storeb, ldimmb, i.s, wzr, b)
1865 Y(storewi, storew, ldimmw, i.s, wzr, w)
1866 Y(storeli, storel, ldimml, i.s, wzr, l)
1867 // storeqi only supports 32-bit immediates
1868 Y(storeqi, store, ldimmq, Immed64(i.s.l()), wzr, q)
1870 #undef Y
1872 void lower(const VLS& e, cloadq& i, Vlabel b, size_t z) {
1873 lower_impl(e.unit, b, z, [&] (Vout& v) {
1874 auto const scratch = v.makeReg();
1876 lowerVptr(i.t, v);
1878 v << load{i.t, scratch};
1879 v << cmovq{i.cc, i.sf, i.f, scratch, i.d};
1883 void lower(const VLS& e, loadqp& i, Vlabel b, size_t z) {
1884 lower_impl(e.unit, b, z, [&] (Vout& v) {
1885 auto const scratch = v.makeReg();
1887 v << leap{i.s, scratch};
1888 v << load{scratch[0], i.d};
1892 void lower(const VLS& e, loadqd& i, Vlabel b, size_t z) {
1893 lower_impl(e.unit, b, z, [&] (Vout& v) {
1894 auto const scratch = v.makeReg();
1896 v << lead{i.s.getRaw(), scratch};
1897 v << load{scratch[0], i.d};
1901 ///////////////////////////////////////////////////////////////////////////////
1903 void lowerForARM(Vunit& unit) {
1904 vasm_lower(unit, [&] (const VLS& env, Vinstr& inst, Vlabel b, size_t i) {
1905 switch (inst.op) {
1906 #define O(name, ...) \
1907 case Vinstr::name: \
1908 lower(env, inst.name##_, b, i); \
1909 break;
1911 VASM_OPCODES
1912 #undef O
1917 ///////////////////////////////////////////////////////////////////////////////
1920 void optimizeARM(Vunit& unit, const Abi& abi, bool regalloc) {
1921 Timer timer(Timer::vasm_optimize);
1923 removeTrivialNops(unit);
1924 optimizePhis(unit);
1925 fuseBranches(unit);
1926 optimizeJmps(unit);
1927 optimizeExits(unit);
1929 assertx(checkWidths(unit));
1931 simplify(unit);
1933 annotateSFUses(unit);
1934 lowerForARM(unit);
1936 simplify(unit);
1938 if (!unit.constToReg.empty()) {
1939 foldImms<arm::ImmFolder>(unit);
1941 reuseImmq(unit);
1943 optimizeCopies(unit, abi);
1945 annotateSFUses(unit);
1946 if (unit.needsRegAlloc()) {
1947 removeDeadCode(unit);
1948 if (regalloc) {
1949 if (RuntimeOption::EvalUseGraphColor &&
1950 unit.context &&
1951 (unit.context->kind == TransKind::Optimize ||
1952 unit.context->kind == TransKind::OptPrologue)) {
1953 allocateRegistersWithGraphColor(unit, abi);
1954 } else {
1955 allocateRegistersWithXLS(unit, abi);
1959 if (unit.blocks.size() > 1) {
1960 optimizeJmps(unit);
1964 void emitARM(Vunit& unit, Vtext& text, CGMeta& fixups,
1965 AsmInfo* asmInfo) {
1966 vasm_emit<Vgen>(unit, text, fixups, asmInfo);
1969 ///////////////////////////////////////////////////////////////////////////////