/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_generic_simd256.h"
#include "host_amd64_maddf.h"
#include "host_generic_maddf.h"
#include "host_amd64_defs.h"
/*---------------------------------------------------------*/
/*--- x87/SSE control word stuff                         ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged at exit.
*/

#define DEFAULT_FPUCW 0x027F

#define DEFAULT_MXCSR 0x1F80

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */
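
/* For reference (an explanatory note, not used by the code below):
   0x027F sets the x87 control word with all six exception mask bits
   (bits 0..5) set, precision control = 10b (53-bit mantissa) and
   rounding control = 00b (round to nearest).  Likewise 0x1F80 sets
   the six MXCSR exception mask bits (bits 7..12) and leaves its
   rounding-control field (bits 13..14) at 00b, round to nearest. */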
/*---------------------------------------------------------*/
/*--- misc helpers                                       ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-amd64/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}
/*---------------------------------------------------------*/
/*--- ISelEnv                                            ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register is associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

     - vregmap   holds the primary register for the IRTemp.
     - vregmapHI is only used for 128-bit integer-typed
       IRTemps.  It holds the identity of a second
       64-bit virtual HReg, which holds the high half
       of the value.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - A Bool for indicating whether we may generate chain-me
     instructions for control flow transfers, or whether we must use
     XAssisted.

   - The maximum guest address of any guest insn in this block.
     Actually, the address of the highest-addressed byte from any insn
     in this block.  Is set at the start and does not change.  This is
     used for detecting jumps which are definitely forward-edges from
     this block, and therefore can be made (chained) to the fast entry
     point of the destination, thereby avoiding the destination's
     event check.

   Note, this is all host-independent.  (JRS 20050201: well, kinda
   ... not completely.  Compare with ISelEnv for X86.)
*/

typedef
   struct {
      /* Constant -- are set at the start and do not change. */
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      UInt         hwcaps;

      Bool         chainingAllowed;
      Addr64       max_ga;

      /* These are modified as we go along. */
      HInstrArray* code;
      Int          vreg_ctr;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
                               ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, AMD64Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppAMD64Instr(instr, True);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}
/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                         ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.  You should not
   call the _wrk version directly.
*/
static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );

static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );

static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );

static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );

static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );

static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );
static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );

static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64CondCode iselCondCode_C     ( ISelEnv* env, const IRExpr* e );

static HReg          iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselCondCode_R     ( ISelEnv* env, const IRExpr* e );

static HReg          iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselDblExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselFltExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselVecExpr     ( ISelEnv* env, const IRExpr* e );

static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );
static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );
/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                 ---*/
/*---------------------------------------------------------*/

static Bool sane_AMode ( AMD64AMode* am )
{
   switch (am->tag) {
      case Aam_IR:
         return
            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
                    && (hregIsVirtual(am->Aam.IR.reg)
                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
      case Aam_IRRS:
         return
            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.base)
                    && hregClass(am->Aam.IRRS.index) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.index) );
      default:
        vpanic("sane_AMode: unknown amd64 amode tag");
   }
}

/* Can the lower 32 bits be signedly widened to produce the whole
   64-bit value?  In other words, are the top 33 bits either all 0 or
   all 1 ? */
static Bool fitsIn32Bits ( ULong x )
{
   Long y1;
   y1 = x << 32;
   y1 >>=/*s*/ 32;
   return toBool(x == y1);
}
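
/* For example (an illustration only, not used below):
   fitsIn32Bits(0xFFFFFFFF80000000ULL) is True, since sign-extending
   the low 32 bits (0x80000000) reproduces the full value, whereas
   fitsIn32Bits(0x0000000080000000ULL) is False, because widening
   0x80000000 signedly gives 0xFFFFFFFF80000000. */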
/* Is this a 64-bit zero expression? */
static Bool isZeroU64 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}

static Bool isZeroU32 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

/* Are both args atoms and the same?  This is a copy of eqIRAtom
   that omits the assertions that the args are indeed atoms. */
static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
   return False;
}

/* Make an int reg-reg move. */
static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt64);
   vassert(hregClass(dst) == HRcInt64);
   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
}

/* Make a vector (128 bit) reg-reg move. */
static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
}
/* Advance/retreat %rsp by n. */
static void add_to_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

static void sub_from_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

/* Push a 64-bit constant on the stack. */
static void push_uimm64( ISelEnv* env, ULong uimm64 )
{
   /* If uimm64 can be expressed as the sign extension of its
      lower 32 bits, we can do it the easy way. */
   Long simm64 = (Long)uimm64;
   if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
   } else {
      HReg tmp = newVRegI(env);
      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
   }
}
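
/* For example (illustration only): push_uimm64(env, ~0ULL) can take
   the single-instruction path, since pushing the 32-bit immediate
   0xFFFFFFFF sign-extends back to the full value, whereas
   push_uimm64(env, 0x80000000ULL) cannot, and goes via a temporary
   register instead. */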
/* Used only in doHelperCall.  If possible, produce a single
   instruction which computes 'e' into 'dst'.  If not possible, return
   NULL. */

static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
                                                    HReg     dst,
                                                    IRExpr*  e )
{
   /* Per comments in doHelperCall below, appearance of
      Iex_VECRET implies ill-formed IR. */
   vassert(e->tag != Iex_VECRET);

   /* In this case we give out a copy of the BaseBlock pointer. */
   if (UNLIKELY(e->tag == Iex_GSPTR)) {
      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
   }

   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);

   if (e->tag == Iex_Const) {
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
         return AMD64Instr_Alu64R(
                   Aalu_MOV,
                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
                   dst
                );
      } else {
         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
      }
   }

   if (e->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      return mk_iMOVsd_RR(src, dst);
   }

   if (e->tag == Iex_Get) {
      vassert(e->Iex.Get.ty == Ity_I64);
      return AMD64Instr_Alu64R(
                Aalu_MOV,
                AMD64RMI_Mem(
                   AMD64AMode_IR(e->Iex.Get.offset,
                                 hregAMD64_RBP())),
                dst);
   }

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_32Uto64
       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
      return AMD64Instr_MovxLQ(False, src, dst);
   }

   if (0) { ppIRExpr(e); vex_printf("\n"); }

   return NULL;
}
/* Do a complete function call.  |guard| is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional.  |retloc| is set to indicate where the
   return value is after the call.  The caller (of this fn) must
   generate code to add |stackAdjustAfterCall| to the stack pointer
   after the call is done. */

static
void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
                    /*OUT*/RetLoc* retloc,
                    ISelEnv* env,
                    IRExpr* guard,
                    IRCallee* cee, IRType retTy, IRExpr** args )
{
   AMD64CondCode cc;
   HReg          argregs[6];
   HReg          tmpregs[6];
   AMD64Instr*   fastinstrs[6];
   UInt          n_args, i;

   /* Set default returns.  We'll update them later if needed. */
   *stackAdjustAfterCall = 0;
   *retloc               = mk_RetLoc_INVALID();

   /* These are used for cross-checking that IR-level constraints on
      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
   UInt nVECRETs = 0;
   UInt nGSPTRs  = 0;

   /* Marshal args for a call and do the call.

      This function only deals with a tiny set of possibilities, which
      cover all helpers in practice.  The restrictions are that only
      arguments in registers are supported, hence only 6x64 integer
      bits in total can be passed.  In fact the only supported arg
      type is I64.

      The return type can be I{64,32,16,8} or V{128,256}.  In the
      latter two cases, it is expected that |args| will contain the
      special node IRExpr_VECRET(), in which case this routine
      generates code to allocate space on the stack for the vector
      return value.  Since we are not passing any scalars on the
      stack, it is enough to preallocate the return space before
      marshalling any arguments, in this case.

      |args| may also contain IRExpr_GSPTR(), in which case the
      value in %rbp is passed as the corresponding argument.

      Generating code which is both efficient and correct when
      parameters are to be passed in registers is difficult, for the
      reasons elaborated in detail in comments attached to
      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
      of the method described in those comments.

      The problem is split into two cases: the fast scheme and the
      slow scheme.  In the fast scheme, arguments are computed
      directly into the target (real) registers.  This is only safe
      when we can be sure that computation of each argument will not
      trash any real registers set by computation of any other
      argument.

      In the slow scheme, all args are first computed into vregs, and
      once they are all done, they are moved to the relevant real
      regs.  This always gives correct code, but it also gives a bunch
      of vreg-to-rreg moves which are usually redundant but are hard
      for the register allocator to get rid of.

      To decide which scheme to use, all argument expressions are
      first examined.  If they are all so simple that it is clear they
      will be evaluated without use of any fixed registers, use the
      fast scheme, else use the slow scheme.  Note also that only
      unconditional calls may use the fast scheme, since having to
      compute a condition expression could itself trash real
      registers.  Note that for simplicity, in the case where
      IRExpr_VECRET() is present, we use the slow scheme.  This is
      motivated by the desire to avoid any possible complexity
      w.r.t. nested calls.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this insn
      selector works.  Currently just the following 3 are regarded as
      safe -- hopefully they cover the majority of arguments in
      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */

   /* Note that the cee->regparms field is meaningless on AMD64 host
      (since there is only one calling convention) and so we always
      ignore it. */
   n_args = 0;
   for (i = 0; args[i]; i++)
      n_args++;

   if (n_args > 6)
      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");

   argregs[0] = hregAMD64_RDI();
   argregs[1] = hregAMD64_RSI();
   argregs[2] = hregAMD64_RDX();
   argregs[3] = hregAMD64_RCX();
   argregs[4] = hregAMD64_R8();
   argregs[5] = hregAMD64_R9();

   tmpregs[0] = tmpregs[1] = tmpregs[2] =
   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;

   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;

   /* First decide which scheme (slow or fast) is to be used.  First
      assume the fast scheme, and select slow if any contraindications
      (wow) appear. */

   /* We'll need space on the stack for the return value.  Avoid
      possible complications with nested calls by using the slow
      scheme. */
   if (retTy == Ity_V128 || retTy == Ity_V256)
      goto slowscheme;

   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional */
      } else {
         /* Not manifestly unconditional -- be conservative. */
         goto slowscheme;
      }
   }

   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
      use the slow scheme.  Because this is tentative, we can't call
      addInstr (that is, commit to) any instructions until we've
      handled all the arguments.  So park the resulting instructions
      in a buffer and emit that if we're successful. */

   /* FAST SCHEME */
   /* In this loop, we process args that can be computed into the
      destination (real) register with a single instruction, without
      using any fixed regs.  That also includes IRExpr_GSPTR(), but
      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
      never see IRExpr_VECRET() at this point, since the return-type
      check above should ensure all those cases use the slow scheme
      instead. */
   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
      }
      fastinstrs[i]
         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
      if (fastinstrs[i] == NULL)
         goto slowscheme;
   }

   /* Looks like we're in luck.  Emit the accumulated instructions and
      move on to doing the call itself. */
   for (i = 0; i < n_args; i++)
      addInstr(env, fastinstrs[i]);

   /* Fast scheme only applies for unconditional calls.  Hence: */
   cc = Acc_ALWAYS;

   goto handle_call;

   /* SLOW SCHEME; move via temporaries */
  slowscheme:
   {}
#  if 0 /* debug only */
   if (n_args > 0) {for (i = 0; args[i]; i++) {
   ppIRExpr(args[i]); vex_printf(" "); }
   vex_printf("\n");}
#  endif

   /* If we have a vector return type, allocate a place for it on the
      stack and record its address. */
   HReg r_vecRetAddr = INVALID_HREG;
   if (retTy == Ity_V128) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 16);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }
   else if (retTy == Ity_V256) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 32);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }

   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (UNLIKELY(arg->tag == Iex_GSPTR)) {
         tmpregs[i] = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
         nGSPTRs++;
      }
      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
         /* We stashed the address of the return slot earlier, so just
            retrieve it now. */
         vassert(!hregIsInvalid(r_vecRetAddr));
         tmpregs[i] = r_vecRetAddr;
         nVECRETs++;
      }
      else {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
         tmpregs[i] = iselIntExpr_R(env, args[i]);
      }
   }

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Acc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode_C( env, guard );
      }
   }

   /* Move the args to their final destinations. */
   for (i = 0; i < n_args; i++) {
      /* None of these insns, including any spill code that might
         be generated, may alter the condition codes. */
      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
   }


   /* Do final checks, set the return values, and generate the call
      instruction proper. */
  handle_call:

   if (retTy == Ity_V128 || retTy == Ity_V256) {
      vassert(nVECRETs == 1);
   } else {
      vassert(nVECRETs == 0);
   }

   vassert(nGSPTRs == 0 || nGSPTRs == 1);

   vassert(*stackAdjustAfterCall == 0);
   vassert(is_RetLoc_INVALID(*retloc));
   switch (retTy) {
      case Ity_INVALID:
         /* Function doesn't return a value. */
         *retloc = mk_RetLoc_simple(RLPri_None);
         break;
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
         *retloc = mk_RetLoc_simple(RLPri_Int);
         break;
      case Ity_V128:
         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
         *stackAdjustAfterCall = 16;
         break;
      case Ity_V256:
         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
         *stackAdjustAfterCall = 32;
         break;
      default:
         /* IR can denote other possible return types, but we don't
            handle those here. */
         vassert(0);
   }

   /* Finally, generate the call itself.  This needs the *retloc value
      set in the switch above, which is why it's at the end. */
   addInstr(env,
            AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
}
/* Given a guest-state array descriptor, an index expression and a
   bias, generate an AMD64AMode holding the relevant guest state
   offset. */

static
AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                  IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;

   /* Throw out any cases not generated by an amd64 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-amd64-guest on amd64 host. */

   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
      vpanic("genGuestArrayOffset(amd64 host)");

   /* Compute off into a reg, %off.  Then return:

        movq %off, %tmp
        addq $bias, %tmp         (if bias != 0)
        andq $7, %tmp
        ... base(%rbp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      /* Make sure the bias is sane, in the sense that there are
         no significant bits above bit 30 in it. */
      vassert(-10000 < bias && bias < 10000);
      addInstr(env,
               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
   }
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
   vassert(elemSz == 1 || elemSz == 8);
   return
      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
                       elemSz==8 ? 3 : 0);
}
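
/* Worked example (illustration only, not used by the code): for an
   8-element array of I64s at guest-state offset base, an index
   expression off and bias 1, the amode produced is effectively
   base(%rbp, %tmp, 8) where %tmp = (off + 1) & 7, i.e. the access is
   kept inside the 8-element window by wrapping modulo 8. */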
/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
static
void set_SSE_rounding_default ( ISelEnv* env )
{
   /* pushq $DEFAULT_MXCSR
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* movq $DEFAULT_FPUCW, -8(%rsp)
      fldcw -8(%rsp)
   */
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64M(
                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
   have the same rounding.
*/
static
void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
      both rounding bits == 0.  If that wasn't the case, we couldn't
      create a new rounding field simply by ORing the new value into
      place. */

   /* movq $3, %reg
      andq [[mode]], %reg  -- shouldn't be needed; paranoia
      shlq $13, %reg
      orq $DEFAULT_MXCSR, %reg
      pushq %reg
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   HReg        reg      = newVRegI(env);
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                   iselIntExpr_RMI(env, mode), reg));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
   addInstr(env, AMD64Instr_Alu64R(
                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());

   /* movq  %rrm, %rrm2
      andq  $3, %rrm2   -- shouldn't be needed; paranoia
      shlq  $10, %rrm2
      orq   $DEFAULT_FPUCW, %rrm2
      movq  %rrm2, -8(%rsp)
      fldcw -8(%rsp)
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
                                   AMD64RI_Reg(rrm2), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
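
/* Illustration (not part of the generated code): for IRRoundingMode
   value 2 (round towards +infinity), set_SSE_rounding_mode ORs
   2 << 13 = 0x4000 into DEFAULT_MXCSR, giving %mxcsr = 0x5F80, while
   set_FPU_rounding_mode ORs 2 << 10 = 0x800 into DEFAULT_FPUCW,
   giving %fpucw = 0x0A7F.  Both rely on the default control words
   having their rounding-control bits equal to zero. */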
/* Generate all-zeroes into a new vector register.
*/
static HReg generate_zeroes_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
   return dst;
}

/* Generate all-ones into a new vector register.
*/
static HReg generate_ones_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
   return dst;
}


/* Generate !src into a new vector register.  Amazing that there isn't
   a less crappy way to do this.
*/
static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
{
   HReg dst = generate_ones_V128(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
   return dst;
}


/* Expand the given byte into a 64-bit word, by cloning each bit
   8 times. */
static ULong bitmask8_to_bytemask64 ( UShort w8 )
{
   vassert(w8 == (w8 & 0xFF));
   ULong w64 = 0;
   Int i;
   for (i = 0; i < 8; i++) {
      if (w8 & (1<<i))
         w64 |= (0xFFULL << (8 * i));
   }
   return w64;
}
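
/* For instance (illustration only): bitmask8_to_bytemask64(0x81)
   yields 0xFF000000000000FFULL -- bits 0 and 7 of the input select
   bytes 0 and 7 of the result. */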
/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (64/32/16/8 bit)         ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 64, 32, 16 and 8-bit type.  All
   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
   expressions, the upper 32/48/56 bits are arbitrary, so you should
   mask or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt64);
   vassert(hregIsVirtual(r));
   return r;
}
/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
{
   MatchInfo mi;
   DECLARE_PATTERN(p_1Uto8_64to1);
   DECLARE_PATTERN(p_LDle8_then_8Uto64);
   DECLARE_PATTERN(p_LDle16_then_16Uto64);

   IRType ty = typeOfIRExpr(env->type_env,e);
   switch (ty) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
      default: vassert(0);
   }

   switch (e->tag) {

      /* --------- TEMP --------- */
      case Iex_RdTmp: {
         return lookupIRTemp(env, e->Iex.RdTmp.tmp);
      }

      /* --------- LOAD --------- */
      case Iex_Load: {
         HReg dst = newVRegI(env);
         AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

         /* We can't handle big-endian loads, nor load-linked. */
         if (e->Iex.Load.end != Iend_LE)
            goto irreducible;

         if (ty == Ity_I64) {
            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
                                            AMD64RMI_Mem(amode), dst) );
            return dst;
         }
         if (ty == Ity_I32) {
            addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
            return dst;
         }
         if (ty == Ity_I16) {
            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
         if (ty == Ity_I8) {
            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
         break;
      }
      /* --------- BINARY OP --------- */
      case Iex_Binop: {
         AMD64AluOp   aluOp;
         AMD64ShiftOp shOp;

         /* Pattern: Sub64(0,x) */
         /*     and: Sub32(0,x) */
         if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
             || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
            HReg dst = newVRegI(env);
            HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(reg,dst));
            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
            return dst;
         }

         /* Is it an addition or logical style op? */
         switch (e->Iex.Binop.op) {
            case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
               aluOp = Aalu_ADD; break;
            case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
               aluOp = Aalu_SUB; break;
            case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
               aluOp = Aalu_AND; break;
            case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
               aluOp = Aalu_OR; break;
            case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
               aluOp = Aalu_XOR; break;
            case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
               aluOp = Aalu_MUL; break;
            default:
               aluOp = Aalu_INVALID; break;
         }
         /* For commutative ops we assume any literal
            values are on the second operand. */
         if (aluOp != Aalu_INVALID) {
            HReg dst      = newVRegI(env);
            HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
            AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(reg,dst));
            addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
            return dst;
         }

         /* Perhaps a shift op? */
         switch (e->Iex.Binop.op) {
            case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
               shOp = Ash_SHL; break;
            case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
               shOp = Ash_SHR; break;
            case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
               shOp = Ash_SAR; break;
            default:
               shOp = Ash_INVALID; break;
         }
         if (shOp != Ash_INVALID) {
            HReg dst = newVRegI(env);

            /* regL = the value to be shifted */
            HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(regL,dst));

            /* Do any necessary widening for 16/8 bit operands.  Also decide
               on the final width at which the shift is to be done. */
            Bool shift64 = False;
            switch (e->Iex.Binop.op) {
               case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
                  shift64 = True;
                  break;
               case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
                  break;
               case Iop_Shr8:
                  addInstr(env, AMD64Instr_Alu64R(
                                   Aalu_AND, AMD64RMI_Imm(0xFF), dst));
                  break;
               case Iop_Shr16:
                  addInstr(env, AMD64Instr_Alu64R(
                                   Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
                  break;
               case Iop_Shr32:
                  break;
               case Iop_Sar8:
                  addInstr(env, AMD64Instr_Sh32(Ash_SHL, 24, dst));
                  addInstr(env, AMD64Instr_Sh32(Ash_SAR, 24, dst));
                  break;
               case Iop_Sar16:
                  addInstr(env, AMD64Instr_Sh32(Ash_SHL, 16, dst));
                  addInstr(env, AMD64Instr_Sh32(Ash_SAR, 16, dst));
                  break;
               case Iop_Sar32:
                  break;
               default:
                  ppIROp(e->Iex.Binop.op);
                  vassert(0);
            }

            /* Now consider the shift amount.  If it's a literal, we
               can do a much better job than the general case. */
            if (e->Iex.Binop.arg2->tag == Iex_Const) {
               /* assert that the IR is well-typed */
               Int nshift;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
               vassert(nshift >= 0);
               if (nshift > 0) {
                  /* Can't allow nshift==0 since that means %cl */
                  if (shift64) {
                     addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
                  } else {
                     addInstr(env, AMD64Instr_Sh32(shOp, nshift, dst));
                  }
               }
            } else {
               /* General case; we have to force the amount into %cl. */
               HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
               addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
               if (shift64) {
                  addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
               } else {
                  addInstr(env, AMD64Instr_Sh32(shOp, 0/* %cl */, dst));
               }
            }
            return dst;
         }
         /* Handle misc other scalar ops. */
         if (e->Iex.Binop.op == Iop_Max32U) {
            HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg dst  = newVRegI(env);
            HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(src1, dst));
            addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
            addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_DivModS64to32
             || e->Iex.Binop.op == Iop_DivModU64to32) {
            /* 64 x 32 -> (32(rem),32(div)) division */
            /* Get the 64-bit operand into edx:eax, and the other into
               any old R/M. */
            HReg rax = hregAMD64_RAX();
            HReg rdx = hregAMD64_RDX();
            HReg dst = newVRegI(env);
            Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
            /* Compute the left operand into a reg, and then
               put the top half in edx and the bottom in eax. */
            HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(left64, rdx));
            addInstr(env, mk_iMOVsd_RR(left64, rax));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
            addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
            addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
            addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
            addInstr(env, mk_iMOVsd_RR(rax, dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_32HLto64) {
            HReg hi32  = newVRegI(env);
            HReg lo32  = newVRegI(env);
            HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
            addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
            addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo32), hi32));
            return hi32;
         }

         if (e->Iex.Binop.op == Iop_16HLto32) {
            HReg hi16  = newVRegI(env);
            HReg lo16  = newVRegI(env);
            HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
            addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo16), hi16));
            return hi16;
         }

         if (e->Iex.Binop.op == Iop_8HLto16) {
            HReg hi8  = newVRegI(env);
            HReg lo8  = newVRegI(env);
            HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
            addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo8), hi8));
            return hi8;
         }

         if (e->Iex.Binop.op == Iop_MullS32
             || e->Iex.Binop.op == Iop_MullS16
             || e->Iex.Binop.op == Iop_MullS8
             || e->Iex.Binop.op == Iop_MullU32
             || e->Iex.Binop.op == Iop_MullU16
             || e->Iex.Binop.op == Iop_MullU8) {
            HReg a32  = newVRegI(env);
            HReg b32  = newVRegI(env);
            HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            Int          shift  = 0;
            AMD64ShiftOp shr_op = Ash_SHR;
            switch (e->Iex.Binop.op) {
               case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
               case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
               case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
               case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
               case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
               case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
               default: vassert(0);
            }

            addInstr(env, mk_iMOVsd_RR(a32s, a32));
            addInstr(env, mk_iMOVsd_RR(b32s, b32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
            addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
            addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
            return b32;
         }

         if (e->Iex.Binop.op == Iop_CmpF64) {
            HReg fL  = iselDblExpr(env, e->Iex.Binop.arg1);
            HReg fR  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegI(env);
            addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
            /* Mask out irrelevant parts of the result so as to conform
               to the CmpF64 definition. */
            addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_F64toI32S
             || e->Iex.Binop.op == Iop_F64toI64S) {
            Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegI(env);
            set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
            addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
            set_SSE_rounding_default(env);
            return dst;
         }
         /* Deal with 64-bit SIMD binary ops.  For the most part these are
            doable by using the equivalent 128-bit operation and ignoring
            the upper half of the result. */
         AMD64SseOp op = Asse_INVALID;
         Bool arg1isEReg = False;
         Bool preShift32R = False;
         switch (e->Iex.Binop.op) {
            // The following 3 could be done with 128 bit insns too, but
            // first require the inputs to be reformatted.
            //case Iop_QNarrowBin32Sto16Sx4:
            //op = Asse_PACKSSD; arg1isEReg = True; break;
            //case Iop_QNarrowBin16Sto8Sx8:
            //op = Asse_PACKSSW; arg1isEReg = True; break;
            //case Iop_QNarrowBin16Sto8Ux8:
            //op = Asse_PACKUSW; arg1isEReg = True; break;

            case Iop_InterleaveHI8x8:
               op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveHI16x4:
               op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveHI32x2:
               op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveLO8x8:
               op = Asse_UNPCKLB; arg1isEReg = True;
               break;
            case Iop_InterleaveLO16x4:
               op = Asse_UNPCKLW; arg1isEReg = True;
               break;
            case Iop_InterleaveLO32x2:
               op = Asse_UNPCKLD; arg1isEReg = True;
               break;

            case Iop_Add8x8:     op = Asse_ADD8;     break;
            case Iop_Add16x4:    op = Asse_ADD16;    break;
            case Iop_Add32x2:    op = Asse_ADD32;    break;
            case Iop_QAdd8Sx8:   op = Asse_QADD8S;   break;
            case Iop_QAdd16Sx4:  op = Asse_QADD16S;  break;
            case Iop_QAdd8Ux8:   op = Asse_QADD8U;   break;
            case Iop_QAdd16Ux4:  op = Asse_QADD16U;  break;
            case Iop_Avg8Ux8:    op = Asse_AVG8U;    break;
            case Iop_Avg16Ux4:   op = Asse_AVG16U;   break;
            case Iop_CmpEQ8x8:   op = Asse_CMPEQ8;   break;
            case Iop_CmpEQ16x4:  op = Asse_CMPEQ16;  break;
            case Iop_CmpEQ32x2:  op = Asse_CMPEQ32;  break;
            case Iop_CmpGT8Sx8:  op = Asse_CMPGT8S;  break;
            case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
            case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
            case Iop_Max16Sx4:   op = Asse_MAX16S;   break;
            case Iop_Max8Ux8:    op = Asse_MAX8U;    break;
            case Iop_Min16Sx4:   op = Asse_MIN16S;   break;
            case Iop_Min8Ux8:    op = Asse_MIN8U;    break;
            case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
            case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
            case Iop_Mul16x4:    op = Asse_MUL16;    break;
            case Iop_Sub8x8:     op = Asse_SUB8;     break;
            case Iop_Sub16x4:    op = Asse_SUB16;    break;
            case Iop_Sub32x2:    op = Asse_SUB32;    break;
            case Iop_QSub8Sx8:   op = Asse_QSUB8S;   break;
            case Iop_QSub16Sx4:  op = Asse_QSUB16S;  break;
            case Iop_QSub8Ux8:   op = Asse_QSUB8U;   break;
            case Iop_QSub16Ux4:  op = Asse_QSUB16U;  break;
            default: break;
         }
         if (op != Asse_INVALID) {
            /* This isn't pretty, but .. move each arg to the low half of an
               XMM register, do the operation on the whole register, and move
               the result back to an integer register. */
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
            HReg iarg1 = iselIntExpr_R(env, arg1);
            HReg iarg2 = iselIntExpr_R(env, arg2);
            HReg varg1 = newVRegV(env);
            HReg varg2 = newVRegV(env);
            HReg idst  = newVRegI(env);
            addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
            addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
            if (arg1isEReg) {
               if (preShift32R) {
                  addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
                  addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
               }
               addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
               addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
            } else {
               vassert(!preShift32R);
               addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
               addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
            }
            return idst;
         }
         UInt laneBits = 0;
         op = Asse_INVALID;
         switch (e->Iex.Binop.op) {
            case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
            case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
            case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
            case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
            case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
            case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
            default: break;
         }
         if (op != Asse_INVALID) {
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
            HReg igreg = iselIntExpr_R(env, arg1);
            HReg vgreg = newVRegV(env);
            HReg idst  = newVRegI(env);
            addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
            /* If it's a shift by an in-range immediate, generate a single
               instruction. */
            if (arg2->tag == Iex_Const) {
               IRConst* c = arg2->Iex.Const.con;
               vassert(c->tag == Ico_U8);
               UInt shift = c->Ico.U8;
               if (shift < laneBits) {
                  addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
                  addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
                  return idst;
               }
            }
            /* Otherwise we have to do it the longwinded way. */
            HReg ishift = iselIntExpr_R(env, arg2);
            HReg vshift = newVRegV(env);
            addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
            addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
            addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
            return idst;
         }
         if (e->Iex.Binop.op == Iop_Mul32x2) {
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
            HReg s1 = iselIntExpr_R(env, arg1);
            HReg s2 = iselIntExpr_R(env, arg2);
            HReg resLo = newVRegI(env);
            // resLo = (s1 *64 s2) & 0xFFFF'FFFF
            addInstr(env, mk_iMOVsd_RR(s1, resLo));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
            addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));

            // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
            HReg resHi = newVRegI(env);
            addInstr(env, mk_iMOVsd_RR(s1, resHi));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
            HReg tmp = newVRegI(env);
            addInstr(env, mk_iMOVsd_RR(s2, tmp));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));

            // final result = resHi | resLo
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
            return resLo;
         }

         // A few remaining SIMD64 ops require helper functions, at least for
         // now.
         Bool second_is_UInt = False;
         HWord fn = 0;
         switch (e->Iex.Binop.op) {
            case Iop_CatOddLanes16x4:
               fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
            case Iop_CatEvenLanes16x4:
               fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
            case Iop_PermOrZero8x8:
               fn = (HWord)h_generic_calc_PermOrZero8x8; break;

            case Iop_QNarrowBin32Sto16Sx4:
               fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
            case Iop_QNarrowBin16Sto8Sx8:
               fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
            case Iop_QNarrowBin16Sto8Ux8:
               fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;

            case Iop_NarrowBin16to8x8:
               fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
            case Iop_NarrowBin32to16x4:
               fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;

            case Iop_SarN8x8:
               fn = (HWord)h_generic_calc_SarN8x8;
               second_is_UInt = True;
               break;

            default:
               fn = (HWord)0; break;
         }
         if (fn != (HWord)0) {
            /* Note: the following assumes all helpers are of signature
                  ULong fn ( ULong, ULong ), and they are
               not marked as regparm functions.
            */
            HReg dst  = newVRegI(env);
            HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            if (second_is_UInt)
               addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
            addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
            addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
                                           mk_RetLoc_simple(RLPri_Int) ));
            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
            return dst;
         }

         // Half-float vector conversion
         if (e->Iex.Binop.op == Iop_F32toF16x4
             && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
            HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
            HReg dstV = newVRegV(env);
            HReg dstI = newVRegI(env);
            set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
            addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
            set_SSE_rounding_default(env);
            addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
            return dstI;
         }

         break;
      }
      /* --------- UNARY OP --------- */
      case Iex_Unop: {

         /* 1Uto8(64to1(expr64)) */
         {
            DEFINE_PATTERN( p_1Uto8_64to1,
                            unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
            if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
               const IRExpr* expr64 = mi.bindee[0];
               HReg    dst = newVRegI(env);
               HReg    src = iselIntExpr_R(env, expr64);
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                               AMD64RMI_Imm(1), dst));
               return dst;
            }
         }

         /* 8Uto64(LDle(expr64)) */
         {
            DEFINE_PATTERN(p_LDle8_then_8Uto64,
                           unop(Iop_8Uto64,
                                IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
            if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
               HReg dst = newVRegI(env);
               AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
               addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
               return dst;
            }
         }

         /* 16Uto64(LDle(expr64)) */
         {
            DEFINE_PATTERN(p_LDle16_then_16Uto64,
                           unop(Iop_16Uto64,
                                IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
            if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
               HReg dst = newVRegI(env);
               AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
               addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
               return dst;
            }
         }

         /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
            Use 32 bit arithmetic and let the default zero-extend rule
            do the 32Uto64 for free. */
         if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
            IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
            IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
            IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
            AMD64AluOp aluOp = Aalu_INVALID;
            switch (opi) {
               case Iop_Add32: aluOp = Aalu_ADD; break;
               case Iop_Sub32: aluOp = Aalu_SUB; break;
               case Iop_And32: aluOp = Aalu_AND; break;
               case Iop_Or32:  aluOp = Aalu_OR;  break;
               case Iop_Xor32: aluOp = Aalu_XOR; break;
               default: break;
            }
            if (aluOp != Aalu_INVALID) {
               /* For commutative ops we assume any literal values are on
                  the second operand. */
               HReg dst      = newVRegI(env);
               HReg reg      = iselIntExpr_R(env, argL);
               AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
               addInstr(env, mk_iMOVsd_RR(reg,dst));
               addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
               return dst;
            }
            /* just fall through to normal handling for Iop_32Uto64 */
         }
         /* Fallback cases */
         switch (e->Iex.Unop.op) {
            case Iop_32Uto64:
            case Iop_32Sto64: {
               HReg dst = newVRegI(env);
               HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
               addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
                                               src, dst) );
               return dst;
            }
            case Iop_128HIto64: {
               HReg rHi, rLo;
               iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
               return rHi; /* and abandon rLo */
            }
            case Iop_128to64: {
               HReg rHi, rLo;
               iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
               return rLo; /* and abandon rHi */
            }
            case Iop_8Uto16:
            case Iop_8Uto32:
            case Iop_8Uto64:
            case Iop_16Uto64:
            case Iop_16Uto32: {
               HReg dst     = newVRegI(env);
               HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
               Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
                                      || e->Iex.Unop.op==Iop_16Uto64 );
               UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                               AMD64RMI_Imm(mask), dst));
               return dst;
            }
            case Iop_8Sto16:
            case Iop_8Sto64:
            case Iop_8Sto32:
            case Iop_16Sto32:
            case Iop_16Sto64: {
               HReg dst     = newVRegI(env);
               HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
               Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
                                      || e->Iex.Unop.op==Iop_16Sto64 );
               UInt amt     = srcIs16 ? 48 : 56;
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
               return dst;
            }
            case Iop_Not8:
            case Iop_Not16:
            case Iop_Not32:
            case Iop_Not64: {
               HReg dst = newVRegI(env);
               HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
               return dst;
            }
            case Iop_16HIto8:
            case Iop_32HIto16:
            case Iop_64HIto32: {
               HReg dst  = newVRegI(env);
               HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
               Int shift = 0;
               switch (e->Iex.Unop.op) {
                  case Iop_16HIto8:  shift = 8;  break;
                  case Iop_32HIto16: shift = 16; break;
                  case Iop_64HIto32: shift = 32; break;
                  default: vassert(0);
               }
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
               return dst;
            }
            case Iop_1Uto64:
            case Iop_1Uto32:
            case Iop_1Uto8: {
               HReg dst           = newVRegI(env);
               AMD64CondCode cond = iselCondCode_C(env, e->Iex.Unop.arg);
               addInstr(env, AMD64Instr_Set64(cond,dst));
               return dst;
            }
            case Iop_1Sto8:
            case Iop_1Sto16:
            case Iop_1Sto32:
            case Iop_1Sto64: {
               HReg dst = newVRegI(env);
               HReg tmp = iselCondCode_R(env, e->Iex.Unop.arg);
               addInstr(env, mk_iMOVsd_RR(tmp, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
               return dst;
            }
1631 case Iop_Ctz64: {
1632 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1633 HReg dst = newVRegI(env);
1634 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1635 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1636 return dst;
1638 case Iop_Clz64: {
1639 /* Count leading zeroes. Do 'bsrq' to establish the index
1640 of the highest set bit, and subtract that value from
1641 63. */
1642 HReg tmp = newVRegI(env);
1643 HReg dst = newVRegI(env);
1644 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1646 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1647 AMD64RMI_Imm(63), dst));
1648 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1649 AMD64RMI_Reg(tmp), dst));
1650 return dst;
1653 case Iop_CmpwNEZ64: {
1654 HReg dst = newVRegI(env);
1655 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1656 addInstr(env, mk_iMOVsd_RR(src,dst));
1657 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1658 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1659 AMD64RMI_Reg(src), dst));
1660 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1661 return dst;
1664 case Iop_CmpwNEZ32: {
1665 HReg src = newVRegI(env);
1666 HReg dst = newVRegI(env);
1667 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1668 addInstr(env, mk_iMOVsd_RR(pre,src));
1669 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1670 addInstr(env, mk_iMOVsd_RR(src,dst));
1671 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1672 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1673 AMD64RMI_Reg(src), dst));
1674 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1675 return dst;
1678 case Iop_Left8:
1679 case Iop_Left16:
1680 case Iop_Left32:
1681 case Iop_Left64: {
1682 HReg dst = newVRegI(env);
1683 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1684 addInstr(env, mk_iMOVsd_RR(src, dst));
1685 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1686 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1687 return dst;
1690 case Iop_V128to32: {
1691 HReg dst = newVRegI(env);
1692 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1693 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1694 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1695 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1696 return dst;
1699 /* V128{HI}to64 */
1700 case Iop_V128to64: {
1701 HReg dst = newVRegI(env);
1702 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1703 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1704 return dst;
1706 case Iop_V128HIto64: {
1707 HReg dst = newVRegI(env);
1708 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1709 HReg vec2 = newVRegV(env);
1710 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1711 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1712 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1713 return dst;
1716 /* V256to64_{3,2,1,0} */
1717 case Iop_V256to64_0: case Iop_V256to64_1:
1718 case Iop_V256to64_2: case Iop_V256to64_3: {
1719 HReg vHi, vLo, vec;
1720 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1721 /* Do the first part of the selection by deciding which of
1722 the 128-bit registers to look at, and the second part using
1723 the same scheme as for V128{HI}to64 above. */
1724 Bool low64of128 = True;
1725 switch (e->Iex.Unop.op) {
1726 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1727 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1728 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1729 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1730 default: vassert(0);
1732 HReg dst = newVRegI(env);
1733 if (low64of128) {
1734 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1735 } else {
1736 HReg vec2 = newVRegV(env);
1737 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1738 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1739 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1741 return dst;
1744 /* ReinterpF64asI64(e) */
1745 /* Given an IEEE754 double, produce an I64 with the same bit
1746 pattern. */
1747 case Iop_ReinterpF64asI64: {
1748 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1749 HReg dst = newVRegI(env);
1750 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1751 /* paranoia */
1752 set_SSE_rounding_default(env);
1753 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1754 addInstr(env, AMD64Instr_Alu64R(
1755 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1756 return dst;
1759 /* ReinterpF32asI32(e) */
1760 /* Given an IEEE754 single, produce an I64 with the same bit
1761 pattern in the lower half. */
1762 case Iop_ReinterpF32asI32: {
1763 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1764 HReg dst = newVRegI(env);
1765 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1766 /* paranoia */
1767 set_SSE_rounding_default(env);
1768 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1769 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1770 return dst;
1773 case Iop_16to8:
1774 case Iop_32to8:
1775 case Iop_64to8:
1776 case Iop_32to16:
1777 case Iop_64to16:
1778 case Iop_64to32:
1779 /* These are no-ops. */
1780 return iselIntExpr_R(env, e->Iex.Unop.arg);
1782 case Iop_GetMSBs8x8: {
1783 /* Note: the following assumes the helper is of
1784 signature
1785 UInt fn ( ULong ), and is not a regparm fn.  */
1787 HReg dst = newVRegI(env);
1788 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1789 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1790 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1791 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1792 1, mk_RetLoc_simple(RLPri_Int) ));
1793 /* MovxLQ is not exactly the right thing here. We just
1794 need to get the bottom 8 bits of RAX into dst, and zero
1795 out everything else. Assuming that the helper returns
1796 a UInt with the top 24 bits zeroed out, it'll do,
1797 though. */
1798 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1799 return dst;
1802 case Iop_GetMSBs8x16: {
1803 /* Note: the following assumes the helper is of signature
1804 UInt fn ( ULong w64hi, ULong w64Lo ),
1805 and is not a regparm fn. */
1806 HReg dst = newVRegI(env);
1807 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1808 HReg rsp = hregAMD64_RSP();
1809 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1810 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1811 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1812 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1813 16, vec, m16_rsp));
1814 /* hi 64 bits into RDI -- the first arg */
1815 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1816 AMD64RMI_Mem(m8_rsp),
1817 hregAMD64_RDI() )); /* 1st arg */
1818 /* lo 64 bits into RSI -- the 2nd arg */
1819 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1820 AMD64RMI_Mem(m16_rsp),
1821 hregAMD64_RSI() )); /* 2nd arg */
1822 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1823 2, mk_RetLoc_simple(RLPri_Int) ));
1824 /* MovxLQ is not exactly the right thing here. We just
1825 need to get the bottom 16 bits of RAX into dst, and zero
1826 out everything else. Assuming that the helper returns
1827 a UInt with the top 16 bits zeroed out, it'll do,
1828 though. */
1829 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1830 return dst;
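         /* Hedged sketch of what the helper is assumed to return (an
            assumption about h_generic_calc_GetMSBs8x16, not a copy of it):
            the top bit of each of the 16 bytes, with the w64lo bytes landing
            in result bits 0..7 and the w64hi bytes in bits 8..15, matching
            the RDI(hi)/RSI(lo) marshalling above. */
# if 0
         { ULong w64hi = 0x8000000000000080ULL; /* bytes 15 and 8: MSB set */
           ULong w64lo = 0x0000000000000080ULL; /* byte 0: MSB set */
           UInt  i, msbs = 0;
           for (i = 0; i < 8; i++) {
              if (w64lo & (0x80ULL << (8*i))) msbs |= 1u << i;
              if (w64hi & (0x80ULL << (8*i))) msbs |= 1u << (i+8);
           }
           vassert(msbs == 0x8101u);
         }
# endif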
1833 default:
1834 break;
1837 /* Deal with unary 64-bit SIMD ops. */
1838 HWord fn = 0;
1839 switch (e->Iex.Unop.op) {
1840 case Iop_CmpNEZ32x2:
1841 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1842 case Iop_CmpNEZ16x4:
1843 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1844 case Iop_CmpNEZ8x8:
1845 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1846 default:
1847 fn = (HWord)0; break;
1849 if (fn != (HWord)0) {
1850 /* Note: the following assumes all helpers are of
1851 signature
1852 ULong fn ( ULong ), and they are
1853 not marked as regparm functions.  */
1855 HReg dst = newVRegI(env);
1856 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1857 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1858 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1859 mk_RetLoc_simple(RLPri_Int) ));
1860 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1861 return dst;
1864 break;
1867 /* --------- GET --------- */
1868 case Iex_Get: {
1869 if (ty == Ity_I64) {
1870 HReg dst = newVRegI(env);
1871 addInstr(env, AMD64Instr_Alu64R(
1872 Aalu_MOV,
1873 AMD64RMI_Mem(
1874 AMD64AMode_IR(e->Iex.Get.offset,
1875 hregAMD64_RBP())),
1876 dst));
1877 return dst;
1879 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1880 HReg dst = newVRegI(env);
1881 addInstr(env, AMD64Instr_LoadEX(
1882 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1883 False,
1884 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1885 dst));
1886 return dst;
1888 break;
1891 case Iex_GetI: {
1892 AMD64AMode* am
1893 = genGuestArrayOffset(
1894 env, e->Iex.GetI.descr,
1895 e->Iex.GetI.ix, e->Iex.GetI.bias );
1896 HReg dst = newVRegI(env);
1897 if (ty == Ity_I8) {
1898 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1899 return dst;
1901 if (ty == Ity_I64) {
1902 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1903 return dst;
1905 break;
1908 /* --------- CCALL --------- */
1909 case Iex_CCall: {
1910 HReg dst = newVRegI(env);
1911 vassert(ty == e->Iex.CCall.retty);
1913 /* be very restrictive for now. Only 64-bit ints allowed for
1914 args, and 64 or 32 bits for return type. */
1915 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1916 goto irreducible;
1918 /* Marshal args, do the call. */
1919 UInt addToSp = 0;
1920 RetLoc rloc = mk_RetLoc_INVALID();
1921 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1922 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1923 vassert(is_sane_RetLoc(rloc));
1924 vassert(rloc.pri == RLPri_Int);
1925 vassert(addToSp == 0);
1927 /* Move to dst, and zero out the top 32 bits if the result type is
1928 Ity_I32. Probably overkill, but still .. */
1929 if (e->Iex.CCall.retty == Ity_I64)
1930 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1931 else
1932 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1934 return dst;
1937 /* --------- LITERAL --------- */
1938 /* 64/32/16/8-bit literals */
1939 case Iex_Const:
1940 if (ty == Ity_I64) {
1941 HReg r = newVRegI(env);
1942 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1943 return r;
1944 } else {
1945 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1946 HReg r = newVRegI(env);
1947 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1948 return r;
1951 /* --------- MULTIPLEX --------- */
1952 case Iex_ITE: { // VFD
1953 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1954 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1955 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1956 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1957 HReg dst = newVRegI(env);
1958 addInstr(env, mk_iMOVsd_RR(r1,dst));
1959 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
1960 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1961 return dst;
1963 break;
1966 /* --------- TERNARY OP --------- */
1967 case Iex_Triop: {
1968 IRTriop *triop = e->Iex.Triop.details;
1969 /* C3210 flags following FPU partial remainder (fprem), both
1970 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1971 if (triop->op == Iop_PRemC3210F64
1972 || triop->op == Iop_PRem1C3210F64) {
1973 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1974 HReg arg1 = iselDblExpr(env, triop->arg2);
1975 HReg arg2 = iselDblExpr(env, triop->arg3);
1976 HReg dst = newVRegI(env);
1977 addInstr(env, AMD64Instr_A87Free(2));
1979 /* one arg -> top of x87 stack */
1980 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1981 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1983 /* other arg -> top of x87 stack */
1984 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1985 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1987 switch (triop->op) {
1988 case Iop_PRemC3210F64:
1989 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1990 break;
1991 case Iop_PRem1C3210F64:
1992 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1993 break;
1994 default:
1995 vassert(0);
1997 /* Ignore the result, and instead make off with the FPU's
1998 C3210 flags (in the status word). */
1999 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
2000 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
2001 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
2002 return dst;
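         /* Illustrative note, not part of the original code: 0x4700 keeps
            exactly the x87 condition bits of the status word read back by
            A87StSW -- C0 (bit 8), C1 (bit 9), C2 (bit 10), C3 (bit 14). */
# if 0
         { UShort FSW_C0 = 0x0100, FSW_C1 = 0x0200,
                  FSW_C2 = 0x0400, FSW_C3 = 0x4000;
           vassert((UShort)(FSW_C0 | FSW_C1 | FSW_C2 | FSW_C3) == 0x4700);
         }
# endif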
2004 break;
2007 default:
2008 break;
2009 } /* switch (e->tag) */
2011 /* We get here if no pattern matched. */
2012 irreducible:
2013 ppIRExpr(e);
2014 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2018 /*---------------------------------------------------------*/
2019 /*--- ISEL: Integer expression auxiliaries ---*/
2020 /*---------------------------------------------------------*/
2022 /* --------------------- AMODEs --------------------- */
2024 /* Return an AMode which computes the value of the specified
2025 expression, possibly also adding insns to the code list as a
2026 result. The expression may only be a 64-bit one.  */
2029 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2031 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2032 vassert(sane_AMode(am));
2033 return am;
2036 /* DO NOT CALL THIS DIRECTLY ! */
2037 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2039 MatchInfo mi;
2040 DECLARE_PATTERN(p_complex);
2041 IRType ty = typeOfIRExpr(env->type_env,e);
2042 vassert(ty == Ity_I64);
2044 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2045 /* bind0 bind1 bind2 bind3 */
2046 DEFINE_PATTERN(p_complex,
2047 binop( Iop_Add64,
2048 binop( Iop_Add64,
2049 bind(0),
2050 binop(Iop_Shl64, bind(1), bind(2))
2052 bind(3)
2055 if (matchIRExpr(&mi, p_complex, e)) {
2056 const IRExpr* expr1 = mi.bindee[0];
2057 const IRExpr* expr2 = mi.bindee[1];
2058 const IRExpr* imm8 = mi.bindee[2];
2059 const IRExpr* simm32 = mi.bindee[3];
2060 if (imm8->tag == Iex_Const
2061 && imm8->Iex.Const.con->tag == Ico_U8
2062 && imm8->Iex.Const.con->Ico.U8 < 4
2063 /* imm8 is OK, now check simm32 */
2064 && simm32->tag == Iex_Const
2065 && simm32->Iex.Const.con->tag == Ico_U64
2066 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2067 UInt shift = imm8->Iex.Const.con->Ico.U8;
2068 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2069 HReg r1 = iselIntExpr_R(env, expr1);
2070 HReg r2 = iselIntExpr_R(env, expr2);
2071 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2072 return AMD64AMode_IRRS(offset, r1, r2, shift);
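      /* Worked example, illustrative only: an IR tree such as
            Add64( Add64(t_base, Shl64(t_index, 3:I8)), 0x28:I64 )
         (hypothetical temps t_base/t_index) matches this pattern and turns
         into the amode 0x28(base,index,8), i.e. */
# if 0
      { AMD64AMode* example = AMD64AMode_IRRS(0x28, r1, r2, 3);
        (void)example;
      }
# endif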
2076 /* Add64(expr1, Shl64(expr2, imm)) */
2077 if (e->tag == Iex_Binop
2078 && e->Iex.Binop.op == Iop_Add64
2079 && e->Iex.Binop.arg2->tag == Iex_Binop
2080 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2081 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2082 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2083 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2084 if (shift == 1 || shift == 2 || shift == 3) {
2085 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2086 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2087 return AMD64AMode_IRRS(0, r1, r2, shift);
2091 /* Add64(expr,i) */
2092 if (e->tag == Iex_Binop
2093 && e->Iex.Binop.op == Iop_Add64
2094 && e->Iex.Binop.arg2->tag == Iex_Const
2095 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2096 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2097 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2098 return AMD64AMode_IR(
2099 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2104 /* Doesn't match anything in particular. Generate it into
2105 a register and use that. */
2107 HReg r1 = iselIntExpr_R(env, e);
2108 return AMD64AMode_IR(0, r1);
2113 /* --------------------- RMIs --------------------- */
2115 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2116 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2118 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2120 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2121 /* sanity checks ... */
2122 switch (rmi->tag) {
2123 case Armi_Imm:
2124 return rmi;
2125 case Armi_Reg:
2126 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2127 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2128 return rmi;
2129 case Armi_Mem:
2130 vassert(sane_AMode(rmi->Armi.Mem.am));
2131 return rmi;
2132 default:
2133 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2137 /* DO NOT CALL THIS DIRECTLY ! */
2138 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2140 IRType ty = typeOfIRExpr(env->type_env,e);
2141 vassert(ty == Ity_I64 || ty == Ity_I32
2142 || ty == Ity_I16 || ty == Ity_I8);
2144 /* special case: immediate 64/32/16/8 */
2145 if (e->tag == Iex_Const) {
2146 switch (e->Iex.Const.con->tag) {
2147 case Ico_U64:
2148 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2149 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2151 break;
2152 case Ico_U32:
2153 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2154 case Ico_U16:
2155 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2156 case Ico_U8:
2157 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2158 default:
2159 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2163 /* special case: 64-bit GET */
2164 if (e->tag == Iex_Get && ty == Ity_I64) {
2165 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2166 hregAMD64_RBP()));
2169 /* special case: 64-bit load from memory */
2170 if (e->tag == Iex_Load && ty == Ity_I64
2171 && e->Iex.Load.end == Iend_LE) {
2172 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2173 return AMD64RMI_Mem(am);
2176 /* default case: calculate into a register and return that */
2178 HReg r = iselIntExpr_R ( env, e );
2179 return AMD64RMI_Reg(r);
2184 /* --------------------- RIs --------------------- */
2186 /* Calculate an expression into an AMD64RI operand. As with
2187 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2188 bits. */
2190 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2192 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2193 /* sanity checks ... */
2194 switch (ri->tag) {
2195 case Ari_Imm:
2196 return ri;
2197 case Ari_Reg:
2198 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2199 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2200 return ri;
2201 default:
2202 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2206 /* DO NOT CALL THIS DIRECTLY ! */
2207 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2209 IRType ty = typeOfIRExpr(env->type_env,e);
2210 vassert(ty == Ity_I64 || ty == Ity_I32
2211 || ty == Ity_I16 || ty == Ity_I8);
2213 /* special case: immediate */
2214 if (e->tag == Iex_Const) {
2215 switch (e->Iex.Const.con->tag) {
2216 case Ico_U64:
2217 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2218 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2220 break;
2221 case Ico_U32:
2222 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2223 case Ico_U16:
2224 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2225 case Ico_U8:
2226 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2227 default:
2228 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2232 /* default case: calculate into a register and return that */
2234 HReg r = iselIntExpr_R ( env, e );
2235 return AMD64RI_Reg(r);
2240 /* --------------------- RMs --------------------- */
2242 /* Similarly, calculate an expression into an AMD64RM operand. As
2243 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2244 bits. */
2246 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2248 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2249 /* sanity checks ... */
2250 switch (rm->tag) {
2251 case Arm_Reg:
2252 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2253 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2254 return rm;
2255 case Arm_Mem:
2256 vassert(sane_AMode(rm->Arm.Mem.am));
2257 return rm;
2258 default:
2259 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2263 /* DO NOT CALL THIS DIRECTLY ! */
2264 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2266 IRType ty = typeOfIRExpr(env->type_env,e);
2267 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2269 /* special case: 64-bit GET */
2270 if (e->tag == Iex_Get && ty == Ity_I64) {
2271 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2272 hregAMD64_RBP()));
2275 /* special case: load from memory */
2277 /* default case: calculate into a register and return that */
2279 HReg r = iselIntExpr_R ( env, e );
2280 return AMD64RM_Reg(r);
2285 /* --------------------- CONDCODE as %rflag test --------------------- */
2287 /* Generate code to evaluate a bit-typed expression, returning the
2288 condition code which would be set if the expression had
2289 notionally returned 1.
2291 Note that iselCondCode_C and iselCondCode_R are mutually recursive. For
2292 future changes to either of them, take care not to introduce an infinite
2293 loop involving the two of them.  */
2295 static AMD64CondCode iselCondCode_C ( ISelEnv* env, const IRExpr* e )
2297 /* Uh, there's nothing we can sanity check here, unfortunately. */
2298 return iselCondCode_C_wrk(env,e);
2301 /* DO NOT CALL THIS DIRECTLY ! */
2302 static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e )
2304 vassert(e);
2305 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2307 /* var */
2308 if (e->tag == Iex_RdTmp) {
2309 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2310 addInstr(env, AMD64Instr_Test64(1,r64));
2311 return Acc_NZ;
2314 /* Constant 1:Bit */
2315 if (e->tag == Iex_Const) {
2316 HReg r;
2317 vassert(e->Iex.Const.con->tag == Ico_U1);
2318 vassert(e->Iex.Const.con->Ico.U1 == True
2319 || e->Iex.Const.con->Ico.U1 == False);
2320 r = newVRegI(env);
2321 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2322 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2323 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2326 /* Not1(...) */
2327 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2328 /* Generate code for the arg, and negate the test condition */
2329 return 1 ^ iselCondCode_C(env, e->Iex.Unop.arg);
2332 /* --- patterns rooted at: 64to1 --- */
2334 /* 64to1 */
2335 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2336 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2337 addInstr(env, AMD64Instr_Test64(1,reg));
2338 return Acc_NZ;
2341 /* --- patterns rooted at: 32to1 --- */
2343 /* 32to1 */
2344 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2345 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2346 addInstr(env, AMD64Instr_Test64(1,reg));
2347 return Acc_NZ;
2350 /* --- patterns rooted at: CmpNEZ8 --- */
2352 /* CmpNEZ8(x) */
2353 if (e->tag == Iex_Unop
2354 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2355 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2356 addInstr(env, AMD64Instr_Test64(0xFF,r));
2357 return Acc_NZ;
2360 /* --- patterns rooted at: CmpNEZ16 --- */
2362 /* CmpNEZ16(x) */
2363 if (e->tag == Iex_Unop
2364 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2365 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2366 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2367 return Acc_NZ;
2370 /* --- patterns rooted at: CmpNEZ32 --- */
2372 if (e->tag == Iex_Unop
2373 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2374 IRExpr* arg = e->Iex.Unop.arg;
2375 if (arg->tag == Iex_Binop
2376 && (arg->Iex.Binop.op == Iop_Or32
2377 || arg->Iex.Binop.op == Iop_And32)) {
2378 /* CmpNEZ32(Or32(x,y)) */
2379 /* CmpNEZ32(And32(x,y)) */
2380 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2381 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2382 HReg tmp = newVRegI(env);
2383 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2384 addInstr(env, AMD64Instr_Alu32R(
2385 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2386 rmi1, tmp));
2387 return Acc_NZ;
2389 /* CmpNEZ32(x) */
2390 HReg r1 = iselIntExpr_R(env, arg);
2391 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2392 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2393 return Acc_NZ;
2396 /* --- patterns rooted at: CmpNEZ64 --- */
2398 if (e->tag == Iex_Unop
2399 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2400 IRExpr* arg = e->Iex.Unop.arg;
2401 if (arg->tag == Iex_Binop
2402 && (arg->Iex.Binop.op == Iop_Or64
2403 || arg->Iex.Binop.op == Iop_And64)) {
2404 /* CmpNEZ64(Or64(x,y)) */
2405 /* CmpNEZ64(And64(x,y)) */
2406 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2407 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2408 HReg tmp = newVRegI(env);
2409 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2410 addInstr(env, AMD64Instr_Alu64R(
2411 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2412 rmi1, tmp));
2413 return Acc_NZ;
2415 /* CmpNEZ64(x) */
2416 HReg r1 = iselIntExpr_R(env, arg);
2417 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2418 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2419 return Acc_NZ;
2422 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2424 /* CmpEQ8 / CmpNE8 */
2425 if (e->tag == Iex_Binop
2426 && (e->Iex.Binop.op == Iop_CmpEQ8
2427 || e->Iex.Binop.op == Iop_CmpNE8
2428 || e->Iex.Binop.op == Iop_CasCmpEQ8
2429 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2430 if (isZeroU8(e->Iex.Binop.arg2)) {
2431 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2432 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2433 switch (e->Iex.Binop.op) {
2434 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2435 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2436 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,0:I8)");
2438 } else {
2439 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2440 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2441 HReg r = newVRegI(env);
2442 addInstr(env, mk_iMOVsd_RR(r1,r));
2443 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2444 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2445 switch (e->Iex.Binop.op) {
2446 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2447 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2448 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,expr)");
2453 /* CmpEQ16 / CmpNE16 */
2454 if (e->tag == Iex_Binop
2455 && (e->Iex.Binop.op == Iop_CmpEQ16
2456 || e->Iex.Binop.op == Iop_CmpNE16
2457 || e->Iex.Binop.op == Iop_CasCmpEQ16
2458 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2459 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2460 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2461 HReg r = newVRegI(env);
2462 addInstr(env, mk_iMOVsd_RR(r1,r));
2463 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2464 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2465 switch (e->Iex.Binop.op) {
2466 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2467 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2468 default: vpanic("iselCondCode_C(amd64): CmpXX16");
2472 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2473 Saves a "movq %rax, %tmp" compared to the default route. */
2474 if (e->tag == Iex_Binop
2475 && e->Iex.Binop.op == Iop_CmpNE64
2476 && e->Iex.Binop.arg1->tag == Iex_CCall
2477 && e->Iex.Binop.arg2->tag == Iex_Const) {
2478 IRExpr* cal = e->Iex.Binop.arg1;
2479 IRExpr* con = e->Iex.Binop.arg2;
2480 HReg tmp = newVRegI(env);
2481 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2482 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2483 vassert(con->Iex.Const.con->tag == Ico_U64);
2484 /* Marshal args, do the call. */
2485 UInt addToSp = 0;
2486 RetLoc rloc = mk_RetLoc_INVALID();
2487 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2488 cal->Iex.CCall.cee,
2489 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2490 vassert(is_sane_RetLoc(rloc));
2491 vassert(rloc.pri == RLPri_Int);
2492 vassert(addToSp == 0);
2493 /* */
2494 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2495 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2496 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2497 return Acc_NZ;
2500 /* Cmp*64*(x,y) */
2501 if (e->tag == Iex_Binop
2502 && (e->Iex.Binop.op == Iop_CmpEQ64
2503 || e->Iex.Binop.op == Iop_CmpNE64
2504 || e->Iex.Binop.op == Iop_CmpLT64S
2505 || e->Iex.Binop.op == Iop_CmpLT64U
2506 || e->Iex.Binop.op == Iop_CmpLE64S
2507 || e->Iex.Binop.op == Iop_CmpLE64U
2508 || e->Iex.Binop.op == Iop_CasCmpEQ64
2509 || e->Iex.Binop.op == Iop_CasCmpNE64
2510 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2511 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2512 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2513 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2514 switch (e->Iex.Binop.op) {
2515 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2516 case Iop_CmpNE64:
2517 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2518 case Iop_CmpLT64S: return Acc_L;
2519 case Iop_CmpLT64U: return Acc_B;
2520 case Iop_CmpLE64S: return Acc_LE;
2521 case Iop_CmpLE64U: return Acc_BE;
2522 default: vpanic("iselCondCode_C(amd64): CmpXX64");
2526 /* Cmp*32*(x,y) */
2527 if (e->tag == Iex_Binop
2528 && (e->Iex.Binop.op == Iop_CmpEQ32
2529 || e->Iex.Binop.op == Iop_CmpNE32
2530 || e->Iex.Binop.op == Iop_CmpLT32S
2531 || e->Iex.Binop.op == Iop_CmpLT32U
2532 || e->Iex.Binop.op == Iop_CmpLE32S
2533 || e->Iex.Binop.op == Iop_CmpLE32U
2534 || e->Iex.Binop.op == Iop_CasCmpEQ32
2535 || e->Iex.Binop.op == Iop_CasCmpNE32
2536 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2537 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2538 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2539 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2540 switch (e->Iex.Binop.op) {
2541 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2542 case Iop_CmpNE32:
2543 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2544 case Iop_CmpLT32S: return Acc_L;
2545 case Iop_CmpLT32U: return Acc_B;
2546 case Iop_CmpLE32S: return Acc_LE;
2547 case Iop_CmpLE32U: return Acc_BE;
2548 default: vpanic("iselCondCode_C(amd64): CmpXX32");
2552 /* And1(x,y), Or1(x,y) */
2553 if (e->tag == Iex_Binop
2554 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2555 // Get the result in an int reg, then test the least significant bit.
2556 HReg tmp = iselCondCode_R(env, e);
2557 addInstr(env, AMD64Instr_Test64(1, tmp));
2558 return Acc_NZ;
2561 ppIRExpr(e);
2562 vpanic("iselCondCode_C(amd64)");
2566 /* --------------------- CONDCODE as int reg --------------------- */
2568 /* Generate code to evaluate a bit-typed expression, returning the resulting
2569 value in bit 0 of an integer register. WARNING: all of the other bits in the
2570 register can be arbitrary. Callers must mask them off or otherwise ignore
2571 them, as necessary.
2573 Note that iselCondCode_C and iselCondCode_R are mutually recursive. For
2574 future changes to either of them, take care not to introduce an infinite
2575 loop involving the two of them.  */
2577 static HReg iselCondCode_R ( ISelEnv* env, const IRExpr* e )
2579 /* Uh, there's nothing we can sanity check here, unfortunately. */
2580 return iselCondCode_R_wrk(env,e);
2583 /* DO NOT CALL THIS DIRECTLY ! */
2584 static HReg iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e )
2586 vassert(e);
2587 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2589 /* var */
2590 if (e->tag == Iex_RdTmp) {
2591 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2594 /* And1(x,y), Or1(x,y) */
2595 if (e->tag == Iex_Binop
2596 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2597 HReg x_as_64 = iselCondCode_R(env, e->Iex.Binop.arg1);
2598 HReg y_as_64 = iselCondCode_R(env, e->Iex.Binop.arg2);
2599 HReg res = newVRegI(env);
2600 addInstr(env, mk_iMOVsd_RR(y_as_64, res));
2601 AMD64AluOp aop = e->Iex.Binop.op == Iop_And1 ? Aalu_AND : Aalu_OR;
2602 addInstr(env, AMD64Instr_Alu64R(aop, AMD64RMI_Reg(x_as_64), res));
2603 return res;
2606 /* Anything else, we hand off to iselCondCode_C and force the value into a
2607 register. */
2608 HReg res = newVRegI(env);
2609 AMD64CondCode cc = iselCondCode_C(env, e);
2610 addInstr(env, AMD64Instr_Set64(cc, res));
2611 return res;
2613 // PJF old debug code? - unreachable
2615 ppIRExpr(e);
2616 vpanic("iselCondCode_R(amd64)");
2621 /*---------------------------------------------------------*/
2622 /*--- ISEL: Integer expressions (128 bit) ---*/
2623 /*---------------------------------------------------------*/
2625 /* Compute a 128-bit value into a register pair, which is returned as
2626 the first two parameters. As with iselIntExpr_R, these may be
2627 either real or virtual regs; in any case they must not be changed
2628 by subsequent code emitted by the caller. */
2630 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2631 ISelEnv* env, const IRExpr* e )
2633 iselInt128Expr_wrk(rHi, rLo, env, e);
2634 # if 0
2635 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2636 # endif
2637 vassert(hregClass(*rHi) == HRcInt64);
2638 vassert(hregIsVirtual(*rHi));
2639 vassert(hregClass(*rLo) == HRcInt64);
2640 vassert(hregIsVirtual(*rLo));
2643 /* DO NOT CALL THIS DIRECTLY ! */
2644 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2645 ISelEnv* env, const IRExpr* e )
2647 vassert(e);
2648 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2650 /* read 128-bit IRTemp */
2651 if (e->tag == Iex_RdTmp) {
2652 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2653 return;
2656 /* --------- BINARY ops --------- */
2657 if (e->tag == Iex_Binop) {
2658 switch (e->Iex.Binop.op) {
2659 /* 64 x 64 -> 128 multiply */
2660 case Iop_MullU64:
2661 case Iop_MullS64: {
2662 /* get one operand into %rax, and the other into an R/M.
2663 Need to make an educated guess about which operand is
2664 better placed where. */
2665 HReg tLo = newVRegI(env);
2666 HReg tHi = newVRegI(env);
2667 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2668 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2669 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2670 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2671 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2672 /* Result is now in RDX:RAX. Tell the caller. */
2673 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2674 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2675 *rHi = tHi;
2676 *rLo = tLo;
2677 return;
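         /* Reference semantics of the RDX:RAX result -- a sketch only, using
            the GCC/Clang __int128 extension, which VEX itself does not rely
            on.  For MullS64 both operands are sign-extended to 128 bits
            before the multiply. */
# if 0
         { ULong a = 0xDEADBEEFCAFEBABEULL, b = 0x0123456789ABCDEFULL;
           unsigned __int128 p = (unsigned __int128)a * b;
           ULong lo = (ULong)p;          /* what MulL leaves in RAX */
           ULong hi = (ULong)(p >> 64);  /* what MulL leaves in RDX */
           (void)lo; (void)hi;
         }
# endif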
2680 /* 128 x 64 -> (64(rem),64(div)) division */
2681 case Iop_DivModU128to64:
2682 case Iop_DivModS128to64: {
2683 /* Get the 128-bit operand into rdx:rax, and the other into
2684 any old R/M. */
2685 HReg sHi, sLo;
2686 HReg tLo = newVRegI(env);
2687 HReg tHi = newVRegI(env);
2688 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2689 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2690 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2691 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2692 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2693 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2694 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2695 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2696 *rHi = tHi;
2697 *rLo = tLo;
2698 return;
2701 /* 64HLto128(e1,e2) */
2702 case Iop_64HLto128:
2703 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2704 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2705 return;
2707 default:
2708 break;
2710 } /* if (e->tag == Iex_Binop) */
2712 ppIRExpr(e);
2713 vpanic("iselInt128Expr");
2717 /*---------------------------------------------------------*/
2718 /*--- ISEL: Floating point expressions (32 bit) ---*/
2719 /*---------------------------------------------------------*/
2721 /* Nothing interesting here; really just wrappers for
2722 64-bit stuff. */
2724 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2726 HReg r = iselFltExpr_wrk( env, e );
2727 # if 0
2728 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2729 # endif
2730 vassert(hregClass(r) == HRcVec128);
2731 vassert(hregIsVirtual(r));
2732 return r;
2735 /* DO NOT CALL THIS DIRECTLY */
2736 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2738 IRType ty = typeOfIRExpr(env->type_env,e);
2739 vassert(ty == Ity_F32);
2741 if (e->tag == Iex_RdTmp) {
2742 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2745 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2746 AMD64AMode* am;
2747 HReg res = newVRegV(env);
2748 vassert(e->Iex.Load.ty == Ity_F32);
2749 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2750 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2751 return res;
2754 if (e->tag == Iex_Binop
2755 && e->Iex.Binop.op == Iop_F64toF32) {
2756 /* Although the result is still held in a standard SSE register,
2757 we need to round it to reflect the loss of accuracy/range
2758 entailed in casting it to a 32-bit float. */
2759 HReg dst = newVRegV(env);
2760 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2761 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2762 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2763 set_SSE_rounding_default( env );
2764 return dst;
2767 if (e->tag == Iex_Get) {
2768 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2769 hregAMD64_RBP() );
2770 HReg res = newVRegV(env);
2771 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2772 return res;
2775 if (e->tag == Iex_Unop
2776 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2777 /* Given an I32, produce an IEEE754 float with the same bit
2778 pattern. */
2779 HReg dst = newVRegV(env);
2780 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2781 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2782 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2783 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2784 return dst;
2787 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2788 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2789 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2790 HReg dst = newVRegV(env);
2792 /* 'arg' now holds the value to be rounded. The first thing to do
2793 is set the FPU's rounding mode accordingly. */
2795 /* Set host x87 rounding mode */
2796 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2798 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2799 addInstr(env, AMD64Instr_A87Free(1));
2800 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2801 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2802 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2803 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2805 /* Restore default x87 rounding. */
2806 set_FPU_rounding_default( env );
2808 return dst;
2811 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2812 /* Sigh ... very rough code. Could do much better. */
2813 /* Get the 128-bit literal 00---0 10---0 into a register
2814 and xor it with the value to be negated. */
2815 HReg r1 = newVRegI(env);
2816 HReg dst = newVRegV(env);
2817 HReg tmp = newVRegV(env);
2818 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2819 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2820 addInstr(env, mk_vMOVsd_RR(src,tmp));
2821 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2822 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2823 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2824 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2825 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2826 add_to_rsp(env, 16);
2827 return dst;
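      /* Illustrative sketch, not part of the original code: the Imm64/push/
         XOR sequence simply flips bit 31 (the IEEE754 single-precision sign
         bit) of the low lane, leaving the rest of the register alone. */
# if 0
      { UInt bits    = 0x40490FDBu;        /* approx.  3.14159f */
        UInt negated = bits ^ (1u << 31);  /* approx. -3.14159f */
        vassert(negated == 0xC0490FDBu);
      }
# endif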
2830 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2831 IRQop *qop = e->Iex.Qop.details;
2832 HReg dst = newVRegV(env);
2833 HReg argX = iselFltExpr(env, qop->arg2);
2834 HReg argY = iselFltExpr(env, qop->arg3);
2835 HReg argZ = iselFltExpr(env, qop->arg4);
2836 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
2837 vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
2838 if (dst.u32 != argX.u32)
2839 addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
2840 addInstr(env, AMD64Instr_Avx32FLo(Asse_VFMADD213, argY, argZ, dst));
2841 return dst;
2843 /* XXXROUNDINGFIXME */
2844 /* set roundingmode here */
2845 /* subq $16, %rsp -- make a space*/
2846 sub_from_rsp(env, 16);
2847 /* Prepare 4 arg regs:
2848 leaq 0(%rsp), %rdi
2849 leaq 4(%rsp), %rsi
2850 leaq 8(%rsp), %rdx
2851 leaq 12(%rsp), %rcx  */
2853 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2854 hregAMD64_RDI()));
2855 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2856 hregAMD64_RSI()));
2857 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2858 hregAMD64_RDX()));
2859 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2860 hregAMD64_RCX()));
2861 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2862 movss %argX, 0(%rsi)
2863 movss %argY, 0(%rdx)
2864 movss %argZ, 0(%rcx)  */
2866 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2867 AMD64AMode_IR(0, hregAMD64_RSI())));
2868 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2869 AMD64AMode_IR(0, hregAMD64_RDX())));
2870 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2871 AMD64AMode_IR(0, hregAMD64_RCX())));
2873 /* call the helper, preferring the FMA4 variant and falling back to the
2874 generic one.  (The FMA3 case was handled above without a helper.) */
2875 #if defined(VGA_amd64)
2876 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
2877 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2878 (ULong)(HWord)h_amd64_calc_MAddF32_fma4,
2879 4, mk_RetLoc_simple(RLPri_None) ));
2880 }else
2881 #endif
2883 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2884 (ULong)(HWord)h_generic_calc_MAddF32,
2885 4, mk_RetLoc_simple(RLPri_None) ));
2888 /* fetch the result from memory at 0(%rsp); %rsp itself is
2889 preserved across the helper call. */
2890 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2891 AMD64AMode_IR(0, hregAMD64_RSP())));
2892 /* and finally, clear the space */
2893 add_to_rsp(env, 16);
2894 return dst;
2897 if (e->tag == Iex_ITE) { // VFD
2898 HReg r1, r0, dst;
2899 vassert(ty == Ity_F32);
2900 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
2901 r1 = iselFltExpr(env, e->Iex.ITE.iftrue);
2902 r0 = iselFltExpr(env, e->Iex.ITE.iffalse);
2903 dst = newVRegV(env);
2904 addInstr(env, mk_vMOVsd_RR(r1,dst));
2905 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
2906 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
2907 return dst;
2910 ppIRExpr(e);
2911 vpanic("iselFltExpr_wrk");
2915 /*---------------------------------------------------------*/
2916 /*--- ISEL: Floating point expressions (64 bit) ---*/
2917 /*---------------------------------------------------------*/
2919 /* Compute a 64-bit floating point value into the lower half of an xmm
2920 register, the identity of which is returned. As with
2921 iselIntExpr_R, the returned reg will be virtual, and it must not be
2922 changed by subsequent code emitted by the caller.  */
2925 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2927       Type                 S (1 bit)   E (11 bits)   F (52 bits)
2928       ----                 ---------   -----------   -----------
2929       signalling NaN       u           2047 (max)    .0uuuuu---u
2930                                                      (with at least
2931                                                       one 1 bit)
2932       quiet NaN            u           2047 (max)    .1uuuuu---u
2934       negative infinity    1           2047 (max)    .000000---0
2936       positive infinity    0           2047 (max)    .000000---0
2938       negative zero        1           0             .000000---0
2940       positive zero        0           0             .000000---0   */
2943 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2945 HReg r = iselDblExpr_wrk( env, e );
2946 # if 0
2947 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2948 # endif
2949 vassert(hregClass(r) == HRcVec128);
2950 vassert(hregIsVirtual(r));
2951 return r;
2954 /* DO NOT CALL THIS DIRECTLY */
2955 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2957 IRType ty = typeOfIRExpr(env->type_env,e);
2958 vassert(e);
2959 vassert(ty == Ity_F64);
2961 if (e->tag == Iex_RdTmp) {
2962 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2965 if (e->tag == Iex_Const) {
2966 union { ULong u64; Double f64; } u;
2967 HReg res = newVRegV(env);
2968 HReg tmp = newVRegI(env);
2969 vassert(sizeof(u) == 8);
2970 vassert(sizeof(u.u64) == 8);
2971 vassert(sizeof(u.f64) == 8);
2973 if (e->Iex.Const.con->tag == Ico_F64) {
2974 u.f64 = e->Iex.Const.con->Ico.F64;
2976 else if (e->Iex.Const.con->tag == Ico_F64i) {
2977 u.u64 = e->Iex.Const.con->Ico.F64i;
2979 else
2980 vpanic("iselDblExpr(amd64): const");
2982 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2983 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2984 addInstr(env, AMD64Instr_SseLdSt(
2985 True/*load*/, 8, res,
2986 AMD64AMode_IR(0, hregAMD64_RSP())
2988 add_to_rsp(env, 8);
2989 return res;
2992 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2993 AMD64AMode* am;
2994 HReg res = newVRegV(env);
2995 vassert(e->Iex.Load.ty == Ity_F64);
2996 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2997 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2998 return res;
3001 if (e->tag == Iex_Get) {
3002 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
3003 hregAMD64_RBP() );
3004 HReg res = newVRegV(env);
3005 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
3006 return res;
3009 if (e->tag == Iex_GetI) {
3010 AMD64AMode* am
3011 = genGuestArrayOffset(
3012 env, e->Iex.GetI.descr,
3013 e->Iex.GetI.ix, e->Iex.GetI.bias );
3014 HReg res = newVRegV(env);
3015 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
3016 return res;
3019 if (e->tag == Iex_Triop) {
3020 IRTriop *triop = e->Iex.Triop.details;
3021 AMD64SseOp op = Asse_INVALID;
3022 switch (triop->op) {
3023 case Iop_AddF64: op = Asse_ADDF; break;
3024 case Iop_SubF64: op = Asse_SUBF; break;
3025 case Iop_MulF64: op = Asse_MULF; break;
3026 case Iop_DivF64: op = Asse_DIVF; break;
3027 default: break;
3029 if (op != Asse_INVALID) {
3030 HReg dst = newVRegV(env);
3031 HReg argL = iselDblExpr(env, triop->arg2);
3032 HReg argR = iselDblExpr(env, triop->arg3);
3033 addInstr(env, mk_vMOVsd_RR(argL, dst));
3034 /* XXXROUNDINGFIXME */
3035 /* set roundingmode here */
3036 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3037 return dst;
3041 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
3042 IRQop *qop = e->Iex.Qop.details;
3043 HReg dst = newVRegV(env);
3044 HReg argX = iselDblExpr(env, qop->arg2);
3045 HReg argY = iselDblExpr(env, qop->arg3);
3046 HReg argZ = iselDblExpr(env, qop->arg4);
3047 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
3048 vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
3049 if (dst.u32 != argX.u32)
3050 addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
3051 addInstr(env, AMD64Instr_Avx64FLo(Asse_VFMADD213, argY, argZ, dst));
3052 return dst;
3055 /* XXXROUNDINGFIXME */
3056 /* set roundingmode here */
3057 /* subq $32, %rsp -- make a space*/
3058 sub_from_rsp(env, 32);
3059 /* Prepare 4 arg regs:
3060 leaq 0(%rsp), %rdi
3061 leaq 8(%rsp), %rsi
3062 leaq 16(%rsp), %rdx
3063 leaq 24(%rsp), %rcx  */
3065 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
3066 hregAMD64_RDI()));
3067 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
3068 hregAMD64_RSI()));
3069 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
3070 hregAMD64_RDX()));
3071 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
3072 hregAMD64_RCX()));
3073 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
3074 movsd %argX, 0(%rsi)
3075 movsd %argY, 0(%rdx)
3076 movsd %argZ, 0(%rcx)  */
3078 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
3079 AMD64AMode_IR(0, hregAMD64_RSI())));
3080 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
3081 AMD64AMode_IR(0, hregAMD64_RDX())));
3082 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
3083 AMD64AMode_IR(0, hregAMD64_RCX())));
3085 /* call the helper, preferring the FMA4 variant and falling back to the
3086 generic one.  (The FMA3 case was handled above without a helper.) */
3087 #if defined(VGA_amd64)
3088 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
3089 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
3090 (ULong)(HWord)h_amd64_calc_MAddF64_fma4,
3091 4, mk_RetLoc_simple(RLPri_None) ));
3092 }else
3093 #endif
3095 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
3096 (ULong)(HWord)h_generic_calc_MAddF64,
3097 4, mk_RetLoc_simple(RLPri_None) ));
3100 /* fetch the result from memory at 0(%rsp); %rsp itself is
3101 preserved across the helper call. */
3102 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
3103 AMD64AMode_IR(0, hregAMD64_RSP())));
3104 /* and finally, clear the space */
3105 add_to_rsp(env, 32);
3106 return dst;
3109 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3110 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3111 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3112 HReg dst = newVRegV(env);
3114 /* 'arg' now holds the value to be rounded. The first thing to do
3115 is set the FPU's rounding mode accordingly. */
3117 /* Set host x87 rounding mode */
3118 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3120 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3121 addInstr(env, AMD64Instr_A87Free(1));
3122 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3123 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
3124 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3125 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3127 /* Restore default x87 rounding. */
3128 set_FPU_rounding_default( env );
3130 return dst;
3133 IRTriop *triop = e->Iex.Triop.details;
3134 if (e->tag == Iex_Triop
3135 && (triop->op == Iop_ScaleF64
3136 || triop->op == Iop_AtanF64
3137 || triop->op == Iop_Yl2xF64
3138 || triop->op == Iop_Yl2xp1F64
3139 || triop->op == Iop_PRemF64
3140 || triop->op == Iop_PRem1F64)
3142 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3143 HReg arg1 = iselDblExpr(env, triop->arg2);
3144 HReg arg2 = iselDblExpr(env, triop->arg3);
3145 HReg dst = newVRegV(env);
3146 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3147 || triop->op == Iop_PRemF64
3148 || triop->op == Iop_PRem1F64);
3149 addInstr(env, AMD64Instr_A87Free(2));
3151 /* one arg -> top of x87 stack */
3152 addInstr(env, AMD64Instr_SseLdSt(
3153 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3154 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3156 /* other arg -> top of x87 stack */
3157 addInstr(env, AMD64Instr_SseLdSt(
3158 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3159 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3161 /* do it */
3162 /* XXXROUNDINGFIXME */
3163 /* set roundingmode here */
3164 switch (triop->op) {
3165 case Iop_ScaleF64:
3166 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3167 break;
3168 case Iop_AtanF64:
3169 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3170 break;
3171 case Iop_Yl2xF64:
3172 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3173 break;
3174 case Iop_Yl2xp1F64:
3175 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3176 break;
3177 case Iop_PRemF64:
3178 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3179 break;
3180 case Iop_PRem1F64:
3181 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3182 break;
3183 default:
3184 vassert(0);
3187 /* save result */
3188 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3189 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3190 return dst;
3193 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3194 HReg dst = newVRegV(env);
3195 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3196 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3197 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3198 set_SSE_rounding_default( env );
3199 return dst;
3202 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3203 HReg dst = newVRegV(env);
3204 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3205 set_SSE_rounding_default( env );
3206 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3207 return dst;
3210 if (e->tag == Iex_Unop
3211 && (e->Iex.Unop.op == Iop_NegF64
3212 || e->Iex.Unop.op == Iop_AbsF64)) {
3213 /* Sigh ... very rough code. Could do much better. */
3214 /* Get the 128-bit literal 00---0 10---0 into a register
3215 and xor/nand it with the value to be negated. */
3216 HReg r1 = newVRegI(env);
3217 HReg dst = newVRegV(env);
3218 HReg tmp = newVRegV(env);
3219 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3220 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3221 addInstr(env, mk_vMOVsd_RR(src,tmp));
3222 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3223 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3224 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3225 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3227 if (e->Iex.Unop.op == Iop_NegF64)
3228 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3229 else
3230 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3232 add_to_rsp(env, 16);
3233 return dst;
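      /* Illustrative sketch, not part of the original code: with the mask
         1<<63 in 'dst', XOR flips the sign bit of the low lane (NegF64),
         while ANDN (dst = ~dst & src) clears it (AbsF64). */
# if 0
      { ULong bits = 0xC000000000000000ULL;                        /* -2.0 */
        vassert((bits ^ (1ULL << 63))  == 0x4000000000000000ULL);  /*  2.0 */
        vassert((bits & ~(1ULL << 63)) == 0x4000000000000000ULL);  /*  2.0 */
      }
# endif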
3236 if (e->tag == Iex_Binop) {
3237 A87FpOp fpop = Afp_INVALID;
3238 switch (e->Iex.Binop.op) {
3239 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3240 case Iop_SinF64: fpop = Afp_SIN; break;
3241 case Iop_CosF64: fpop = Afp_COS; break;
3242 case Iop_TanF64: fpop = Afp_TAN; break;
3243 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3244 default: break;
3246 if (fpop != Afp_INVALID) {
3247 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3248 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3249 HReg dst = newVRegV(env);
3250 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3251 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3252 addInstr(env, AMD64Instr_A87Free(nNeeded));
3253 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3254 /* XXXROUNDINGFIXME */
3255 /* set roundingmode here */
3256 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3257 codes. I don't think that matters, since this insn
3258 selector never generates such an instruction intervening
3259 between a flag-setting instruction and a flag-using
3260 instruction. */
3261 addInstr(env, AMD64Instr_A87FpOp(fpop));
3262 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3263 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3264 return dst;
3268 if (e->tag == Iex_Unop) {
3269 switch (e->Iex.Unop.op) {
3270 //.. case Iop_I32toF64: {
3271 //.. HReg dst = newVRegF(env);
3272 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3273 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3274 //.. set_FPU_rounding_default(env);
3275 //.. addInstr(env, X86Instr_FpLdStI(
3276 //.. True/*load*/, 4, dst,
3277 //.. X86AMode_IR(0, hregX86_ESP())));
3278 //.. add_to_esp(env, 4);
3279 //.. return dst;
3280 //.. }
3281 case Iop_ReinterpI64asF64: {
3282 /* Given an I64, produce an IEEE754 double with the same
3283 bit pattern. */
3284 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3285 HReg dst = newVRegV(env);
3286 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3287 /* paranoia */
3288 set_SSE_rounding_default(env);
3289 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3290 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3291 return dst;
3293 case Iop_F32toF64: {
3294 HReg f32;
3295 HReg f64 = newVRegV(env);
3296 /* this shouldn't be necessary, but be paranoid ... */
3297 set_SSE_rounding_default(env);
3298 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3299 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3300 return f64;
3302 default:
3303 break;
3307 /* --------- MULTIPLEX --------- */
3308 if (e->tag == Iex_ITE) { // VFD
3309 HReg r1, r0, dst;
3310 vassert(ty == Ity_F64);
3311 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3312 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3313 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3314 dst = newVRegV(env);
3315 addInstr(env, mk_vMOVsd_RR(r1,dst));
3316 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
3317 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3318 return dst;
3321 ppIRExpr(e);
3322 vpanic("iselDblExpr_wrk");
3326 /*---------------------------------------------------------*/
3327 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3328 /*---------------------------------------------------------*/
3330 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3332 HReg r = iselVecExpr_wrk( env, e );
3333 # if 0
3334 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3335 # endif
3336 vassert(hregClass(r) == HRcVec128);
3337 vassert(hregIsVirtual(r));
3338 return r;
3342 /* DO NOT CALL THIS DIRECTLY */
3343 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3345 HWord fn = 0; /* address of helper fn, if required */
3346 Bool arg1isEReg = False;
3347 AMD64SseOp op = Asse_INVALID;
3348 vassert(e);
3349 IRType ty = typeOfIRExpr(env->type_env, e);
3350 vassert(ty == Ity_V128);
3351 UInt laneBits = 0;
3353 if (e->tag == Iex_RdTmp) {
3354 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3357 if (e->tag == Iex_Get) {
3358 HReg dst = newVRegV(env);
3359 addInstr(env, AMD64Instr_SseLdSt(
3360 True/*load*/,
3362 dst,
3363 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3366 return dst;
3369 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3370 HReg dst = newVRegV(env);
3371 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3372 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3373 return dst;
3376 if (e->tag == Iex_Const) {
3377 HReg dst = newVRegV(env);
3378 vassert(e->Iex.Const.con->tag == Ico_V128);
3379 switch (e->Iex.Const.con->Ico.V128) {
3380 case 0x0000:
3381 dst = generate_zeroes_V128(env);
3382 break;
3383 case 0xFFFF:
3384 dst = generate_ones_V128(env);
3385 break;
3386 default: {
3387 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3388 /* do push_uimm64 twice, first time for the high-order half. */
3389 push_uimm64(env, bitmask8_to_bytemask64(
3390 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3392 push_uimm64(env, bitmask8_to_bytemask64(
3393 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3395 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3396 add_to_rsp(env, 16);
3397 break;
3400 return dst;
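      /* Hedged sketch of the expansion bitmask8_to_bytemask64 (defined
         earlier in this file) is expected to perform: bit i of the 8-bit
         mask becomes byte i of the result, either 0x00 or 0xFF.  This is an
         assumption about that helper, not a copy of it. */
# if 0
      { UShort mask8 = 0xA5;
        ULong  expanded = 0;
        UInt   i;
        for (i = 0; i < 8; i++)
           if (mask8 & (1 << i)) expanded |= 0xFFULL << (8*i);
        vassert(expanded == 0xFF00FF0000FF00FFULL);
      }
# endif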
3403 if (e->tag == Iex_Unop) {
3404 switch (e->Iex.Unop.op) {
3406 case Iop_NotV128: {
3407 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3408 return do_sse_NotV128(env, arg);
3411 case Iop_CmpNEZ64x2: {
3412 /* We can use SSE2 instructions for this. */
3413 /* Ideally, we want to do a 64Ix2 comparison against zero of
3414 the operand. Problem is no such insn exists. Solution
3415 therefore is to do a 32Ix4 comparison instead, and bitwise-
3416 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3417 let the not'd result of this initial comparison be a:b:c:d.
3418 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3419 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3420 giving the required result.
3422 The required selection sequence is 2,3,0,1, which
3423 according to Intel's documentation means the pshufd
3424 literal value is 0xB1, that is,
3425 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)  */
3427 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3428 HReg tmp = generate_zeroes_V128(env);
3429 HReg dst = newVRegV(env);
3430 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3431 tmp = do_sse_NotV128(env, tmp);
3432 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3433 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3434 return dst;
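         /* Per-lane reference for CmpNEZ64x2, illustrative only: this is
            what the CMPEQ32 / NOT / PSHUFD-0xB1 / OR sequence is intended
            to compute for each 64-bit lane. */
# if 0
         { ULong lane = 0x0000000100000000ULL;      /* nonzero 64-bit lane */
           ULong ref  = (lane != 0) ? ~0ULL : 0ULL;
           vassert(ref == ~0ULL);
         }
# endif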
3437 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3438 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3439 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3440 do_CmpNEZ_vector:
3442 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3443 HReg tmp = newVRegV(env);
3444 HReg zero = generate_zeroes_V128(env);
3445 HReg dst;
3446 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3447 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3448 dst = do_sse_NotV128(env, tmp);
3449 return dst;
3452 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3453 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3454 do_32Fx4_unary:
3456 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3457 HReg dst = newVRegV(env);
3458 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3459 return dst;
3462 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3463 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3464 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3465 do_32F0x4_unary:
3467 /* A bit subtle. We have to copy the arg to the result
3468 register first, because actually doing the SSE scalar insn
3469 leaves the upper 3/4 of the destination register
3470 unchanged. Whereas the required semantics of these
3471 primops is that the upper 3/4 is simply copied in from the
3472 argument. */
3473 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3474 HReg dst = newVRegV(env);
3475 addInstr(env, mk_vMOVsd_RR(arg, dst));
3476 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3477 return dst;
3480 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3481 do_64F0x2_unary:
3483 /* A bit subtle. We have to copy the arg to the result
3484 register first, because actually doing the SSE scalar insn
3485 leaves the upper half of the destination register
3486 unchanged. Whereas the required semantics of these
3487 primops is that the upper half is simply copied in from the
3488 argument. */
3489 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3490 HReg dst = newVRegV(env);
3491 addInstr(env, mk_vMOVsd_RR(arg, dst));
3492 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3493 return dst;
3496 case Iop_32UtoV128: {
3497 // FIXME maybe just use MOVQ here?
3498 HReg dst = newVRegV(env);
3499 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3500 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3501 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3502 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3503 return dst;
3506 case Iop_64UtoV128: {
3507 // FIXME maybe just use MOVQ here?
3508 HReg dst = newVRegV(env);
3509 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3510 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3511 addInstr(env, AMD64Instr_Push(rmi));
3512 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3513 add_to_rsp(env, 8);
3514 return dst;
3517 case Iop_V256toV128_0:
3518 case Iop_V256toV128_1: {
3519 HReg vHi, vLo;
3520 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3521 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3524 case Iop_F16toF32x4: {
3525 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3526 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3527 HReg dst = newVRegV(env);
3528 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3529 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3530 return dst;
3532 break;
3535 default:
3536 break;
3537 } /* switch (e->Iex.Unop.op) */
3538 } /* if (e->tag == Iex_Unop) */
3540 if (e->tag == Iex_Binop) {
3541 switch (e->Iex.Binop.op) {
3543 case Iop_Sqrt64Fx2:
3544 case Iop_Sqrt32Fx4: {
3545 /* :: (rmode, vec) -> vec */
3546 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3547 HReg dst = newVRegV(env);
3548 /* XXXROUNDINGFIXME */
3549 /* set roundingmode here */
3550 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3551 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3552 (Asse_SQRTF, arg, dst));
3553 return dst;
3556 /* FIXME: could we generate MOVQ here? */
3557 case Iop_SetV128lo64: {
3558 HReg dst = newVRegV(env);
3559 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3560 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3561 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3562 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3563 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3564 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3565 return dst;
3568 /* FIXME: could we generate MOVD here? */
3569 case Iop_SetV128lo32: {
3570 HReg dst = newVRegV(env);
3571 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3572 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3573 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3574 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3575 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3576 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3577 return dst;
3580 case Iop_64HLtoV128: {
3581 const IRExpr* arg1 = e->Iex.Binop.arg1;
3582 const IRExpr* arg2 = e->Iex.Binop.arg2;
3583 HReg dst = newVRegV(env);
3584 HReg tmp = newVRegV(env);
3585 HReg qHi = iselIntExpr_R(env, arg1);
3586 // If the args are trivially the same (tmp or const), use the same
3587 // source register for both, and only one movq since those are
3588 // (relatively) expensive.
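// Either way the net effect is dst = (arg1 << 64) | arg2: arg1 (qHi)
// ends up in bits 127:64 and arg2 (qLo) in bits 63:0.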
3589 if (areAtomsAndEqual(arg1, arg2)) {
3590 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3591 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3592 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3593 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3594 } else {
3595 HReg qLo = iselIntExpr_R(env, arg2);
3596 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3597 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3598 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3599 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3601 return dst;
3604 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3605 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3606 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3607 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3608 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3609 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3610 do_32Fx4:
3612 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3613 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3614 HReg dst = newVRegV(env);
3615 addInstr(env, mk_vMOVsd_RR(argL, dst));
3616 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3617 return dst;
3620 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3621 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3622 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3623 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3624 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3625 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3626 do_64Fx2:
3628 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3629 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3630 HReg dst = newVRegV(env);
3631 addInstr(env, mk_vMOVsd_RR(argL, dst));
3632 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3633 return dst;
3636 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3637 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3638 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3639 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3640 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3641 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3642 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3643 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3644 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3645 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3646 do_32F0x4: {
3647 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3648 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3649 HReg dst = newVRegV(env);
3650 addInstr(env, mk_vMOVsd_RR(argL, dst));
3651 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3652 return dst;
3655 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3656 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3657 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3658 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3659 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3660 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3661 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3662 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3663 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3664 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3665 do_64F0x2: {
3666 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3667 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3668 HReg dst = newVRegV(env);
3669 addInstr(env, mk_vMOVsd_RR(argL, dst));
3670 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3671 return dst;
3674 case Iop_PermOrZero8x16:
3675 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3676 op = Asse_PSHUFB;
3677 goto do_SseReRg;
3679 // Otherwise we'll have to generate a call to
3680 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3681 // host which doesn't have SSSE3, in which case we don't expect this
3682 // IROp to enter the compilation pipeline in the first place.
3683 break;
3685 case Iop_PwExtUSMulQAdd8x16:
3686 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3687 op = Asse_PMADDUBSW;
3688 goto do_SseReRg;
3690 break;
3692 case Iop_QNarrowBin32Sto16Sx8:
3693 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3694 case Iop_QNarrowBin16Sto8Sx16:
3695 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3696 case Iop_QNarrowBin16Sto8Ux16:
3697 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3699 case Iop_InterleaveHI8x16:
3700 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3701 case Iop_InterleaveHI16x8:
3702 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3703 case Iop_InterleaveHI32x4:
3704 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3705 case Iop_InterleaveHI64x2:
3706 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3708 case Iop_InterleaveLO8x16:
3709 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3710 case Iop_InterleaveLO16x8:
3711 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3712 case Iop_InterleaveLO32x4:
3713 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3714 case Iop_InterleaveLO64x2:
3715 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3717 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3718 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3719 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3720 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3721 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3722 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3723 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3724 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3725 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3726 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3727 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3728 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3729 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3730 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3731 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3732 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3733 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3734 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3735 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3736 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3737 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3738 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3739 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3740 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3741 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3742 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3743 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3744 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3745 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3746 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3747 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3748 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3749 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3750 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
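/* For the cases above that set arg1isEReg (the pack and unpack ops),
   the operands are presented to the two-operand SSE instruction in
   swapped order: arg2 is copied into dst and arg1 is supplied as the
   E (source) operand, so the instruction sees the operands in the
   order the IROp requires. */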
3751 do_SseReRg: {
3752 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3753 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3754 HReg dst = newVRegV(env);
3755 if (arg1isEReg) {
3756 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3757 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3758 } else {
3759 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3760 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3762 return dst;
3765 case Iop_ShlN8x16: laneBits = 8; op = Asse_SHL16; goto do_SseShift;
3766 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3767 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3768 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3769 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3770 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3771 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3772 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3773 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3774 do_SseShift: {
3775 HReg dst = newVRegV(env);
3776 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3777 /* If it's a shift by an in-range immediate, generate a single
3778 instruction (or, for 8-bit lanes, the short faked-up sequence below). */
3779 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3780 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3781 vassert(c->tag == Ico_U8);
3782 UInt shift = c->Ico.U8;
3783 if (shift < laneBits) {
3784 if (laneBits == 8) {
3785 /* This instruction doesn't exist so we need to fake it using
3786 Asse_SHL16 and Asse_SHR16.
3788 We'd like to shift every byte in the 16-byte register to
3789 the left by some amount.
3791 Instead, we will make a copy and shift all the 16-bit words
3792 to the *right* by 8 and then to the left by 8 plus the
3793 shift amount. That will get us the correct answer for the
3794 upper 8 bits of each 16-bit word and zero elsewhere.
3796 Then we will shift all the 16-bit words in the original to
3797 the left by 8 plus the shift amount and then to the right
3798 by 8. This will get the correct answer for the lower 8
3799 bits of each 16-bit word and zero elsewhere.
3801 Finally, we will OR those two results together.
3803 Since the shift amount is a known constant here, all of the
3804 16-bit shifts below are emitted as shift-by-immediate instructions.
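A worked example with shift = 1 and the 16-bit word 0xAABB: the copy
computes ((0xAABB >> 8) << 9) & 0xFFFF = 0x5400, i.e. the correctly
shifted 0x54 in the upper byte; the original computes
((0xAABB << 9) & 0xFFFF) >> 8 = 0x0076, i.e. 0x76 in the lower byte;
OR-ing the two gives 0x5476, the per-byte left-shift by 1 of 0xAA and 0xBB.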
3806 AMD64SseOp reverse_op = op;
3807 switch (op) {
3808 case Asse_SHL16:
3809 reverse_op = Asse_SHR16;
3810 break;
3811 default:
3812 vpanic("Iop_ShlN8x16");
3814 HReg hi = newVRegV(env);
3815 addInstr(env, mk_vMOVsd_RR(greg, hi));
3816 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
3817 addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, hi));
3818 addInstr(env, mk_vMOVsd_RR(greg, dst));
3819 addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, dst));
3820 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
3821 addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
3822 return dst;
3824 addInstr(env, mk_vMOVsd_RR(greg, dst));
3825 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3826 return dst;
3829 /* Otherwise we have to do it the longwinded way. */
3830 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3831 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3832 HReg ereg = newVRegV(env);
3833 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3834 addInstr(env, AMD64Instr_Push(rmi));
3835 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
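/* The two pushes put the shift amount in the qword at the new %rsp and
   zero in the qword above it, so the 16-byte load leaves ereg with the
   count in bits 63:0 (and zero above); the SSE shift-by-register forms
   read their count from those low 64 bits. */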
3836 if (laneBits == 8) {
3837 /* This instruction doesn't exist so we need to fake it, in the same
3838 way as above.
3840 AMD64SseOp reverse_op = op;
3841 switch (op) {
3842 case Asse_SHL16:
3843 reverse_op = Asse_SHR16;
3844 break;
3845 default:
3846 vpanic("Iop_ShlN8x16");
3848 HReg hi = newVRegV(env);
3849 addInstr(env, mk_vMOVsd_RR(greg, hi));
3850 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
3851 addInstr(env, AMD64Instr_SseShiftN(op, 8, hi));
3852 addInstr(env, AMD64Instr_SseReRg(op, ereg, hi));
3853 addInstr(env, mk_vMOVsd_RR(greg, dst));
3854 addInstr(env, AMD64Instr_SseShiftN(op, 8, dst));
3855 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3856 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
3857 addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
3858 return dst;
3860 addInstr(env, mk_vMOVsd_RR(greg, dst));
3861 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3862 add_to_rsp(env, 16);
3863 return dst;
3866 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3867 goto do_SseAssistedBinary;
3868 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3869 goto do_SseAssistedBinary;
3870 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3871 goto do_SseAssistedBinary;
3872 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3873 goto do_SseAssistedBinary;
3874 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3875 goto do_SseAssistedBinary;
3876 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3877 goto do_SseAssistedBinary;
3878 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3879 goto do_SseAssistedBinary;
3880 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3881 goto do_SseAssistedBinary;
3882 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3883 goto do_SseAssistedBinary;
3884 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3885 goto do_SseAssistedBinary;
3886 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3887 goto do_SseAssistedBinary;
3888 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3889 goto do_SseAssistedBinary;
3890 case Iop_QNarrowBin32Sto16Ux8:
3891 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3892 goto do_SseAssistedBinary;
3893 case Iop_NarrowBin16to8x16:
3894 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3895 goto do_SseAssistedBinary;
3896 case Iop_NarrowBin32to16x8:
3897 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3898 goto do_SseAssistedBinary;
3899 do_SseAssistedBinary: {
3900 /* RRRufff! RRRufff code is what we're generating here. Oh
3901 well. */
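/* Layout of the scratch area: the result V128 lives at argp+0, argL at
   argp+16 and argR at argp+32. The three leaq's below pass those
   addresses to the helper in %rdi, %rsi and %rdx; the helper writes its
   result through the %rdi pointer, which is then reloaded into dst. */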
3902 vassert(fn != 0);
3903 HReg dst = newVRegV(env);
3904 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3905 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3906 HReg argp = newVRegI(env);
3907 /* subq $112, %rsp -- make a space*/
3908 sub_from_rsp(env, 112);
3909 /* leaq 48(%rsp), %r_argp -- point into it */
3910 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3911 argp));
3912 /* andq $-16, %r_argp -- 16-align the pointer */
3913 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3914 AMD64RMI_Imm( ~(UInt)15 ),
3915 argp));
3916 /* Prepare 3 arg regs:
3917 leaq 0(%r_argp), %rdi
3918 leaq 16(%r_argp), %rsi
3919 leaq 32(%r_argp), %rdx
3921 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3922 hregAMD64_RDI()));
3923 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3924 hregAMD64_RSI()));
3925 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3926 hregAMD64_RDX()));
3927 /* Store the two args, at (%rsi) and (%rdx):
3928 movupd %argL, 0(%rsi)
3929 movupd %argR, 0(%rdx)
3931 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3932 AMD64AMode_IR(0, hregAMD64_RSI())));
3933 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3934 AMD64AMode_IR(0, hregAMD64_RDX())));
3935 /* call the helper */
3936 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3937 3, mk_RetLoc_simple(RLPri_None) ));
3938 /* fetch the result from memory, using %r_argp, which the
3939 register allocator will keep alive across the call. */
3940 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3941 AMD64AMode_IR(0, argp)));
3942 /* and finally, clear the space */
3943 add_to_rsp(env, 112);
3944 return dst;
3947 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3948 goto do_SseAssistedVectorAndScalar;
3949 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3950 goto do_SseAssistedVectorAndScalar;
3951 do_SseAssistedVectorAndScalar: {
3952 /* RRRufff! RRRufff code is what we're generating here. Oh
3953 well. */
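/* Same scheme as do_SseAssistedBinary above, except that only the
   vector argument goes via memory (result at argp+0 in %rdi, argL at
   argp+16 in %rsi) while the scalar argument is passed directly
   in %rdx. */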
3954 vassert(fn != 0);
3955 HReg dst = newVRegV(env);
3956 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3957 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3958 HReg argp = newVRegI(env);
3959 /* subq $112, %rsp -- make a space*/
3960 sub_from_rsp(env, 112);
3961 /* leaq 48(%rsp), %r_argp -- point into it */
3962 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3963 argp));
3964 /* andq $-16, %r_argp -- 16-align the pointer */
3965 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3966 AMD64RMI_Imm( ~(UInt)15 ),
3967 argp));
3968 /* Prepare 2 vector arg regs:
3969 leaq 0(%r_argp), %rdi
3970 leaq 16(%r_argp), %rsi
3972 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3973 hregAMD64_RDI()));
3974 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3975 hregAMD64_RSI()));
3976 /* Store the vector arg, at (%rsi):
3977 movupd %argL, 0(%rsi)
3979 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3980 AMD64AMode_IR(0, hregAMD64_RSI())));
3981 /* And get the scalar value into rdx */
3982 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3984 /* call the helper */
3985 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3986 3, mk_RetLoc_simple(RLPri_None) ));
3987 /* fetch the result from memory, using %r_argp, which the
3988 register allocator will keep alive across the call. */
3989 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3990 AMD64AMode_IR(0, argp)));
3991 /* and finally, clear the space */
3992 add_to_rsp(env, 112);
3993 return dst;
3996 case Iop_I32StoF32x4:
3997 case Iop_F32toI32Sx4: {
3998 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3999 HReg dst = newVRegV(env);
4000 AMD64SseOp mop
4001 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
4002 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4003 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
4004 set_SSE_rounding_default(env);
4005 return dst;
4008 // Half-float vector conversion
4009 case Iop_F32toF16x8: {
4010 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4011 HReg srcHi, srcLo;
4012 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
4013 HReg dstHi = newVRegV(env);
4014 HReg dstLo = newVRegV(env);
4015 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
4016 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
4017 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
4018 set_SSE_rounding_default(env);
4019 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
4020 // need to compact all that into one register. There's probably a
4021 // more elegant way to do this, but ..
4022 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4023 // dstHi is now 127:64 = useful data, 63:0 = zero
4024 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4025 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
4026 // dstLo is now 127:64 = zero, 63:0 = useful data
4027 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
4028 return dstLo;
4030 break;
4033 default:
4034 break;
4035 } /* switch (e->Iex.Binop.op) */
4036 } /* if (e->tag == Iex_Binop) */
4038 if (e->tag == Iex_Triop) {
4039 IRTriop *triop = e->Iex.Triop.details;
4040 switch (triop->op) {
4042 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
4043 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
4044 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
4045 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
4046 do_64Fx2_w_rm:
4048 HReg argL = iselVecExpr(env, triop->arg2);
4049 HReg argR = iselVecExpr(env, triop->arg3);
4050 HReg dst = newVRegV(env);
4051 addInstr(env, mk_vMOVsd_RR(argL, dst));
4052 /* XXXROUNDINGFIXME */
4053 /* set roundingmode here */
4054 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
4055 return dst;
4058 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
4059 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
4060 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
4061 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
4062 do_32Fx4_w_rm:
4064 HReg argL = iselVecExpr(env, triop->arg2);
4065 HReg argR = iselVecExpr(env, triop->arg3);
4066 HReg dst = newVRegV(env);
4067 addInstr(env, mk_vMOVsd_RR(argL, dst));
4068 /* XXXROUNDINGFIXME */
4069 /* set roundingmode here */
4070 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
4071 return dst;
4074 default:
4075 break;
4076 } /* switch (triop->op) */
4077 } /* if (e->tag == Iex_Triop) */
4079 if (e->tag == Iex_ITE) { // VFD
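/* Copy the 'iftrue' value into dst first, then conditionally overwrite
   it with the 'iffalse' value: the condition is inverted (cc ^ 1) so
   the overwrite happens exactly when the guard is false. */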
4080 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
4081 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
4082 HReg dst = newVRegV(env);
4083 addInstr(env, mk_vMOVsd_RR(r1,dst));
4084 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
4085 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
4086 return dst;
4089 //vec_fail:
4090 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
4091 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4092 ppIRExpr(e);
4093 vpanic("iselVecExpr_wrk");
4097 /*---------------------------------------------------------*/
4098 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
4099 /*---------------------------------------------------------*/
4101 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4102 ISelEnv* env, const IRExpr* e )
4104 iselDVecExpr_wrk( rHi, rLo, env, e );
4105 # if 0
4106 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
4107 # endif
4108 vassert(hregClass(*rHi) == HRcVec128);
4109 vassert(hregClass(*rLo) == HRcVec128);
4110 vassert(hregIsVirtual(*rHi));
4111 vassert(hregIsVirtual(*rLo));
4115 /* DO NOT CALL THIS DIRECTLY */
4116 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4117 ISelEnv* env, const IRExpr* e )
4119 HWord fn = 0; /* address of helper fn, if required */
4120 vassert(e);
4121 IRType ty = typeOfIRExpr(env->type_env, e);
4122 vassert(ty == Ity_V256);
4123 UInt laneBits = 0;
4125 AMD64SseOp op = Asse_INVALID;
4127 /* read 256-bit IRTemp */
4128 if (e->tag == Iex_RdTmp) {
4129 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
4130 return;
4133 if (e->tag == Iex_Get) {
4134 HReg vHi = newVRegV(env);
4135 HReg vLo = newVRegV(env);
4136 HReg rbp = hregAMD64_RBP();
4137 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
4138 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
4139 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
4140 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
4141 *rHi = vHi;
4142 *rLo = vLo;
4143 return;
4146 if (e->tag == Iex_Load) {
4147 HReg vHi = newVRegV(env);
4148 HReg vLo = newVRegV(env);
4149 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
4150 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4151 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4152 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
4153 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
4154 *rHi = vHi;
4155 *rLo = vLo;
4156 return;
4159 if (e->tag == Iex_Const) {
4160 vassert(e->Iex.Const.con->tag == Ico_V256);
4161 switch (e->Iex.Const.con->Ico.V256) {
4162 case 0x00000000: {
4163 HReg vHi = generate_zeroes_V128(env);
4164 HReg vLo = newVRegV(env);
4165 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
4166 *rHi = vHi;
4167 *rLo = vLo;
4168 return;
4170 case 0xFFFFFFFF: {
4171 HReg vHi = generate_ones_V128(env);
4172 HReg vLo = newVRegV(env);
4173 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
4174 *rHi = vHi;
4175 *rLo = vLo;
4176 return;
4178 default:
4179 break; /* give up. Until such time as is necessary. */
4183 if (e->tag == Iex_Unop) {
4184 switch (e->Iex.Unop.op) {
4186 case Iop_NotV256: {
4187 HReg argHi, argLo;
4188 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4189 *rHi = do_sse_NotV128(env, argHi);
4190 *rLo = do_sse_NotV128(env, argLo);
4191 return;
4194 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
4195 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
4196 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
4197 do_32Fx8_unary:
4199 HReg argHi, argLo;
4200 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4201 HReg dstHi = newVRegV(env);
4202 HReg dstLo = newVRegV(env);
4203 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4204 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4205 *rHi = dstHi;
4206 *rLo = dstLo;
4207 return;
4210 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4211 do_64Fx4_unary:
4213 HReg argHi, argLo;
4214 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4215 HReg dstHi = newVRegV(env);
4216 HReg dstLo = newVRegV(env);
4217 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4218 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4219 *rHi = dstHi;
4220 *rLo = dstLo;
4221 return;
4224 case Iop_CmpNEZ64x4: {
4225 /* We can use SSE2 instructions for this. */
4226 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4227 (obviously). See comment on Iop_CmpNEZ64x2 for
4228 explanation of what's going on here. */
4229 HReg argHi, argLo;
4230 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4231 HReg tmpHi = generate_zeroes_V128(env);
4232 HReg tmpLo = newVRegV(env);
4233 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4234 HReg dstHi = newVRegV(env);
4235 HReg dstLo = newVRegV(env);
4236 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4237 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4238 tmpHi = do_sse_NotV128(env, tmpHi);
4239 tmpLo = do_sse_NotV128(env, tmpLo);
4240 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4241 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4242 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4243 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4244 *rHi = dstHi;
4245 *rLo = dstLo;
4246 return;
4249 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4250 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4251 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4252 do_CmpNEZ_vector:
4254 HReg argHi, argLo;
4255 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4256 HReg tmpHi = newVRegV(env);
4257 HReg tmpLo = newVRegV(env);
4258 HReg zero = generate_zeroes_V128(env);
4259 HReg dstHi, dstLo;
4260 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4261 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4262 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4263 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4264 dstHi = do_sse_NotV128(env, tmpHi);
4265 dstLo = do_sse_NotV128(env, tmpLo);
4266 *rHi = dstHi;
4267 *rLo = dstLo;
4268 return;
4271 case Iop_F16toF32x8: {
4272 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4273 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4274 HReg srcCopy = newVRegV(env);
4275 HReg dstHi = newVRegV(env);
4276 HReg dstLo = newVRegV(env);
4277 // Copy src, since we'll need to modify it.
4278 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4279 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4280 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4281 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4282 *rHi = dstHi;
4283 *rLo = dstLo;
4284 return;
4286 break;
4289 default:
4290 break;
4291 } /* switch (e->Iex.Unop.op) */
4292 } /* if (e->tag == Iex_Unop) */
4294 if (e->tag == Iex_Binop) {
4295 switch (e->Iex.Binop.op) {
4297 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4298 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4299 do_64Fx4:
4301 HReg argLhi, argLlo, argRhi, argRlo;
4302 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4303 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4304 HReg dstHi = newVRegV(env);
4305 HReg dstLo = newVRegV(env);
4306 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4307 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4308 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4309 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4310 *rHi = dstHi;
4311 *rLo = dstLo;
4312 return;
4315 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4316 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4317 do_32Fx8:
4319 HReg argLhi, argLlo, argRhi, argRlo;
4320 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4321 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4322 HReg dstHi = newVRegV(env);
4323 HReg dstLo = newVRegV(env);
4324 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4325 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4326 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4327 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4328 *rHi = dstHi;
4329 *rLo = dstLo;
4330 return;
4333 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4334 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4335 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4336 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4337 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4338 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4339 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4340 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4341 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4342 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4343 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4344 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4345 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4346 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4347 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4348 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4349 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4350 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4351 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4352 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4353 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4354 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4355 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4356 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4357 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4358 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4359 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4360 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4361 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4362 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4363 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4364 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4365 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4366 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4367 do_SseReRg:
4369 HReg argLhi, argLlo, argRhi, argRlo;
4370 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4371 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4372 HReg dstHi = newVRegV(env);
4373 HReg dstLo = newVRegV(env);
4374 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4375 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4376 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4377 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4378 *rHi = dstHi;
4379 *rLo = dstLo;
4380 return;
4383 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4384 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4385 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4386 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4387 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4388 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4389 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4390 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4391 do_SseShift: {
4392 HReg dstHi = newVRegV(env);
4393 HReg dstLo = newVRegV(env);
4394 HReg gregHi, gregLo;
4395 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4396 /* If it's a shift by an in-range immediate, generate a single
4397 shift instruction for each 128-bit half. */
4398 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4399 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4400 vassert(c->tag == Ico_U8);
4401 UInt shift = c->Ico.U8;
4402 if (shift < laneBits) {
4403 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4404 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4405 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4406 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4407 *rHi = dstHi;
4408 *rLo = dstLo;
4409 return;
4412 /* Otherwise we have to do it the longwinded way. */
4413 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4414 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4415 HReg ereg = newVRegV(env);
4416 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4417 addInstr(env, AMD64Instr_Push(rmi));
4418 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4419 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4420 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4421 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4422 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4423 add_to_rsp(env, 16);
4424 *rHi = dstHi;
4425 *rLo = dstLo;
4426 return;
4429 case Iop_V128HLtoV256: {
4430 // Curiously, there doesn't seem to be any benefit to be had here by
4431 // checking whether arg1 and arg2 are the same, in the style of how
4432 // (eg) 64HLtoV128 is handled elsewhere in this file.
4433 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4434 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4435 return;
4438 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4439 goto do_SseAssistedBinary;
4440 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4441 goto do_SseAssistedBinary;
4442 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4443 goto do_SseAssistedBinary;
4444 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4445 goto do_SseAssistedBinary;
4446 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4447 goto do_SseAssistedBinary;
4448 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4449 goto do_SseAssistedBinary;
4450 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4451 goto do_SseAssistedBinary;
4452 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4453 goto do_SseAssistedBinary;
4454 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4455 goto do_SseAssistedBinary;
4456 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4457 goto do_SseAssistedBinary;
4458 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4459 goto do_SseAssistedBinary;
4460 do_SseAssistedBinary: {
4461 /* RRRufff! RRRufff code is what we're generating here. Oh
4462 well. */
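/* The (128-bit) helper is called twice, once per half: the first call
   uses argp+0/argp+16/argp+32 as result/argL/argR for the high halves,
   the second uses argp+48/argp+64/argp+80 for the low halves. */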
4463 vassert(fn != 0);
4464 HReg dstHi = newVRegV(env);
4465 HReg dstLo = newVRegV(env);
4466 HReg argLhi, argLlo, argRhi, argRlo;
4467 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4468 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4469 HReg argp = newVRegI(env);
4470 /* subq $160, %rsp -- make a space*/
4471 sub_from_rsp(env, 160);
4472 /* leaq 48(%rsp), %r_argp -- point into it */
4473 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4474 argp));
4475 /* andq $-16, %r_argp -- 16-align the pointer */
4476 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4477 AMD64RMI_Imm( ~(UInt)15 ),
4478 argp));
4479 /* Prepare 3 arg regs:
4480 leaq 0(%r_argp), %rdi
4481 leaq 16(%r_argp), %rsi
4482 leaq 32(%r_argp), %rdx
4484 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4485 hregAMD64_RDI()));
4486 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4487 hregAMD64_RSI()));
4488 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4489 hregAMD64_RDX()));
4490 /* Store the two high args, at (%rsi) and (%rdx):
4491 movupd %argLhi, 0(%rsi)
4492 movupd %argRhi, 0(%rdx)
4494 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4495 AMD64AMode_IR(0, hregAMD64_RSI())));
4496 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4497 AMD64AMode_IR(0, hregAMD64_RDX())));
4498 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4499 movupd %argLlo, 48(%rsi)
4500 movupd %argRlo, 48(%rdx)
4502 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4503 AMD64AMode_IR(48, hregAMD64_RSI())));
4504 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4505 AMD64AMode_IR(48, hregAMD64_RDX())));
4506 /* call the helper */
4507 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4508 mk_RetLoc_simple(RLPri_None) ));
4509 /* Prepare 3 arg regs:
4510 leaq 48(%r_argp), %rdi
4511 leaq 64(%r_argp), %rsi
4512 leaq 80(%r_argp), %rdx
4514 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4515 hregAMD64_RDI()));
4516 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4517 hregAMD64_RSI()));
4518 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4519 hregAMD64_RDX()));
4520 /* call the helper */
4521 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4522 mk_RetLoc_simple(RLPri_None) ));
4523 /* fetch the result from memory, using %r_argp, which the
4524 register allocator will keep alive across the call. */
4525 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4526 AMD64AMode_IR(0, argp)));
4527 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4528 AMD64AMode_IR(48, argp)));
4529 /* and finally, clear the space */
4530 add_to_rsp(env, 160);
4531 *rHi = dstHi;
4532 *rLo = dstLo;
4533 return;
4536 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4537 goto do_SseAssistedBinary256;
4538 do_SseAssistedBinary256: {
4539 /* RRRufff! RRRufff code is what we're generating here. Oh
4540 well. */
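/* Unlike do_SseAssistedBinary above, the helper here takes whole
   256-bit values: the result occupies argp+0..31, argL argp+32..63 and
   argR argp+64..95, each stored and reloaded as two 128-bit halves
   (low half first). */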
4541 vassert(fn != 0);
4542 HReg dstHi = newVRegV(env);
4543 HReg dstLo = newVRegV(env);
4544 HReg argLhi, argLlo, argRhi, argRlo;
4545 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4546 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4547 HReg argp = newVRegI(env);
4548 /* subq $160, %rsp -- make a space*/
4549 sub_from_rsp(env, 160);
4550 /* leaq 48(%rsp), %r_argp -- point into it */
4551 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4552 argp));
4553 /* andq $-16, %r_argp -- 16-align the pointer */
4554 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4555 AMD64RMI_Imm( ~(UInt)15 ),
4556 argp));
4557 /* Prepare 3 arg regs:
4558 leaq 0(%r_argp), %rdi
4559 leaq 32(%r_argp), %rsi
4560 leaq 64(%r_argp), %rdx
4562 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4563 hregAMD64_RDI()));
4564 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4565 hregAMD64_RSI()));
4566 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4567 hregAMD64_RDX()));
4568 /* Store the two 256-bit args, each as two 128-bit halves, at (%rsi) and (%rdx):
4569 movupd %argLlo, 0(%rsi)
4570 movupd %argLhi, 16(%rsi)
4571 movupd %argRlo, 0(%rdx)
4572 movupd %argRhi, 16(%rdx)
4574 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4575 AMD64AMode_IR(0, hregAMD64_RSI())));
4576 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4577 AMD64AMode_IR(16, hregAMD64_RSI())));
4578 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4579 AMD64AMode_IR(0, hregAMD64_RDX())));
4580 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4581 AMD64AMode_IR(16, hregAMD64_RDX())));
4582 /* call the helper */
4583 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4584 mk_RetLoc_simple(RLPri_None) ));
4585 /* fetch the result from memory, using %r_argp, which the
4586 register allocator will keep alive across the call. */
4587 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4588 AMD64AMode_IR(0, argp)));
4589 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4590 AMD64AMode_IR(16, argp)));
4591 /* and finally, clear the space */
4592 add_to_rsp(env, 160);
4593 *rHi = dstHi;
4594 *rLo = dstLo;
4595 return;
4598 case Iop_I32StoF32x8:
4599 case Iop_F32toI32Sx8: {
4600 HReg argHi, argLo;
4601 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4602 HReg dstHi = newVRegV(env);
4603 HReg dstLo = newVRegV(env);
4604 AMD64SseOp mop
4605 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4606 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4607 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4608 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4609 set_SSE_rounding_default(env);
4610 *rHi = dstHi;
4611 *rLo = dstLo;
4612 return;
4615 default:
4616 break;
4617 } /* switch (e->Iex.Binop.op) */
4618 } /* if (e->tag == Iex_Binop) */
4620 if (e->tag == Iex_Triop) {
4621 IRTriop *triop = e->Iex.Triop.details;
4622 switch (triop->op) {
4624 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4625 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4626 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4627 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4628 do_64Fx4_w_rm:
4630 HReg argLhi, argLlo, argRhi, argRlo;
4631 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4632 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4633 HReg dstHi = newVRegV(env);
4634 HReg dstLo = newVRegV(env);
4635 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4636 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4637 /* XXXROUNDINGFIXME */
4638 /* set roundingmode here */
4639 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4640 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4641 *rHi = dstHi;
4642 *rLo = dstLo;
4643 return;
4646 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4647 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4648 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4649 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4650 do_32Fx8_w_rm:
4652 HReg argLhi, argLlo, argRhi, argRlo;
4653 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4654 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4655 HReg dstHi = newVRegV(env);
4656 HReg dstLo = newVRegV(env);
4657 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4658 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4659 /* XXXROUNDINGFIXME */
4660 /* set roundingmode here */
4661 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4662 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4663 *rHi = dstHi;
4664 *rLo = dstLo;
4665 return;
4668 default:
4669 break;
4670 } /* switch (triop->op) */
4671 } /* if (e->tag == Iex_Triop) */
4674 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4675 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4676 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4677 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4678 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4679 // If the args are trivially the same (tmp or const), use the same
4680 // source register for all four, and only one movq since those are
4681 // (relatively) expensive.
4682 if (areAtomsAndEqual(arg1, arg2)
4683 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4684 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4685 HReg tmp = newVRegV(env);
4686 HReg dst = newVRegV(env);
4687 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4688 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4689 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4690 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4691 *rHi = dst;
4692 *rLo = dst;
4693 } else {
4694 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
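/* The two sequences below compute dstHi = (q3 << 64) | q2 and
   dstLo = (q1 << 64) | q0, using the same movq/shift/or idiom as
   Iop_64HLtoV128 earlier in this file. */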
4695 HReg q3 = iselIntExpr_R(env, arg1);
4696 HReg q2 = iselIntExpr_R(env, arg2);
4697 HReg q1 = iselIntExpr_R(env, arg3);
4698 HReg q0 = iselIntExpr_R(env, arg4);
4699 HReg tmp = newVRegV(env);
4700 HReg dstHi = newVRegV(env);
4701 HReg dstLo = newVRegV(env);
4702 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4703 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4704 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4705 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4706 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4707 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4708 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4709 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4710 *rHi = dstHi;
4711 *rLo = dstLo;
4713 return;
4716 if (e->tag == Iex_ITE) {
4717 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4718 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4719 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4720 HReg dstHi = newVRegV(env);
4721 HReg dstLo = newVRegV(env);
4722 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4723 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4724 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
4725 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4726 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4727 *rHi = dstHi;
4728 *rLo = dstLo;
4729 return;
4732 //avx_fail:
4733 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4734 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4735 ppIRExpr(e);
4736 vpanic("iselDVecExpr_wrk");
4740 /*---------------------------------------------------------*/
4741 /*--- ISEL: Statements ---*/
4742 /*---------------------------------------------------------*/
4744 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4746 if (vex_traceflags & VEX_TRACE_VCODE) {
4747 vex_printf("\n-- ");
4748 ppIRStmt(stmt);
4749 vex_printf("\n");
4752 switch (stmt->tag) {
4754 /* --------- LOADG (guarded load) --------- */
4755 case Ist_LoadG: {
4756 IRLoadG* lg = stmt->Ist.LoadG.details;
4757 if (lg->end != Iend_LE)
4758 goto stmt_fail;
4760 UChar szB = 0; /* invalid */
4761 switch (lg->cvt) {
4762 case ILGop_Ident32: szB = 4; break;
4763 case ILGop_Ident64: szB = 8; break;
4764 case ILGop_IdentV128: szB = 16; break;
4765 default: break;
4767 if (szB == 0)
4768 goto stmt_fail;
4770 AMD64AMode* amAddr
4771 = iselIntExpr_AMode(env, lg->addr);
4772 HReg rAlt
4773 = szB == 16 ? iselVecExpr(env, lg->alt)
4774 : iselIntExpr_R(env, lg->alt);
4775 HReg rDst
4776 = lookupIRTemp(env, lg->dst);
4778 /* Get the alt value into the dst. We'll do a conditional load
4779 which overwrites it -- or not -- with loaded data. */
4780 if (szB == 16) {
4781 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4782 } else {
4783 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4785 AMD64CondCode cc = iselCondCode_C(env, lg->guard);
4786 if (szB == 16) {
4787 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4788 } else {
4789 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4791 return;
4794 /* --------- STOREG (guarded store) --------- */
4795 case Ist_StoreG: {
4796 IRStoreG* sg = stmt->Ist.StoreG.details;
4797 if (sg->end != Iend_LE)
4798 goto stmt_fail;
4800 UChar szB = 0; /* invalid */
4801 switch (typeOfIRExpr(env->type_env, sg->data)) {
4802 case Ity_I32: szB = 4; break;
4803 case Ity_I64: szB = 8; break;
4804 case Ity_V128: szB = 16; break;
4805 default: break;
4807 if (szB == 0)
4808 goto stmt_fail;
4810 AMD64AMode* amAddr
4811 = iselIntExpr_AMode(env, sg->addr);
4812 HReg rSrc
4813 = szB == 16 ? iselVecExpr(env, sg->data)
4814 : iselIntExpr_R(env, sg->data);
4815 AMD64CondCode cc
4816 = iselCondCode_C(env, sg->guard);
4817 if (szB == 16) {
4818 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4819 } else {
4820 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4822 return;
4825 /* --------- STORE --------- */
4826 case Ist_Store: {
4827 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4828 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4829 IREndness end = stmt->Ist.Store.end;
4831 if (tya != Ity_I64 || end != Iend_LE)
4832 goto stmt_fail;
4834 if (tyd == Ity_I64) {
4835 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4836 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4837 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4838 return;
4840 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4841 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4842 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4843 addInstr(env, AMD64Instr_Store(
4844 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4845 r,am));
4846 return;
4848 if (tyd == Ity_F64) {
4849 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4850 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4851 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4852 return;
4854 if (tyd == Ity_F32) {
4855 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4856 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4857 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4858 return;
4860 if (tyd == Ity_V128) {
4861 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4862 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4863 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4864 return;
4866 if (tyd == Ity_V256) {
4867 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4868 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4869 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4870 HReg vHi, vLo;
4871 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4872 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4873 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4874 return;
4876 break;
4879 /* --------- PUT --------- */
4880 case Ist_Put: {
4881 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4882 if (ty == Ity_I64) {
4883 /* We're going to write to memory, so compute the RHS into an
4884 AMD64RI. */
4885 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4886 addInstr(env,
4887 AMD64Instr_Alu64M(
4888 Aalu_MOV,
4889 ri,
4890 AMD64AMode_IR(stmt->Ist.Put.offset,
4891 hregAMD64_RBP())
4892 ));
4893 return;
4895 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4896 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4897 addInstr(env, AMD64Instr_Store(
4898 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4899 r,
4900 AMD64AMode_IR(stmt->Ist.Put.offset,
4901 hregAMD64_RBP())));
4902 return;
4904 if (ty == Ity_F32) {
4905 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4906 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4907 set_SSE_rounding_default(env); /* paranoia */
4908 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4909 return;
4911 if (ty == Ity_F64) {
4912 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4913 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4914 hregAMD64_RBP() );
4915 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4916 return;
4918 if (ty == Ity_V128) {
4919 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4920 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4921 hregAMD64_RBP());
4922 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4923 return;
4925 if (ty == Ity_V256) {
4926 HReg vHi, vLo;
4927 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4928 HReg rbp = hregAMD64_RBP();
4929 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4930 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4931 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4932 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4933 return;
4935 break;
4938 /* --------- Indexed PUT --------- */
4939 case Ist_PutI: {
4940 IRPutI *puti = stmt->Ist.PutI.details;
4942 AMD64AMode* am
4943 = genGuestArrayOffset(
4944 env, puti->descr,
4945 puti->ix, puti->bias );
4947 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4948 if (ty == Ity_F64) {
4949 HReg val = iselDblExpr(env, puti->data);
4950 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4951 return;
4953 if (ty == Ity_I8) {
4954 HReg r = iselIntExpr_R(env, puti->data);
4955 addInstr(env, AMD64Instr_Store( 1, r, am ));
4956 return;
4958 if (ty == Ity_I64) {
4959 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4960 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4961 return;
4963 break;
4966 /* --------- TMP --------- */
4967 case Ist_WrTmp: {
4968 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4969 IRType ty = typeOfIRTemp(env->type_env, tmp);
4971 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4972 compute it into an AMode and then use LEA. This usually
4973 produces fewer instructions, often because (for memcheck
4974 created IR) we get t = address-expression, (t is later used
4975 twice) and so doing this naturally turns address-expression
4976 back into an AMD64 amode. */
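/* For example (illustrative temps): t3 = Add64(t1, 0x10:I64) becomes a
   single 'leaq 16(%r_t1), %r_t3' instead of a move followed by an add. */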
4977 if (ty == Ity_I64
4978 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4979 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4980 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4981 HReg dst = lookupIRTemp(env, tmp);
4982 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4983 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4984 value into a register. Just emit a normal reg-reg move
4985 so reg-alloc can coalesce it away in the usual way. */
4986 HReg src = am->Aam.IR.reg;
4987 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4988 } else {
4989 addInstr(env, AMD64Instr_Lea64(am,dst));
4991 return;
4994 if (ty == Ity_I64 || ty == Ity_I32
4995 || ty == Ity_I16 || ty == Ity_I8) {
4996 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4997 HReg dst = lookupIRTemp(env, tmp);
4998 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4999 return;
5001 if (ty == Ity_I128) {
5002 HReg rHi, rLo, dstHi, dstLo;
5003 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
5004 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
5005 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
5006 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
5007 return;
5008 }
5009 if (ty == Ity_I1) {
5010 AMD64CondCode cond = iselCondCode_C(env, stmt->Ist.WrTmp.data);
5011 HReg dst = lookupIRTemp(env, tmp);
5012 addInstr(env, AMD64Instr_Set64(cond, dst));
5013 return;
5014 }
5015 if (ty == Ity_F64) {
5016 HReg dst = lookupIRTemp(env, tmp);
5017 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
5018 addInstr(env, mk_vMOVsd_RR(src, dst));
5019 return;
5020 }
5021 if (ty == Ity_F32) {
5022 HReg dst = lookupIRTemp(env, tmp);
5023 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
5024 addInstr(env, mk_vMOVsd_RR(src, dst));
5025 return;
5026 }
5027 if (ty == Ity_V128) {
5028 HReg dst = lookupIRTemp(env, tmp);
5029 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
5030 addInstr(env, mk_vMOVsd_RR(src, dst));
5031 return;
5032 }
5033 if (ty == Ity_V256) {
5034 HReg rHi, rLo, dstHi, dstLo;
5035 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
5036 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
5037 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
5038 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
5039 return;
5040 }
5041 break;
5042 }
5044 /* --------- Call to DIRTY helper --------- */
5045 case Ist_Dirty: {
5046 IRDirty* d = stmt->Ist.Dirty.details;
5048 /* Figure out the return type, if any. */
5049 IRType retty = Ity_INVALID;
5050 if (d->tmp != IRTemp_INVALID)
5051 retty = typeOfIRTemp(env->type_env, d->tmp);
5053 /* Throw out any return types we don't know about. */
5054 Bool retty_ok = False;
5055 switch (retty) {
5056 case Ity_INVALID: /* function doesn't return anything */
5057 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
5058 case Ity_V128: case Ity_V256:
5059 retty_ok = True; break;
5060 default:
5061 break;
5062 }
5063 if (!retty_ok)
5064 break; /* will go to stmt_fail: */
5066 /* Marshal args, do the call, and set the return value to
5067 0x555..555 if this is a conditional call that returns a value
5068 and the call is skipped. */
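/* In outline: doHelperCall marshals the arguments, emits the
   (possibly guarded) call, and reports back via rloc where the
   result was left -- in %rax for integer returns, or in an
   %rsp-relative slot for V128/V256 returns -- and via addToSp how
   many bytes of stack must be popped once the result has been
   fetched.  The cases below act on that information. */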
5069 UInt addToSp = 0;
5070 RetLoc rloc = mk_RetLoc_INVALID();
5071 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
5072 vassert(is_sane_RetLoc(rloc));
5074 /* Now figure out what to do with the returned value, if any. */
5075 switch (retty) {
5076 case Ity_INVALID: {
5077 /* No return value. Nothing to do. */
5078 vassert(d->tmp == IRTemp_INVALID);
5079 vassert(rloc.pri == RLPri_None);
5080 vassert(addToSp == 0);
5081 return;
5082 }
5083 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
5084 /* The returned value is in %rax. Park it in the register
5085 associated with tmp. */
5086 vassert(rloc.pri == RLPri_Int);
5087 vassert(addToSp == 0);
5088 HReg dst = lookupIRTemp(env, d->tmp);
5089 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
5090 return;
5091 }
5092 case Ity_V128: {
5093 /* The returned value is on the stack, and rloc.spOff
5094 tells us where. Fish it off the stack and then move
5095 the stack pointer upwards to clear it, as directed by
5096 doHelperCall. */
5097 vassert(rloc.pri == RLPri_V128SpRel);
5098 vassert(addToSp >= 16);
5099 HReg dst = lookupIRTemp(env, d->tmp);
5100 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
5101 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
5102 add_to_rsp(env, addToSp);
5103 return;
5104 }
5105 case Ity_V256: {
5106 /* See comments for Ity_V128. */
5107 vassert(rloc.pri == RLPri_V256SpRel);
5108 vassert(addToSp >= 32);
5109 HReg dstLo, dstHi;
5110 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
5111 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
5112 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
5113 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
5114 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
5115 add_to_rsp(env, addToSp);
5116 return;
5117 }
5118 default:
5119 /*NOTREACHED*/
5120 vassert(0);
5121 }
5122 break;
5123 }
5125 /* --------- MEM FENCE --------- */
5126 case Ist_MBE:
5127 switch (stmt->Ist.MBE.event) {
5128 case Imbe_Fence:
5129 addInstr(env, AMD64Instr_MFence());
5130 return;
5131 default:
5132 break;
5133 }
5134 break;
5136 /* --------- ACAS --------- */
5137 case Ist_CAS:
5138 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
5139 /* "normal" singleton CAS */
5140 UChar sz;
5141 IRCAS* cas = stmt->Ist.CAS.details;
5142 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
5143 /* get: cas->expd into %rax, and cas->data into %rbx */
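/* Sketch of the expected sequence, assuming ACAS turns into a
   lock-prefixed cmpxchg: rOld is preloaded with the expected value,
   then "lock cmpxchg{b,w,l,q} %bl/%bx/%ebx/%rbx, (am)" compares
   %rax with memory and, on success, stores %rbx there.  On failure
   (ZF clear) the actual memory value is left in %rax, and the
   Acc_NZ cmov below copies it into rOld. */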
5144 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
5145 HReg rData = iselIntExpr_R(env, cas->dataLo);
5146 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
5147 HReg rOld = lookupIRTemp(env, cas->oldLo);
5148 vassert(cas->expdHi == NULL);
5149 vassert(cas->dataHi == NULL);
5150 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
5151 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
5152 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
5153 switch (ty) {
5154 case Ity_I64: sz = 8; break;
5155 case Ity_I32: sz = 4; break;
5156 case Ity_I16: sz = 2; break;
5157 case Ity_I8: sz = 1; break;
5158 default: goto unhandled_cas;
5159 }
5160 addInstr(env, AMD64Instr_ACAS(am, sz));
5161 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
5162 return;
5163 } else {
5164 /* double CAS */
5165 UChar sz;
5166 IRCAS* cas = stmt->Ist.CAS.details;
5167 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
5168 /* only 32-bit and 64-bit allowed in this case */
5169 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
5170 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
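/* Sketch, assuming DACAS turns into lock cmpxchg8b/16b: the
   instruction compares %rdx:%rax (the expected pair) with memory
   and, if equal, stores %rcx:%rbx (the new data); otherwise it
   loads the actual pair into %rdx:%rax, which the Acc_NZ cmovs
   below copy into rOldHi:rOldLo.  cmpxchg16b needs the CX16
   capability, hence the hwcaps check for the 64-bit case. */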
5171 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
5172 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
5173 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
5174 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
5175 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
5176 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
5177 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
5178 switch (ty) {
5179 case Ity_I64:
5180 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
5181 goto unhandled_cas; /* we'd have to generate
5182 cmpxchg16b, but the host
5183 doesn't support that */
5184 sz = 8;
5185 break;
5186 case Ity_I32:
5187 sz = 4;
5188 break;
5189 default:
5190 goto unhandled_cas;
5191 }
5192 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
5193 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
5194 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
5195 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
5196 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
5197 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
5198 addInstr(env, AMD64Instr_DACAS(am, sz));
5199 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5200 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5201 return;
5202 }
5203 unhandled_cas:
5204 break;
5206 /* --------- INSTR MARK --------- */
5207 /* Doesn't generate any executable code ... */
5208 case Ist_IMark:
5209 return;
5211 /* --------- ABI HINT --------- */
5212 /* These have no meaning (denotation in the IR) and so we ignore
5213 them ... if any actually made it this far. */
5214 case Ist_AbiHint:
5215 return;
5217 /* --------- NO-OP --------- */
5218 case Ist_NoOp:
5219 return;
5221 /* --------- EXIT --------- */
5222 case Ist_Exit: {
5223 if (stmt->Ist.Exit.dst->tag != Ico_U64)
5224 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5226 AMD64CondCode cc = iselCondCode_C(env, stmt->Ist.Exit.guard);
5227 AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
5228 hregAMD64_RBP());
5230 /* Case: boring transfer to known address */
5231 if (stmt->Ist.Exit.jk == Ijk_Boring) {
5232 if (env->chainingAllowed) {
5233 /* .. almost always true .. */
5234 /* Skip the event check at the dst if this is a forwards
5235 edge. */
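/* "Forwards" here means the destination lies beyond this
   superblock's highest guest address (env->max_ga).  The idea is
   that any cycle in guest control flow must contain at least one
   non-forwards edge, which goes via the slow entry point and so
   still performs an event check; forwards edges can therefore
   safely chain to the fast entry point and skip it. */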
5236 Bool toFastEP
5237 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
5238 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
5239 addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
5240 amRIP, cc, toFastEP));
5241 } else {
5242 /* .. very occasionally .. */
5243 /* We can't use chaining, so ask for an assisted transfer,
5244 as that's the only alternative that is allowable. */
5245 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5246 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
5247 }
5248 return;
5249 }
5251 /* Case: assisted transfer to arbitrary address */
5252 switch (stmt->Ist.Exit.jk) {
5253 /* Keep this list in sync with that in iselNext below */
5254 case Ijk_ClientReq:
5255 case Ijk_EmWarn:
5256 case Ijk_NoDecode:
5257 case Ijk_NoRedir:
5258 case Ijk_SigSEGV:
5259 case Ijk_SigBUS:
5260 case Ijk_SigTRAP:
5261 case Ijk_Sys_syscall:
5262 case Ijk_Sys_int210:
5263 case Ijk_InvalICache:
5264 case Ijk_Yield:
5265 {
5266 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5267 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
5268 return;
5269 }
5270 default:
5271 break;
5272 }
5274 /* Do we ever expect to see any other kind? */
5275 goto stmt_fail;
5276 }
5278 default: break;
5279 }
5280 stmt_fail:
5281 ppIRStmt(stmt);
5282 vpanic("iselStmt(amd64)");
5283 }
5286 /*---------------------------------------------------------*/
5287 /*--- ISEL: Basic block terminators (Nexts) ---*/
5288 /*---------------------------------------------------------*/
5290 static void iselNext ( ISelEnv* env,
5291 IRExpr* next, IRJumpKind jk, Int offsIP )
5292 {
5293 if (vex_traceflags & VEX_TRACE_VCODE) {
5294 vex_printf( "\n-- PUT(%d) = ", offsIP);
5295 ppIRExpr( next );
5296 vex_printf( "; exit-");
5297 ppIRJumpKind(jk);
5298 vex_printf( "\n");
5299 }
5301 /* Case: boring transfer to known address */
5302 if (next->tag == Iex_Const) {
5303 IRConst* cdst = next->Iex.Const.con;
5304 vassert(cdst->tag == Ico_U64);
5305 if (jk == Ijk_Boring || jk == Ijk_Call) {
5306 /* Boring transfer to known address */
5307 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5308 if (env->chainingAllowed) {
5309 /* .. almost always true .. */
5310 /* Skip the event check at the dst if this is a forwards
5311 edge. */
5312 Bool toFastEP
5313 = ((Addr64)cdst->Ico.U64) > env->max_ga;
5314 if (0) vex_printf("%s", toFastEP ? "X" : ".");
5315 addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5316 amRIP, Acc_ALWAYS,
5317 toFastEP));
5318 } else {
5319 /* .. very occasionally .. */
5320 /* We can't use chaining, so ask for an indirect transfer,
5321 as that's the cheapest alternative that is
5322 allowable. */
5323 HReg r = iselIntExpr_R(env, next);
5324 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5325 Ijk_Boring));
5326 }
5327 return;
5328 }
5329 }
5331 /* Case: call/return (==boring) transfer to any address */
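/* Rough distinction: XIndir is a plain indirect transfer back to
   the dispatcher and is only usable when chaining is allowed,
   whereas XAssisted additionally hands the dispatcher a jump kind
   (here Ijk_Boring) saying why the translation exited.  See
   host_amd64_defs.c for the definitive semantics. */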
5332 switch (jk) {
5333 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5334 HReg r = iselIntExpr_R(env, next);
5335 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5336 if (env->chainingAllowed) {
5337 addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5338 } else {
5339 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5340 Ijk_Boring));
5341 }
5342 return;
5343 }
5344 default:
5345 break;
5346 }
5348 /* Case: assisted transfer to arbitrary address */
5349 switch (jk) {
5350 /* Keep this list in sync with that for Ist_Exit above */
5351 case Ijk_ClientReq:
5352 case Ijk_EmWarn:
5353 case Ijk_NoDecode:
5354 case Ijk_NoRedir:
5355 case Ijk_SigSEGV:
5356 case Ijk_SigBUS:
5357 case Ijk_SigTRAP:
5358 case Ijk_Sys_syscall:
5359 case Ijk_Sys_int210:
5360 case Ijk_InvalICache:
5361 case Ijk_Yield: {
5362 HReg r = iselIntExpr_R(env, next);
5363 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5364 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5365 return;
5366 }
5367 default:
5368 break;
5369 }
5371 vex_printf( "\n-- PUT(%d) = ", offsIP);
5372 ppIRExpr( next );
5373 vex_printf( "; exit-");
5374 ppIRJumpKind(jk);
5375 vex_printf( "\n");
5376 vassert(0); // are we expecting any other kind?
5377 }
5380 /*---------------------------------------------------------*/
5381 /*--- Insn selector top-level ---*/
5382 /*---------------------------------------------------------*/
5384 /* Translate an entire SB to amd64 code. */
5386 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5387 VexArch arch_host,
5388 const VexArchInfo* archinfo_host,
5389 const VexAbiInfo* vbi/*UNUSED*/,
5390 Int offs_Host_EvC_Counter,
5391 Int offs_Host_EvC_FailAddr,
5392 Bool chainingAllowed,
5393 Bool addProfInc,
5394 Addr max_ga )
5395 {
5396 Int i, j;
5397 HReg hreg, hregHI;
5398 ISelEnv* env;
5399 UInt hwcaps_host = archinfo_host->hwcaps;
5400 AMD64AMode *amCounter, *amFailAddr;
5402 /* sanity ... */
5403 vassert(arch_host == VexArchAMD64);
5404 vassert(0 == (hwcaps_host
5405 & ~(VEX_HWCAPS_AMD64_SSE3
5406 | VEX_HWCAPS_AMD64_SSSE3
5407 | VEX_HWCAPS_AMD64_CX16
5408 | VEX_HWCAPS_AMD64_LZCNT
5409 | VEX_HWCAPS_AMD64_AVX
5410 | VEX_HWCAPS_AMD64_RDTSCP
5411 | VEX_HWCAPS_AMD64_BMI
5412 | VEX_HWCAPS_AMD64_AVX2
5413 | VEX_HWCAPS_AMD64_F16C
5414 | VEX_HWCAPS_AMD64_RDRAND
5415 | VEX_HWCAPS_AMD64_RDSEED
5416 | VEX_HWCAPS_AMD64_FMA3
5417 | VEX_HWCAPS_AMD64_FMA4)));
5419 /* Check that the host's endianness is as expected. */
5420 vassert(archinfo_host->endness == VexEndnessLE);
5422 /* Make up an initial environment to use. */
5423 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5424 env->vreg_ctr = 0;
5426 /* Set up output code array. */
5427 env->code = newHInstrArray();
5429 /* Copy BB's type env. */
5430 env->type_env = bb->tyenv;
5432 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
5433 change as we go along. */
5434 env->n_vregmap = bb->tyenv->types_used;
5435 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5436 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
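/* Convention: for Ity_I128 and Ity_V256 temps, vregmap holds the
   low-half register and vregmapHI the high half; the
   lookupIRTempPair calls in iselStmt above rely on this pairing. */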
5438 /* and finally ... */
5439 env->chainingAllowed = chainingAllowed;
5440 env->hwcaps = hwcaps_host;
5441 env->max_ga = max_ga;
5443 /* For each IR temporary, allocate a suitably-kinded virtual
5444 register. */
5445 j = 0;
5446 for (i = 0; i < env->n_vregmap; i++) {
5447 hregHI = hreg = INVALID_HREG;
5448 switch (bb->tyenv->types[i]) {
5449 case Ity_I1:
5450 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5451 hreg = mkHReg(True, HRcInt64, 0, j++);
5452 break;
5453 case Ity_I128:
5454 hreg = mkHReg(True, HRcInt64, 0, j++);
5455 hregHI = mkHReg(True, HRcInt64, 0, j++);
5456 break;
5457 case Ity_F32:
5458 case Ity_F64:
5459 case Ity_V128:
5460 hreg = mkHReg(True, HRcVec128, 0, j++);
5461 break;
5462 case Ity_V256:
5463 hreg = mkHReg(True, HRcVec128, 0, j++);
5464 hregHI = mkHReg(True, HRcVec128, 0, j++);
5465 break;
5466 default:
5467 ppIRType(bb->tyenv->types[i]);
5468 vpanic("iselBB(amd64): IRTemp type");
5469 }
5470 env->vregmap[i] = hreg;
5471 env->vregmapHI[i] = hregHI;
5472 }
5473 env->vreg_ctr = j;
5475 /* The very first instruction must be an event check. */
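/* Roughly: EvCheck decrements the down-counter held at amCounter
   and, if it goes negative, jumps to the dispatcher address stored
   at amFailAddr; otherwise it falls through into the block.  This
   bounds how long generated code can run before control returns to
   the scheduler. */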
5476 amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
5477 amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5478 addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
5480 /* Possibly a block counter increment (for profiling). At this
5481 point we don't know the address of the counter, so just pretend
5482 it is zero. It will have to be patched later, but before this
5483 translation is used, by a call to LibVEX_patchProfCtr. */
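/* ProfInc is expected to expand to an increment of a 64-bit counter
   at a placeholder address; LibVEX_patchProfCtr later overwrites
   that placeholder with the real counter address. */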
5484 if (addProfInc) {
5485 addInstr(env, AMD64Instr_ProfInc());
5486 }
5488 /* Ok, finally we can iterate over the statements. */
5489 for (i = 0; i < bb->stmts_used; i++)
5490 if (bb->stmts[i])
5491 iselStmt(env, bb->stmts[i]);
5493 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5495 /* record the number of vregs we used. */
5496 env->code->n_vregs = env->vreg_ctr;
5497 return env->code;
5498 }
5501 /*---------------------------------------------------------------*/
5502 /*--- end host_amd64_isel.c ---*/
5503 /*---------------------------------------------------------------*/