amd64: Implement RDRAND, VCVTPH2PS and VCVTPS2PH.
2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
28 The GNU General Public License is contained in the file COPYING.
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
36 #include "libvex_basictypes.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
40 #include "ir_match.h"
41 #include "main_util.h"
42 #include "main_globals.h"
43 #include "host_generic_regs.h"
44 #include "host_generic_simd64.h"
45 #include "host_generic_simd128.h"
46 #include "host_generic_simd256.h"
47 #include "host_generic_maddf.h"
48 #include "host_amd64_defs.h"
51 /*---------------------------------------------------------*/
52 /*--- x87/SSE control word stuff ---*/
53 /*---------------------------------------------------------*/
55 /* Vex-generated code expects to run with the FPU set as follows: all
56 exceptions masked, round-to-nearest, precision = 53 bits. This
57 corresponds to an FPU control word value of 0x027F.
59 Similarly the SSE control word (%mxcsr) should be 0x1F80.
61 %fpucw and %mxcsr should have these values on entry to
62 Vex-generated code, and those values should be
63 unchanged at exit.
66 #define DEFAULT_FPUCW 0x027F
68 #define DEFAULT_MXCSR 0x1F80
70 /* debugging only, do not use */
71 /* define DEFAULT_FPUCW 0x037F */
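/* For reference, a rough decomposition of the two defaults above,
   assuming the standard x87 FPCW and SSE MXCSR bit layouts:

      DEFAULT_FPUCW = 0x027F
         bits 0..5    all x87 exceptions masked
         bit  6       reserved
         bits 8..9    = 10b: precision control = 53 bits (double)
         bits 10..11  = 00b: rounding = to-nearest

      DEFAULT_MXCSR = 0x1F80
         bits 7..12   all SSE exceptions masked
         bits 13..14  = 00b: rounding = to-nearest
         FTZ (bit 15) and DAZ (bit 6) both clear
*/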
74 /*---------------------------------------------------------*/
75 /*--- misc helpers ---*/
76 /*---------------------------------------------------------*/
78 /* These are duplicated in guest-amd64/toIR.c */
79 static IRExpr* unop ( IROp op, IRExpr* a )
81 return IRExpr_Unop(op, a);
84 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
86 return IRExpr_Binop(op, a1, a2);
89 static IRExpr* bind ( Int binder )
91 return IRExpr_Binder(binder);
94 static Bool isZeroU8 ( const IRExpr* e )
96 return e->tag == Iex_Const
97 && e->Iex.Const.con->tag == Ico_U8
98 && e->Iex.Const.con->Ico.U8 == 0;
102 /*---------------------------------------------------------*/
103 /*--- ISelEnv ---*/
104 /*---------------------------------------------------------*/
106 /* This carries around:
108 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109 might encounter. This is computed before insn selection starts,
110 and does not change.
112 - A mapping from IRTemp to HReg. This tells the insn selector
113 which virtual register is associated with each IRTemp
114 temporary. This is computed before insn selection starts, and
115 does not change. We expect this mapping to map precisely the
116 same set of IRTemps as the type mapping does.
118 - vregmap holds the primary register for the IRTemp.
119 - vregmapHI is only used for 128-bit integer-typed
120 IRTemps. It holds the identity of a second
121 64-bit virtual HReg, which holds the high half
122 of the value.
124 - The host subarchitecture we are selecting insns for.
125 This is set at the start and does not change.
127 - The code array, that is, the insns selected so far.
129 - A counter, for generating new virtual registers.
131 - A Bool for indicating whether we may generate chain-me
132 instructions for control flow transfers, or whether we must use
133 XAssisted.
135 - The maximum guest address of any guest insn in this block.
136 Actually, the address of the highest-addressed byte from any insn
137 in this block. Is set at the start and does not change. This is
138 used for detecting jumps which are definitely forward-edges from
139 this block, and therefore can be made (chained) to the fast entry
140 point of the destination, thereby avoiding the destination's
141 event check.
143 Note, this is all host-independent. (JRS 20050201: well, kinda
144 ... not completely. Compare with ISelEnv for X86.)
147 typedef
148 struct {
149 /* Constant fields -- set at the start and do not change. */
150 IRTypeEnv* type_env;
152 HReg* vregmap;
153 HReg* vregmapHI;
154 Int n_vregmap;
156 UInt hwcaps;
158 Bool chainingAllowed;
159 Addr64 max_ga;
161 /* These are modified as we go along. */
162 HInstrArray* code;
163 Int vreg_ctr;
165 ISelEnv;
168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
170 vassert(tmp >= 0);
171 vassert(tmp < env->n_vregmap);
172 return env->vregmap[tmp];
175 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
176 ISelEnv* env, IRTemp tmp )
178 vassert(tmp >= 0);
179 vassert(tmp < env->n_vregmap);
180 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
181 *vrLO = env->vregmap[tmp];
182 *vrHI = env->vregmapHI[tmp];
185 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
187 addHInstr(env->code, instr);
188 if (vex_traceflags & VEX_TRACE_VCODE) {
189 ppAMD64Instr(instr, True);
190 vex_printf("\n");
194 static HReg newVRegI ( ISelEnv* env )
196 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
197 env->vreg_ctr++;
198 return reg;
201 static HReg newVRegV ( ISelEnv* env )
203 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
204 env->vreg_ctr++;
205 return reg;
209 /*---------------------------------------------------------*/
210 /*--- ISEL: Forward declarations ---*/
211 /*---------------------------------------------------------*/
213 /* These are organised as iselXXX and iselXXX_wrk pairs. The
214 iselXXX_wrk functions do the real work, but are not to be called directly.
215 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
216 checks that all returned registers are virtual. You should not
217 call the _wrk version directly.
219 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
220 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
222 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
223 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
225 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
226 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
228 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
229 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
231 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
232 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
234 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, const IRExpr* e );
236 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
237 ISelEnv* env, const IRExpr* e );
239 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
240 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
242 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
243 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
245 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
246 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
248 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
249 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
251 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, const IRExpr* e );
253 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
254 ISelEnv* env, const IRExpr* e );
257 /*---------------------------------------------------------*/
258 /*--- ISEL: Misc helpers ---*/
259 /*---------------------------------------------------------*/
261 static Bool sane_AMode ( AMD64AMode* am )
263 switch (am->tag) {
264 case Aam_IR:
265 return
266 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267 && (hregIsVirtual(am->Aam.IR.reg)
268 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269 case Aam_IRRS:
270 return
271 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.base)
273 && hregClass(am->Aam.IRRS.index) == HRcInt64
274 && hregIsVirtual(am->Aam.IRRS.index) );
275 default:
276 vpanic("sane_AMode: unknown amd64 amode tag");
281 /* Can the lower 32 bits be signedly widened to produce the whole
282 64-bit value? In other words, are the top 33 bits either all 0 or
283 all 1 ? */
284 static Bool fitsIn32Bits ( ULong x )
286 Long y1;
287 y1 = x << 32;
288 y1 >>=/*s*/ 32;
289 return toBool(x == y1);
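/* For example:
      fitsIn32Bits(0x000000007FFFFFFFULL) == True    (top 33 bits all 0)
      fitsIn32Bits(0xFFFFFFFF80000000ULL) == True    (top 33 bits all 1)
      fitsIn32Bits(0x0000000080000000ULL) == False   (bit 31 set, bit 32 clear)
*/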
292 /* Is this a 64-bit zero expression? */
294 static Bool isZeroU64 ( const IRExpr* e )
296 return e->tag == Iex_Const
297 && e->Iex.Const.con->tag == Ico_U64
298 && e->Iex.Const.con->Ico.U64 == 0ULL;
301 static Bool isZeroU32 ( const IRExpr* e )
303 return e->tag == Iex_Const
304 && e->Iex.Const.con->tag == Ico_U32
305 && e->Iex.Const.con->Ico.U32 == 0;
308 /* Are both args atoms and the same? This is a copy of eqIRAtom
309 that omits the assertions that the args are indeed atoms. */
311 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
313 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
314 return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
315 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
316 return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
317 return False;
320 /* Make an int reg-reg move. */
322 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
324 vassert(hregClass(src) == HRcInt64);
325 vassert(hregClass(dst) == HRcInt64);
326 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
329 /* Make a vector (128 bit) reg-reg move. */
331 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
333 vassert(hregClass(src) == HRcVec128);
334 vassert(hregClass(dst) == HRcVec128);
335 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
338 /* Advance/retreat %rsp by n. */
340 static void add_to_rsp ( ISelEnv* env, Int n )
342 vassert(n > 0 && n < 256 && (n%8) == 0);
343 addInstr(env,
344 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
345 hregAMD64_RSP()));
348 static void sub_from_rsp ( ISelEnv* env, Int n )
350 vassert(n > 0 && n < 256 && (n%8) == 0);
351 addInstr(env,
352 AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
353 hregAMD64_RSP()));
356 /* Push a 64-bit constant on the stack. */
357 static void push_uimm64( ISelEnv* env, ULong uimm64 )
359 /* If uimm64 can be expressed as the sign extension of its
360 lower 32 bits, we can do it the easy way. */
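   /* E.g. 0xFFFFFFFF80000000 sign-extends from its low 32 bits, so a
      single push of the 32-bit immediate 0x80000000 suffices, whereas
      0x0000000080000000 does not, and takes the Imm64-into-a-temp
      route below. */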
361 Long simm64 = (Long)uimm64;
362 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
363 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
364 } else {
365 HReg tmp = newVRegI(env);
366 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
367 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
372 /* Used only in doHelperCall. If possible, produce a single
373 instruction which computes 'e' into 'dst'. If not possible, return
374 NULL. */
376 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
377 HReg dst,
378 IRExpr* e )
380 /* Per comments in doHelperCall below, appearance of
381 Iex_VECRET implies ill-formed IR. */
382 vassert(e->tag != Iex_VECRET);
384 /* In this case we give out a copy of the BaseBlock pointer. */
385 if (UNLIKELY(e->tag == Iex_GSPTR)) {
386 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
389 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
391 if (e->tag == Iex_Const) {
392 vassert(e->Iex.Const.con->tag == Ico_U64);
393 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
394 return AMD64Instr_Alu64R(
395 Aalu_MOV,
396 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
399 } else {
400 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
404 if (e->tag == Iex_RdTmp) {
405 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
406 return mk_iMOVsd_RR(src, dst);
409 if (e->tag == Iex_Get) {
410 vassert(e->Iex.Get.ty == Ity_I64);
411 return AMD64Instr_Alu64R(
412 Aalu_MOV,
413 AMD64RMI_Mem(
414 AMD64AMode_IR(e->Iex.Get.offset,
415 hregAMD64_RBP())),
416 dst);
419 if (e->tag == Iex_Unop
420 && e->Iex.Unop.op == Iop_32Uto64
421 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
422 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
423 return AMD64Instr_MovxLQ(False, src, dst);
426 if (0) { ppIRExpr(e); vex_printf("\n"); }
428 return NULL;
432 /* Do a complete function call. |guard| is a Ity_Bit expression
433 indicating whether or not the call happens. If guard==NULL, the
434 call is unconditional. |retloc| is set to indicate where the
435 return value is after the call. The caller (of this fn) must
436 generate code to add |stackAdjustAfterCall| to the stack pointer
437 after the call is done. */
439 static
440 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
441 /*OUT*/RetLoc* retloc,
442 ISelEnv* env,
443 IRExpr* guard,
444 IRCallee* cee, IRType retTy, IRExpr** args )
446 AMD64CondCode cc;
447 HReg argregs[6];
448 HReg tmpregs[6];
449 AMD64Instr* fastinstrs[6];
450 UInt n_args, i;
452 /* Set default returns. We'll update them later if needed. */
453 *stackAdjustAfterCall = 0;
454 *retloc = mk_RetLoc_INVALID();
456 /* These are used for cross-checking that IR-level constraints on
457 the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
458 UInt nVECRETs = 0;
459 UInt nGSPTRs = 0;
461 /* Marshal args for a call and do the call.
463 This function only deals with a tiny set of possibilities, which
464 cover all helpers in practice. The restrictions are that only
465 arguments in registers are supported, hence only 6x64 integer
466 bits in total can be passed. In fact the only supported arg
467 type is I64.
469 The return type can be I{64,32,16,8} or V{128,256}. In the
470 latter two cases, it is expected that |args| will contain the
471 special node IRExpr_VECRET(), in which case this routine
472 generates code to allocate space on the stack for the vector
473 return value. Since we are not passing any scalars on the
474 stack, it is enough to preallocate the return space before
475 marshalling any arguments, in this case.
477 |args| may also contain IRExpr_GSPTR(), in which case the
478 value in %rbp is passed as the corresponding argument.
480 Generating code which is both efficient and correct when
481 parameters are to be passed in registers is difficult, for the
482 reasons elaborated in detail in comments attached to
483 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
484 of the method described in those comments.
486 The problem is split into two cases: the fast scheme and the
487 slow scheme. In the fast scheme, arguments are computed
488 directly into the target (real) registers. This is only safe
489 when we can be sure that computation of each argument will not
490 trash any real registers set by computation of any other
491 argument.
493 In the slow scheme, all args are first computed into vregs, and
494 once they are all done, they are moved to the relevant real
495 regs. This always gives correct code, but it also gives a bunch
496 of vreg-to-rreg moves which are usually redundant but are hard
497 for the register allocator to get rid of.
499 To decide which scheme to use, all argument expressions are
500 first examined. If they are all so simple that it is clear they
501 will be evaluated without use of any fixed registers, use the
502 fast scheme, else use the slow scheme. Note also that only
503 unconditional calls may use the fast scheme, since having to
504 compute a condition expression could itself trash real
505 registers. Note that for simplicity, in the case where
506 IRExpr_VECRET() is present, we use the slow scheme. This is
507 motivated by the desire to avoid any possible complexity
508 w.r.t. nested calls.
510 Note this requires being able to examine an expression and
511 determine whether or not evaluation of it might use a fixed
512 register. That requires knowledge of how the rest of this insn
513 selector works. Currently just the following 3 are regarded as
514 safe -- hopefully they cover the majority of arguments in
515 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
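      E.g. (a made-up case): for a call h(t1, 0x42:I64, GET:I64(16)),
      every argument can be computed with a single instruction, so the
      fast scheme applies and each one is evaluated directly into
      %rdi/%rsi/%rdx.  If any argument were instead, say, Add64(t1,t2),
      the whole call would fall back to the slow scheme.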
518 /* Note that the cee->regparms field is meaningless on AMD64 host
519 (since there is only one calling convention) and so we always
520 ignore it. */
521 n_args = 0;
522 for (i = 0; args[i]; i++)
523 n_args++;
525 if (n_args > 6)
526 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
528 argregs[0] = hregAMD64_RDI();
529 argregs[1] = hregAMD64_RSI();
530 argregs[2] = hregAMD64_RDX();
531 argregs[3] = hregAMD64_RCX();
532 argregs[4] = hregAMD64_R8();
533 argregs[5] = hregAMD64_R9();
535 tmpregs[0] = tmpregs[1] = tmpregs[2] =
536 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
538 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
539 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
541 /* First decide which scheme (slow or fast) is to be used. Start by
542 assuming the fast scheme, and select the slow one if any
543 contraindications (wow) appear.
545 /* We'll need space on the stack for the return value. Avoid
546 possible complications with nested calls by using the slow
547 scheme. */
548 if (retTy == Ity_V128 || retTy == Ity_V256)
549 goto slowscheme;
551 if (guard) {
552 if (guard->tag == Iex_Const
553 && guard->Iex.Const.con->tag == Ico_U1
554 && guard->Iex.Const.con->Ico.U1 == True) {
555 /* unconditional */
556 } else {
557 /* Not manifestly unconditional -- be conservative. */
558 goto slowscheme;
562 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
563 use the slow scheme. Because this is tentative, we can't call
564 addInstr (that is, commit to) any instructions until we've
565 handled all the arguments. So park the resulting instructions
566 in a buffer and emit that if we're successful. */
568 /* FAST SCHEME */
569 /* In this loop, we process args that can be computed into the
570 destination (real) register with a single instruction, without
571 using any fixed regs. That also includes IRExpr_GSPTR(), but
572 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
573 never see IRExpr_VECRET() at this point, since the return-type
574 check above should ensure all those cases use the slow scheme
575 instead. */
576 vassert(n_args >= 0 && n_args <= 6);
577 for (i = 0; i < n_args; i++) {
578 IRExpr* arg = args[i];
579 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
580 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
582 fastinstrs[i]
583 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
584 if (fastinstrs[i] == NULL)
585 goto slowscheme;
588 /* Looks like we're in luck. Emit the accumulated instructions and
589 move on to doing the call itself. */
590 for (i = 0; i < n_args; i++)
591 addInstr(env, fastinstrs[i]);
593 /* Fast scheme only applies for unconditional calls. Hence: */
594 cc = Acc_ALWAYS;
596 goto handle_call;
599 /* SLOW SCHEME; move via temporaries */
600 slowscheme:
602 # if 0 /* debug only */
603 if (n_args > 0) {for (i = 0; args[i]; i++) {
604 ppIRExpr(args[i]); vex_printf(" "); }
605 vex_printf("\n");}
606 # endif
608 /* If we have a vector return type, allocate a place for it on the
609 stack and record its address. */
610 HReg r_vecRetAddr = INVALID_HREG;
611 if (retTy == Ity_V128) {
612 r_vecRetAddr = newVRegI(env);
613 sub_from_rsp(env, 16);
614 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
616 else if (retTy == Ity_V256) {
617 r_vecRetAddr = newVRegI(env);
618 sub_from_rsp(env, 32);
619 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
622 vassert(n_args >= 0 && n_args <= 6);
623 for (i = 0; i < n_args; i++) {
624 IRExpr* arg = args[i];
625 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
626 tmpregs[i] = newVRegI(env);
627 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
628 nGSPTRs++;
630 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
631 /* We stashed the address of the return slot earlier, so just
632 retrieve it now. */
633 vassert(!hregIsInvalid(r_vecRetAddr));
634 tmpregs[i] = r_vecRetAddr;
635 nVECRETs++;
637 else {
638 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
639 tmpregs[i] = iselIntExpr_R(env, args[i]);
643 /* Now we can compute the condition. We can't do it earlier
644 because the argument computations could trash the condition
645 codes. Be a bit clever to handle the common case where the
646 guard is 1:Bit. */
647 cc = Acc_ALWAYS;
648 if (guard) {
649 if (guard->tag == Iex_Const
650 && guard->Iex.Const.con->tag == Ico_U1
651 && guard->Iex.Const.con->Ico.U1 == True) {
652 /* unconditional -- do nothing */
653 } else {
654 cc = iselCondCode( env, guard );
658 /* Move the args to their final destinations. */
659 for (i = 0; i < n_args; i++) {
660 /* None of these insns, including any spill code that might
661 be generated, may alter the condition codes. */
662 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
666 /* Do final checks, set the return values, and generate the call
667 instruction proper. */
668 handle_call:
670 if (retTy == Ity_V128 || retTy == Ity_V256) {
671 vassert(nVECRETs == 1);
672 } else {
673 vassert(nVECRETs == 0);
676 vassert(nGSPTRs == 0 || nGSPTRs == 1);
678 vassert(*stackAdjustAfterCall == 0);
679 vassert(is_RetLoc_INVALID(*retloc));
680 switch (retTy) {
681 case Ity_INVALID:
682 /* Function doesn't return a value. */
683 *retloc = mk_RetLoc_simple(RLPri_None);
684 break;
685 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
686 *retloc = mk_RetLoc_simple(RLPri_Int);
687 break;
688 case Ity_V128:
689 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
690 *stackAdjustAfterCall = 16;
691 break;
692 case Ity_V256:
693 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
694 *stackAdjustAfterCall = 32;
695 break;
696 default:
697 /* IR can denote other possible return types, but we don't
698 handle those here. */
699 vassert(0);
702 /* Finally, generate the call itself. This needs the *retloc value
703 set in the switch above, which is why it's at the end. */
704 addInstr(env,
705 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
709 /* Given a guest-state array descriptor, an index expression and a
710 bias, generate an AMD64AMode holding the relevant guest state
711 offset. */
713 static
714 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
715 IRExpr* off, Int bias )
717 HReg tmp, roff;
718 Int elemSz = sizeofIRType(descr->elemTy);
719 Int nElems = descr->nElems;
721 /* Throw out any cases not generated by an amd64 front end. In
722 theory there might be a day where we need to handle them -- if
723 we ever run non-amd64-guest on amd64 host. */
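   /* In practice the only such arrays the amd64 front end is believed
      to generate accesses to are the guest's 8-entry x87 register and
      tag files (8-byte and 1-byte elements respectively), which is
      what the check below allows. */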
725 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
726 vpanic("genGuestArrayOffset(amd64 host)");
728 /* Compute off into a reg, %off. Then return:
730 movq %off, %tmp
731 addq $bias, %tmp (if bias != 0)
732 andq $7, %tmp
733 ... base(%rbp, %tmp, shift) ...
735 tmp = newVRegI(env);
736 roff = iselIntExpr_R(env, off);
737 addInstr(env, mk_iMOVsd_RR(roff, tmp));
738 if (bias != 0) {
739 /* Make sure the bias is sane, in the sense that there are
740 no significant bits above bit 30 in it. */
741 vassert(-10000 < bias && bias < 10000);
742 addInstr(env,
743 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
745 addInstr(env,
746 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
747 vassert(elemSz == 1 || elemSz == 8);
748 return
749 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
750 elemSz==8 ? 3 : 0);
754 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
755 static
756 void set_SSE_rounding_default ( ISelEnv* env )
758 /* pushq $DEFAULT_MXCSR
759 ldmxcsr 0(%rsp)
760 addq $8, %rsp
762 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
763 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
764 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
765 add_to_rsp(env, 8);
768 /* Mess with the FPU's rounding mode: set to the default rounding mode
769 (DEFAULT_FPUCW). */
770 static
771 void set_FPU_rounding_default ( ISelEnv* env )
773 /* movq $DEFAULT_FPUCW, -8(%rsp)
774 fldcw -8(%rsp)
776 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
777 addInstr(env, AMD64Instr_Alu64M(
778 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
779 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
783 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
784 expression denoting a value in the range 0 .. 3, indicating a round
785 mode encoded as per type IRRoundingMode. Set the SSE machinery to
786 have the same rounding.
788 static
789 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
791 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
792 both rounding bits == 0. If that wasn't the case, we couldn't
793 create a new rounding field simply by ORing the new value into
794 place. */
796 /* movq $3, %reg
797 andq [[mode]], %reg -- shouldn't be needed; paranoia
798 shlq $13, %reg
799 orq $DEFAULT_MXCSR, %reg
800 pushq %reg
801 ldmxcsr 0(%rsp)
802 addq $8, %rsp
804 HReg reg = newVRegI(env);
805 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
806 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
807 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
808 iselIntExpr_RMI(env, mode), reg));
809 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
810 addInstr(env, AMD64Instr_Alu64R(
811 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
812 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
813 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
814 add_to_rsp(env, 8);
818 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
819 expression denoting a value in the range 0 .. 3, indicating a round
820 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
821 the same rounding.
823 static
824 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
826 HReg rrm = iselIntExpr_R(env, mode);
827 HReg rrm2 = newVRegI(env);
828 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
830 /* movq %rrm, %rrm2
831 andq $3, %rrm2 -- shouldn't be needed; paranoia
832 shlq $10, %rrm2
833 orq $DEFAULT_FPUCW, %rrm2
834 movq %rrm2, -8(%rsp)
835 fldcw -8(%rsp)
837 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
838 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
839 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
840 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
841 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
842 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
843 AMD64RI_Reg(rrm2), m8_rsp));
844 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
848 /* Generate all-zeroes into a new vector register.
850 static HReg generate_zeroes_V128 ( ISelEnv* env )
852 HReg dst = newVRegV(env);
853 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
854 return dst;
857 /* Generate all-ones into a new vector register.
859 static HReg generate_ones_V128 ( ISelEnv* env )
861 HReg dst = newVRegV(env);
862 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
863 return dst;
867 /* Generate !src into a new vector register. Amazing that there isn't
868 a less crappy way to do this.
870 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
872 HReg dst = generate_ones_V128(env);
873 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
874 return dst;
878 /* Expand the given byte into a 64-bit word, by cloning each bit
879 8 times. */
880 static ULong bitmask8_to_bytemask64 ( UShort w8 )
882 vassert(w8 == (w8 & 0xFF));
883 ULong w64 = 0;
884 Int i;
885 for (i = 0; i < 8; i++) {
886 if (w8 & (1<<i))
887 w64 |= (0xFFULL << (8 * i));
889 return w64;
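/* E.g. bitmask8_to_bytemask64(0x05) == 0x0000000000FF00FFULL
   and bitmask8_to_bytemask64(0x80) == 0xFF00000000000000ULL. */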
893 /*---------------------------------------------------------*/
894 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
895 /*---------------------------------------------------------*/
897 /* Select insns for an integer-typed expression, and add them to the
898 code list. Return a reg holding the result. This reg will be a
899 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
900 want to modify it, ask for a new vreg, copy it in there, and modify
901 the copy. The register allocator will do its best to map both
902 vregs to the same real register, so the copies will often disappear
903 later in the game.
905 This should handle expressions of 64, 32, 16 and 8-bit type. All
906 results are returned in a 64-bit register. For 32-, 16- and 8-bit
907 expressions, the upper 32/48/56 bits are arbitrary, so you should
908 mask or sign extend partial values if necessary.
911 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
913 HReg r = iselIntExpr_R_wrk(env, e);
914 /* sanity checks ... */
915 # if 0
916 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
917 # endif
918 vassert(hregClass(r) == HRcInt64);
919 vassert(hregIsVirtual(r));
920 return r;
923 /* DO NOT CALL THIS DIRECTLY ! */
924 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
926 MatchInfo mi;
927 DECLARE_PATTERN(p_1Uto8_64to1);
928 DECLARE_PATTERN(p_LDle8_then_8Uto64);
929 DECLARE_PATTERN(p_LDle16_then_16Uto64);
931 IRType ty = typeOfIRExpr(env->type_env,e);
932 switch (ty) {
933 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
934 default: vassert(0);
937 switch (e->tag) {
939 /* --------- TEMP --------- */
940 case Iex_RdTmp: {
941 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
944 /* --------- LOAD --------- */
945 case Iex_Load: {
946 HReg dst = newVRegI(env);
947 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
949 /* We can't handle big-endian loads, nor load-linked. */
950 if (e->Iex.Load.end != Iend_LE)
951 goto irreducible;
953 if (ty == Ity_I64) {
954 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
955 AMD64RMI_Mem(amode), dst) );
956 return dst;
958 if (ty == Ity_I32) {
959 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
960 return dst;
962 if (ty == Ity_I16) {
963 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
964 return dst;
966 if (ty == Ity_I8) {
967 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
968 return dst;
970 break;
973 /* --------- BINARY OP --------- */
974 case Iex_Binop: {
975 AMD64AluOp aluOp;
976 AMD64ShiftOp shOp;
978 /* Pattern: Sub64(0,x) */
979 /* and: Sub32(0,x) */
980 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
981 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
982 HReg dst = newVRegI(env);
983 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
984 addInstr(env, mk_iMOVsd_RR(reg,dst));
985 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
986 return dst;
989 /* Is it an addition or logical style op? */
990 switch (e->Iex.Binop.op) {
991 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
992 aluOp = Aalu_ADD; break;
993 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
994 aluOp = Aalu_SUB; break;
995 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
996 aluOp = Aalu_AND; break;
997 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
998 aluOp = Aalu_OR; break;
999 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
1000 aluOp = Aalu_XOR; break;
1001 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1002 aluOp = Aalu_MUL; break;
1003 default:
1004 aluOp = Aalu_INVALID; break;
1006 /* For commutative ops we assume any literal
1007 values are on the second operand. */
1008 if (aluOp != Aalu_INVALID) {
1009 HReg dst = newVRegI(env);
1010 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1011 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1012 addInstr(env, mk_iMOVsd_RR(reg,dst));
1013 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1014 return dst;
1017 /* Perhaps a shift op? */
1018 switch (e->Iex.Binop.op) {
1019 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1020 shOp = Ash_SHL; break;
1021 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1022 shOp = Ash_SHR; break;
1023 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1024 shOp = Ash_SAR; break;
1025 default:
1026 shOp = Ash_INVALID; break;
1028 if (shOp != Ash_INVALID) {
1029 HReg dst = newVRegI(env);
1031 /* regL = the value to be shifted */
1032 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1033 addInstr(env, mk_iMOVsd_RR(regL,dst));
1035 /* Do any necessary widening for 32/16/8 bit operands */
1036 switch (e->Iex.Binop.op) {
1037 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1038 break;
1039 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1040 break;
1041 case Iop_Shr8:
1042 addInstr(env, AMD64Instr_Alu64R(
1043 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1044 break;
1045 case Iop_Shr16:
1046 addInstr(env, AMD64Instr_Alu64R(
1047 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1048 break;
1049 case Iop_Shr32:
1050 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1051 break;
1052 case Iop_Sar8:
1053 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1054 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1055 break;
1056 case Iop_Sar16:
1057 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1058 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1059 break;
1060 case Iop_Sar32:
1061 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1062 break;
1063 default:
1064 ppIROp(e->Iex.Binop.op);
1065 vassert(0);
1068 /* Now consider the shift amount. If it's a literal, we
1069 can do a much better job than the general case. */
1070 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1071 /* assert that the IR is well-typed */
1072 Int nshift;
1073 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1074 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1075 vassert(nshift >= 0);
1076 if (nshift > 0)
1077 /* Can't allow nshift==0 since that means %cl */
1078 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1079 } else {
1080 /* General case; we have to force the amount into %cl. */
1081 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1082 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1083 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1085 return dst;
1088 /* Handle misc other scalar ops. */
1089 if (e->Iex.Binop.op == Iop_Max32U) {
1090 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1091 HReg dst = newVRegI(env);
1092 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1093 addInstr(env, mk_iMOVsd_RR(src1, dst));
1094 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1095 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1096 return dst;
1099 if (e->Iex.Binop.op == Iop_DivModS64to32
1100 || e->Iex.Binop.op == Iop_DivModU64to32) {
1101 /* 64 x 32 -> (32(rem),32(div)) division */
1102 /* Get the 64-bit operand into edx:eax, and the other into
1103 any old R/M. */
1104 HReg rax = hregAMD64_RAX();
1105 HReg rdx = hregAMD64_RDX();
1106 HReg dst = newVRegI(env);
1107 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1108 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1109 /* Compute the left operand into a reg, and then
1110 put the top half in edx and the bottom in eax. */
1111 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1112 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1113 addInstr(env, mk_iMOVsd_RR(left64, rax));
1114 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1115 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1116 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1117 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1118 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1119 addInstr(env, mk_iMOVsd_RR(rax, dst));
1120 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1121 return dst;
1124 if (e->Iex.Binop.op == Iop_32HLto64) {
1125 HReg hi32 = newVRegI(env);
1126 HReg lo32 = newVRegI(env);
1127 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1128 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1129 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1130 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1131 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1132 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1133 addInstr(env, AMD64Instr_Alu64R(
1134 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1135 return hi32;
1138 if (e->Iex.Binop.op == Iop_16HLto32) {
1139 HReg hi16 = newVRegI(env);
1140 HReg lo16 = newVRegI(env);
1141 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1142 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1143 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1144 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1145 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1148 addInstr(env, AMD64Instr_Alu64R(
1149 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1150 return hi16;
1153 if (e->Iex.Binop.op == Iop_8HLto16) {
1154 HReg hi8 = newVRegI(env);
1155 HReg lo8 = newVRegI(env);
1156 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1157 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1158 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1159 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1160 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1163 addInstr(env, AMD64Instr_Alu64R(
1164 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1165 return hi8;
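      /* Widening multiplies: a sketch of the scheme used below.  Each
         32/16/8-bit argument is first widened to 64 bits in a general
         register -- sign-extended via shl/sar for the signed variants,
         zero-extended via shl/shr for the unsigned ones -- after which
         a single 64-bit multiply leaves the full double-width product
         in the low bits of the destination register. */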
1168 if (e->Iex.Binop.op == Iop_MullS32
1169 || e->Iex.Binop.op == Iop_MullS16
1170 || e->Iex.Binop.op == Iop_MullS8
1171 || e->Iex.Binop.op == Iop_MullU32
1172 || e->Iex.Binop.op == Iop_MullU16
1173 || e->Iex.Binop.op == Iop_MullU8) {
1174 HReg a32 = newVRegI(env);
1175 HReg b32 = newVRegI(env);
1176 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1177 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1178 Int shift = 0;
1179 AMD64ShiftOp shr_op = Ash_SHR;
1180 switch (e->Iex.Binop.op) {
1181 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1182 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1183 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1184 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1185 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1186 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1187 default: vassert(0);
1190 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1191 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1192 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1194 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1195 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1196 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1197 return b32;
1200 if (e->Iex.Binop.op == Iop_CmpF64) {
1201 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1202 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1203 HReg dst = newVRegI(env);
1204 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1205 /* Mask out irrelevant parts of the result so as to conform
1206 to the CmpF64 definition. */
1207 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1208 return dst;
1211 if (e->Iex.Binop.op == Iop_F64toI32S
1212 || e->Iex.Binop.op == Iop_F64toI64S) {
1213 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1214 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1215 HReg dst = newVRegI(env);
1216 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1217 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1218 set_SSE_rounding_default(env);
1219 return dst;
1222 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1223 by using the equivalent 128-bit operation and ignoring the upper half
1224 of the result. */
1225 AMD64SseOp op = Asse_INVALID;
1226 Bool arg1isEReg = False;
1227 Bool preShift32R = False;
1228 switch (e->Iex.Binop.op) {
1229 // The following 3 could be done with 128 bit insns too, but
1230 // first require the inputs to be reformatted.
1231 //case Iop_QNarrowBin32Sto16Sx4:
1232 //op = Asse_PACKSSD; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Sx8:
1234 //op = Asse_PACKSSW; arg1isEReg = True; break;
1235 //case Iop_QNarrowBin16Sto8Ux8:
1236 //op = Asse_PACKUSW; arg1isEReg = True; break;
1238 case Iop_InterleaveHI8x8:
1239 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1240 break;
1241 case Iop_InterleaveHI16x4:
1242 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1243 break;
1244 case Iop_InterleaveHI32x2:
1245 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1246 break;
1247 case Iop_InterleaveLO8x8:
1248 op = Asse_UNPCKLB; arg1isEReg = True;
1249 break;
1250 case Iop_InterleaveLO16x4:
1251 op = Asse_UNPCKLW; arg1isEReg = True;
1252 break;
1253 case Iop_InterleaveLO32x2:
1254 op = Asse_UNPCKLD; arg1isEReg = True;
1255 break;
1257 case Iop_Add8x8: op = Asse_ADD8; break;
1258 case Iop_Add16x4: op = Asse_ADD16; break;
1259 case Iop_Add32x2: op = Asse_ADD32; break;
1260 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1261 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1262 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1263 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1264 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1265 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1266 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1267 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1268 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1269 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1270 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1271 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1272 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1273 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1274 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1275 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1276 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1277 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1278 case Iop_Mul16x4: op = Asse_MUL16; break;
1279 case Iop_Sub8x8: op = Asse_SUB8; break;
1280 case Iop_Sub16x4: op = Asse_SUB16; break;
1281 case Iop_Sub32x2: op = Asse_SUB32; break;
1282 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1283 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1284 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1285 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1286 default: break;
1288 if (op != Asse_INVALID) {
1289 /* This isn't pretty, but ... move each arg to the low half of an XMM
1290 register, do the operation on the whole register, and move the
1291 result back to an integer register. */
1292 const IRExpr* arg1 = e->Iex.Binop.arg1;
1293 const IRExpr* arg2 = e->Iex.Binop.arg2;
1294 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1295 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1296 HReg iarg1 = iselIntExpr_R(env, arg1);
1297 HReg iarg2 = iselIntExpr_R(env, arg2);
1298 HReg varg1 = newVRegV(env);
1299 HReg varg2 = newVRegV(env);
1300 HReg idst = newVRegI(env);
1301 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1302 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1303 if (arg1isEReg) {
1304 if (preShift32R) {
1305 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1306 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1308 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1309 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1310 } else {
1311 vassert(!preShift32R);
1312 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1313 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1315 return idst;
1318 UInt laneBits = 0;
1319 op = Asse_INVALID;
1320 switch (e->Iex.Binop.op) {
1321 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1322 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1323 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1324 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1325 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1326 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1327 default: break;
1329 if (op != Asse_INVALID) {
1330 const IRExpr* arg1 = e->Iex.Binop.arg1;
1331 const IRExpr* arg2 = e->Iex.Binop.arg2;
1332 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1333 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1334 HReg igreg = iselIntExpr_R(env, arg1);
1335 HReg vgreg = newVRegV(env);
1336 HReg idst = newVRegI(env);
1337 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1338 /* If it's a shift by an in-range immediate, generate a single
1339 instruction. */
1340 if (arg2->tag == Iex_Const) {
1341 IRConst* c = arg2->Iex.Const.con;
1342 vassert(c->tag == Ico_U8);
1343 UInt shift = c->Ico.U8;
1344 if (shift < laneBits) {
1345 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1346 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1347 return idst;
1350 /* Otherwise we have to do it the longwinded way. */
1351 HReg ishift = iselIntExpr_R(env, arg2);
1352 HReg vshift = newVRegV(env);
1353 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1354 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1355 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1356 return idst;
1359 if (e->Iex.Binop.op == Iop_Mul32x2) {
1360 const IRExpr* arg1 = e->Iex.Binop.arg1;
1361 const IRExpr* arg2 = e->Iex.Binop.arg2;
1362 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1363 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1364 HReg s1 = iselIntExpr_R(env, arg1);
1365 HReg s2 = iselIntExpr_R(env, arg2);
1366 HReg resLo = newVRegI(env);
1367 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1368 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1369 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1370 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1372 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1373 HReg resHi = newVRegI(env);
1374 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1375 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1376 HReg tmp = newVRegI(env);
1377 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1379 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1380 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1382 // final result = resHi | resLo
1383 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1384 return resLo;
1387 // A few remaining SIMD64 ops require helper functions, at least for
1388 // now.
1389 Bool second_is_UInt = False;
1390 HWord fn = 0;
1391 switch (e->Iex.Binop.op) {
1392 case Iop_CatOddLanes16x4:
1393 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1394 case Iop_CatEvenLanes16x4:
1395 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1396 case Iop_PermOrZero8x8:
1397 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1399 case Iop_QNarrowBin32Sto16Sx4:
1400 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1401 case Iop_QNarrowBin16Sto8Sx8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1403 case Iop_QNarrowBin16Sto8Ux8:
1404 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1406 case Iop_NarrowBin16to8x8:
1407 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1408 case Iop_NarrowBin32to16x4:
1409 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1411 case Iop_SarN8x8:
1412 fn = (HWord)h_generic_calc_SarN8x8;
1413 second_is_UInt = True;
1414 break;
1416 default:
1417 fn = (HWord)0; break;
1419 if (fn != (HWord)0) {
1420 /* Note: the following assumes all helpers are of signature
1421 ULong fn ( ULong, ULong ), and they are
1422 not marked as regparm functions.
1424 HReg dst = newVRegI(env);
1425 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1426 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1427 if (second_is_UInt)
1428 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1429 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1430 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1431 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1432 mk_RetLoc_simple(RLPri_Int) ));
1433 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1434 return dst;
1437 // Half-float vector conversion
1438 if (e->Iex.Binop.op == Iop_F32toF16x4
1439 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1440 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1441 HReg dstV = newVRegV(env);
1442 HReg dstI = newVRegI(env);
1443 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1444 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1445 set_SSE_rounding_default(env);
1446 addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1447 return dstI;
1450 break;
1453 /* --------- UNARY OP --------- */
1454 case Iex_Unop: {
1456 /* 1Uto8(64to1(expr64)) */
1458 DEFINE_PATTERN( p_1Uto8_64to1,
1459 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1460 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1461 const IRExpr* expr64 = mi.bindee[0];
1462 HReg dst = newVRegI(env);
1463 HReg src = iselIntExpr_R(env, expr64);
1464 addInstr(env, mk_iMOVsd_RR(src,dst) );
1465 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1466 AMD64RMI_Imm(1), dst));
1467 return dst;
1471 /* 8Uto64(LDle(expr64)) */
1473 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1474 unop(Iop_8Uto64,
1475 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1476 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1477 HReg dst = newVRegI(env);
1478 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1479 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1480 return dst;
1484 /* 16Uto64(LDle(expr64)) */
1486 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1487 unop(Iop_16Uto64,
1488 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1489 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1490 HReg dst = newVRegI(env);
1491 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1492 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1493 return dst;
1497 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1498 Use 32 bit arithmetic and let the default zero-extend rule
1499 do the 32Uto64 for free. */
1500 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1501 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1502 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1503 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1504 AMD64AluOp aluOp = Aalu_INVALID;
1505 switch (opi) {
1506 case Iop_Add32: aluOp = Aalu_ADD; break;
1507 case Iop_Sub32: aluOp = Aalu_SUB; break;
1508 case Iop_And32: aluOp = Aalu_AND; break;
1509 case Iop_Or32: aluOp = Aalu_OR; break;
1510 case Iop_Xor32: aluOp = Aalu_XOR; break;
1511 default: break;
1513 if (aluOp != Aalu_INVALID) {
1514 /* For commutative ops we assume any literal values are on
1515 the second operand. */
1516 HReg dst = newVRegI(env);
1517 HReg reg = iselIntExpr_R(env, argL);
1518 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1519 addInstr(env, mk_iMOVsd_RR(reg,dst));
1520 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1521 return dst;
1523 /* just fall through to normal handling for Iop_32Uto64 */
1526 /* Fallback cases */
1527 switch (e->Iex.Unop.op) {
1528 case Iop_32Uto64:
1529 case Iop_32Sto64: {
1530 HReg dst = newVRegI(env);
1531 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1532 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1533 src, dst) );
1534 return dst;
1536 case Iop_128HIto64: {
1537 HReg rHi, rLo;
1538 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1539 return rHi; /* and abandon rLo */
1541 case Iop_128to64: {
1542 HReg rHi, rLo;
1543 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1544 return rLo; /* and abandon rHi */
1546 case Iop_8Uto16:
1547 case Iop_8Uto32:
1548 case Iop_8Uto64:
1549 case Iop_16Uto64:
1550 case Iop_16Uto32: {
1551 HReg dst = newVRegI(env);
1552 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1553 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1554 || e->Iex.Unop.op==Iop_16Uto64 );
1555 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1556 addInstr(env, mk_iMOVsd_RR(src,dst) );
1557 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1558 AMD64RMI_Imm(mask), dst));
1559 return dst;
1561 case Iop_8Sto16:
1562 case Iop_8Sto64:
1563 case Iop_8Sto32:
1564 case Iop_16Sto32:
1565 case Iop_16Sto64: {
1566 HReg dst = newVRegI(env);
1567 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1568 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1569 || e->Iex.Unop.op==Iop_16Sto64 );
1570 UInt amt = srcIs16 ? 48 : 56;
1571 addInstr(env, mk_iMOVsd_RR(src,dst) );
1572 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1573 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1574 return dst;
1576 case Iop_Not8:
1577 case Iop_Not16:
1578 case Iop_Not32:
1579 case Iop_Not64: {
1580 HReg dst = newVRegI(env);
1581 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1582 addInstr(env, mk_iMOVsd_RR(src,dst) );
1583 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1584 return dst;
1586 case Iop_16HIto8:
1587 case Iop_32HIto16:
1588 case Iop_64HIto32: {
1589 HReg dst = newVRegI(env);
1590 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1591 Int shift = 0;
1592 switch (e->Iex.Unop.op) {
1593 case Iop_16HIto8: shift = 8; break;
1594 case Iop_32HIto16: shift = 16; break;
1595 case Iop_64HIto32: shift = 32; break;
1596 default: vassert(0);
1598 addInstr(env, mk_iMOVsd_RR(src,dst) );
1599 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1600 return dst;
1602 case Iop_1Uto64:
1603 case Iop_1Uto32:
1604 case Iop_1Uto8: {
1605 HReg dst = newVRegI(env);
1606 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1607 addInstr(env, AMD64Instr_Set64(cond,dst));
1608 return dst;
1610 case Iop_1Sto8:
1611 case Iop_1Sto16:
1612 case Iop_1Sto32:
1613 case Iop_1Sto64: {
1614 /* could do better than this, but for now ... */
1615 HReg dst = newVRegI(env);
1616 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1617 addInstr(env, AMD64Instr_Set64(cond,dst));
1618 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1619 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1620 return dst;
1622 case Iop_Ctz64: {
1623 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1624 HReg dst = newVRegI(env);
1625 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1626 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1627 return dst;
1629 case Iop_Clz64: {
1630 /* Count leading zeroes. Do 'bsrq' to establish the index
1631 of the highest set bit, and subtract that value from
1632 63. */
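         /* E.g. for src == 1, 'bsrq' gives index 0, so the result is
            63; for src == 0x8000000000000000 it gives index 63, so
            the result is 0. */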
1633 HReg tmp = newVRegI(env);
1634 HReg dst = newVRegI(env);
1635 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1636 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1637 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1638 AMD64RMI_Imm(63), dst));
1639 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1640 AMD64RMI_Reg(tmp), dst));
1641 return dst;
1644 case Iop_CmpwNEZ64: {
1645 HReg dst = newVRegI(env);
1646 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1647 addInstr(env, mk_iMOVsd_RR(src,dst));
1648 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1649 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1650 AMD64RMI_Reg(src), dst));
1651 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1652 return dst;
1655 case Iop_CmpwNEZ32: {
1656 HReg src = newVRegI(env);
1657 HReg dst = newVRegI(env);
1658 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1659 addInstr(env, mk_iMOVsd_RR(pre,src));
1660 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1661 addInstr(env, mk_iMOVsd_RR(src,dst));
1662 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1663 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1664 AMD64RMI_Reg(src), dst));
1665 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1666 return dst;
1669 case Iop_Left8:
1670 case Iop_Left16:
1671 case Iop_Left32:
1672 case Iop_Left64: {
1673 HReg dst = newVRegI(env);
1674 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1675 addInstr(env, mk_iMOVsd_RR(src, dst));
1676 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1677 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1678 return dst;
1681 case Iop_V128to32: {
1682 HReg dst = newVRegI(env);
1683 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1684 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1685 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1686 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1687 return dst;
1690 /* V128{HI}to64 */
1691 case Iop_V128to64: {
1692 HReg dst = newVRegI(env);
1693 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1694 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1695 return dst;
1697 case Iop_V128HIto64: {
1698 HReg dst = newVRegI(env);
1699 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1700 HReg vec2 = newVRegV(env);
1701 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1702 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1703 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1704 return dst;
1707 /* V256to64_{3,2,1,0} */
1708 case Iop_V256to64_0: case Iop_V256to64_1:
1709 case Iop_V256to64_2: case Iop_V256to64_3: {
1710 HReg vHi, vLo, vec;
1711 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1712 /* Do the first part of the selection by deciding which of
1713 the 128 bit registers to look at, and second part using
1714 the same scheme as for V128{HI}to64 above. */
1715 Bool low64of128 = True;
1716 switch (e->Iex.Unop.op) {
1717 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1718 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1719 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1720 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1721 default: vassert(0);
1723 HReg dst = newVRegI(env);
1724 if (low64of128) {
1725 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1726 } else {
1727 HReg vec2 = newVRegV(env);
1728 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1729 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1730 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1732 return dst;
1735 /* ReinterpF64asI64(e) */
1736 /* Given an IEEE754 double, produce an I64 with the same bit
1737 pattern. */
1738 case Iop_ReinterpF64asI64: {
1739 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1740 HReg dst = newVRegI(env);
1741 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1742 /* paranoia */
1743 set_SSE_rounding_default(env);
1744 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1745 addInstr(env, AMD64Instr_Alu64R(
1746 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1747 return dst;
1750 /* ReinterpF32asI32(e) */
1751 /* Given an IEEE754 single, produce an I64 with the same bit
1752 pattern in the lower half. */
1753 case Iop_ReinterpF32asI32: {
1754 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1755 HReg dst = newVRegI(env);
1756 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1757 /* paranoia */
1758 set_SSE_rounding_default(env);
1759 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1760 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1761 return dst;
1764 case Iop_16to8:
1765 case Iop_32to8:
1766 case Iop_64to8:
1767 case Iop_32to16:
1768 case Iop_64to16:
1769 case Iop_64to32:
1770 /* These are no-ops. */
1771 return iselIntExpr_R(env, e->Iex.Unop.arg);
1773 case Iop_GetMSBs8x8: {
1774 /* Note: the following assumes the helper is of
1775 signature
1776 UInt fn ( ULong ), and is not a regparm fn. */
1778 HReg dst = newVRegI(env);
1779 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1780 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1781 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1782 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1783 1, mk_RetLoc_simple(RLPri_Int) ));
1784 /* MovxLQ is not exactly the right thing here. We just
1785 need to get the bottom 8 bits of RAX into dst, and zero
1786 out everything else. Assuming that the helper returns
1787 a UInt with the top 24 bits zeroed out, it'll do,
1788 though. */
1789 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1790 return dst;
1793 case Iop_GetMSBs8x16: {
1794 /* Note: the following assumes the helper is of signature
1795 UInt fn ( ULong w64hi, ULong w64Lo ),
1796 and is not a regparm fn. */
1797 HReg dst = newVRegI(env);
1798 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1799 HReg rsp = hregAMD64_RSP();
1800 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1801 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1802 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1803 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1804 16, vec, m16_rsp));
1805 /* hi 64 bits into RDI -- the first arg */
1806 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1807 AMD64RMI_Mem(m8_rsp),
1808 hregAMD64_RDI() )); /* 1st arg */
1809 /* lo 64 bits into RSI -- the 2nd arg */
1810 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1811 AMD64RMI_Mem(m16_rsp),
1812 hregAMD64_RSI() )); /* 2nd arg */
1813 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1814 2, mk_RetLoc_simple(RLPri_Int) ));
1815 /* MovxLQ is not exactly the right thing here. We just
1816 need to get the bottom 16 bits of RAX into dst, and zero
1817 out everything else. Assuming that the helper returns
1818 a UInt with the top 16 bits zeroed out, it'll do,
1819 though. */
1820 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1821 return dst;
1824 default:
1825 break;
1828 /* Deal with unary 64-bit SIMD ops. */
1829 HWord fn = 0;
1830 switch (e->Iex.Unop.op) {
1831 case Iop_CmpNEZ32x2:
1832 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1833 case Iop_CmpNEZ16x4:
1834 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1835 case Iop_CmpNEZ8x8:
1836 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1837 default:
1838 fn = (HWord)0; break;
1840 if (fn != (HWord)0) {
1841 /* Note: the following assumes all helpers are of
1842 signature
1843 ULong fn ( ULong ), and they are
1844 not marked as regparm functions. */
1846 HReg dst = newVRegI(env);
1847 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1848 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1849 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1850 mk_RetLoc_simple(RLPri_Int) ));
1851 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1852 return dst;
1855 break;
1858 /* --------- GET --------- */
1859 case Iex_Get: {
1860 if (ty == Ity_I64) {
1861 HReg dst = newVRegI(env);
1862 addInstr(env, AMD64Instr_Alu64R(
1863 Aalu_MOV,
1864 AMD64RMI_Mem(
1865 AMD64AMode_IR(e->Iex.Get.offset,
1866 hregAMD64_RBP())),
1867 dst));
1868 return dst;
1870 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1871 HReg dst = newVRegI(env);
1872 addInstr(env, AMD64Instr_LoadEX(
1873 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1874 False,
1875 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1876 dst));
1877 return dst;
1879 break;
1882 case Iex_GetI: {
1883 AMD64AMode* am
1884 = genGuestArrayOffset(
1885 env, e->Iex.GetI.descr,
1886 e->Iex.GetI.ix, e->Iex.GetI.bias );
1887 HReg dst = newVRegI(env);
1888 if (ty == Ity_I8) {
1889 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1890 return dst;
1892 if (ty == Ity_I64) {
1893 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1894 return dst;
1896 break;
1899 /* --------- CCALL --------- */
1900 case Iex_CCall: {
1901 HReg dst = newVRegI(env);
1902 vassert(ty == e->Iex.CCall.retty);
1904 /* be very restrictive for now. Only 64-bit ints allowed for
1905 args, and 64 or 32 bits for return type. */
1906 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1907 goto irreducible;
1909 /* Marshal args, do the call. */
1910 UInt addToSp = 0;
1911 RetLoc rloc = mk_RetLoc_INVALID();
1912 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1913 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1914 vassert(is_sane_RetLoc(rloc));
1915 vassert(rloc.pri == RLPri_Int);
1916 vassert(addToSp == 0);
1918 /* Move to dst, and zero out the top 32 bits if the result type is
1919 Ity_I32. Probably overkill, but still .. */
1920 if (e->Iex.CCall.retty == Ity_I64)
1921 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1922 else
1923 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1925 return dst;
1928 /* --------- LITERAL --------- */
1929 /* 64/32/16/8-bit literals */
1930 case Iex_Const:
1931 if (ty == Ity_I64) {
1932 HReg r = newVRegI(env);
1933 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1934 return r;
1935 } else {
1936 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1937 HReg r = newVRegI(env);
1938 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1939 return r;
1942 /* --------- MULTIPLEX --------- */
1943 case Iex_ITE: { // VFD
1944 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1945 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1946 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1947 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1948 HReg dst = newVRegI(env);
1949 addInstr(env, mk_iMOVsd_RR(r1,dst));
1950 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1951 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1952 return dst;
1954 break;
1957 /* --------- TERNARY OP --------- */
1958 case Iex_Triop: {
1959 IRTriop *triop = e->Iex.Triop.details;
1960 /* C3210 flags following FPU partial remainder (fprem), both
1961 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1962 if (triop->op == Iop_PRemC3210F64
1963 || triop->op == Iop_PRem1C3210F64) {
1964 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1965 HReg arg1 = iselDblExpr(env, triop->arg2);
1966 HReg arg2 = iselDblExpr(env, triop->arg3);
1967 HReg dst = newVRegI(env);
1968 addInstr(env, AMD64Instr_A87Free(2));
1970 /* one arg -> top of x87 stack */
1971 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1972 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1974 /* other arg -> top of x87 stack */
1975 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1976 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1978 switch (triop->op) {
1979 case Iop_PRemC3210F64:
1980 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1981 break;
1982 case Iop_PRem1C3210F64:
1983 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1984 break;
1985 default:
1986 vassert(0);
1988 /* Ignore the result, and instead make off with the FPU's
1989 C3210 flags (in the status word). */
1990 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1991 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1992 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1993 return dst;
1995 break;
1998 default:
1999 break;
2000 } /* switch (e->tag) */
2002 /* We get here if no pattern matched. */
2003 irreducible:
2004 ppIRExpr(e);
2005 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2009 /*---------------------------------------------------------*/
2010 /*--- ISEL: Integer expression auxiliaries ---*/
2011 /*---------------------------------------------------------*/
2013 /* --------------------- AMODEs --------------------- */
2015 /* Return an AMode which computes the value of the specified
2016 expression, possibly also adding insns to the code list as a
2017 result. The expression may only be a 64-bit one. */
2020 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2022 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2023 vassert(sane_AMode(am));
2024 return am;
2027 /* DO NOT CALL THIS DIRECTLY ! */
2028 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2030 MatchInfo mi;
2031 DECLARE_PATTERN(p_complex);
2032 IRType ty = typeOfIRExpr(env->type_env,e);
2033 vassert(ty == Ity_I64);
2035 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2036 /* bind0 bind1 bind2 bind3 */
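   /* For example, Add64(Add64(t1, Shl64(t2, 3:I8)), 0x20:I64) matches
      this pattern and becomes the amode 0x20(t1,t2,8). */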
2037 DEFINE_PATTERN(p_complex,
2038 binop( Iop_Add64,
2039 binop( Iop_Add64,
2040 bind(0),
2041 binop(Iop_Shl64, bind(1), bind(2))
2043 bind(3)
2046 if (matchIRExpr(&mi, p_complex, e)) {
2047 const IRExpr* expr1 = mi.bindee[0];
2048 const IRExpr* expr2 = mi.bindee[1];
2049 const IRExpr* imm8 = mi.bindee[2];
2050 const IRExpr* simm32 = mi.bindee[3];
2051 if (imm8->tag == Iex_Const
2052 && imm8->Iex.Const.con->tag == Ico_U8
2053 && imm8->Iex.Const.con->Ico.U8 < 4
2054 /* imm8 is OK, now check simm32 */
2055 && simm32->tag == Iex_Const
2056 && simm32->Iex.Const.con->tag == Ico_U64
2057 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2058 UInt shift = imm8->Iex.Const.con->Ico.U8;
2059 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2060 HReg r1 = iselIntExpr_R(env, expr1);
2061 HReg r2 = iselIntExpr_R(env, expr2);
2062 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2063 return AMD64AMode_IRRS(offset, r1, r2, shift);
2067 /* Add64(expr1, Shl64(expr2, imm)) */
2068 if (e->tag == Iex_Binop
2069 && e->Iex.Binop.op == Iop_Add64
2070 && e->Iex.Binop.arg2->tag == Iex_Binop
2071 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2072 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2073 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2074 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2075 if (shift == 1 || shift == 2 || shift == 3) {
2076 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2077 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2078 return AMD64AMode_IRRS(0, r1, r2, shift);
2082 /* Add64(expr,i) */
2083 if (e->tag == Iex_Binop
2084 && e->Iex.Binop.op == Iop_Add64
2085 && e->Iex.Binop.arg2->tag == Iex_Const
2086 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2087 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2088 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2089 return AMD64AMode_IR(
2090 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2091 r1
2095 /* Doesn't match anything in particular. Generate it into
2096 a register and use that. */
2098 HReg r1 = iselIntExpr_R(env, e);
2099 return AMD64AMode_IR(0, r1);
2104 /* --------------------- RMIs --------------------- */
2106 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2107 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2109 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2111 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2112 /* sanity checks ... */
2113 switch (rmi->tag) {
2114 case Armi_Imm:
2115 return rmi;
2116 case Armi_Reg:
2117 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2118 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2119 return rmi;
2120 case Armi_Mem:
2121 vassert(sane_AMode(rmi->Armi.Mem.am));
2122 return rmi;
2123 default:
2124 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2128 /* DO NOT CALL THIS DIRECTLY ! */
2129 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2131 IRType ty = typeOfIRExpr(env->type_env,e);
2132 vassert(ty == Ity_I64 || ty == Ity_I32
2133 || ty == Ity_I16 || ty == Ity_I8);
2135 /* special case: immediate 64/32/16/8 */
2136 if (e->tag == Iex_Const) {
2137 switch (e->Iex.Const.con->tag) {
2138 case Ico_U64:
2139 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2140 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2142 break;
2143 case Ico_U32:
2144 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2145 case Ico_U16:
2146 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2147 case Ico_U8:
2148 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2149 default:
2150 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2154 /* special case: 64-bit GET */
2155 if (e->tag == Iex_Get && ty == Ity_I64) {
2156 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2157 hregAMD64_RBP()));
2160 /* special case: 64-bit load from memory */
2161 if (e->tag == Iex_Load && ty == Ity_I64
2162 && e->Iex.Load.end == Iend_LE) {
2163 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2164 return AMD64RMI_Mem(am);
2167 /* default case: calculate into a register and return that */
2169 HReg r = iselIntExpr_R ( env, e );
2170 return AMD64RMI_Reg(r);
2175 /* --------------------- RIs --------------------- */
2177 /* Calculate an expression into an AMD64RI operand. As with
2178 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2179 bits. */
2181 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2183 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2184 /* sanity checks ... */
2185 switch (ri->tag) {
2186 case Ari_Imm:
2187 return ri;
2188 case Ari_Reg:
2189 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2190 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2191 return ri;
2192 default:
2193 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2197 /* DO NOT CALL THIS DIRECTLY ! */
2198 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2200 IRType ty = typeOfIRExpr(env->type_env,e);
2201 vassert(ty == Ity_I64 || ty == Ity_I32
2202 || ty == Ity_I16 || ty == Ity_I8);
2204 /* special case: immediate */
2205 if (e->tag == Iex_Const) {
2206 switch (e->Iex.Const.con->tag) {
2207 case Ico_U64:
2208 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2209 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2211 break;
2212 case Ico_U32:
2213 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2214 case Ico_U16:
2215 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2216 case Ico_U8:
2217 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2218 default:
2219 vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2223 /* default case: calculate into a register and return that */
2225 HReg r = iselIntExpr_R ( env, e );
2226 return AMD64RI_Reg(r);
2231 /* --------------------- RMs --------------------- */
2233 /* Similarly, calculate an expression into an AMD64RM operand. As
2234 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2235 bits. */
2237 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2239 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2240 /* sanity checks ... */
2241 switch (rm->tag) {
2242 case Arm_Reg:
2243 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2244 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2245 return rm;
2246 case Arm_Mem:
2247 vassert(sane_AMode(rm->Arm.Mem.am));
2248 return rm;
2249 default:
2250 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2254 /* DO NOT CALL THIS DIRECTLY ! */
2255 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2257 IRType ty = typeOfIRExpr(env->type_env,e);
2258 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2260 /* special case: 64-bit GET */
2261 if (e->tag == Iex_Get && ty == Ity_I64) {
2262 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2263 hregAMD64_RBP()));
2266 /* special case: load from memory */
2268 /* default case: calculate into a register and return that */
2270 HReg r = iselIntExpr_R ( env, e );
2271 return AMD64RM_Reg(r);
2276 /* --------------------- CONDCODE --------------------- */
2278 /* Generate code to evaluate a bit-typed expression, returning the
2279 condition code which would correspond when the expression would
2280 notionally have returned 1. */
2282 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2284 /* Uh, there's nothing we can sanity check here, unfortunately. */
2285 return iselCondCode_wrk(env,e);
2288 /* DO NOT CALL THIS DIRECTLY ! */
2289 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2291 vassert(e);
2292 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2294 /* var */
2295 if (e->tag == Iex_RdTmp) {
2296 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2297 HReg dst = newVRegI(env);
2298 addInstr(env, mk_iMOVsd_RR(r64,dst));
2299 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2300 return Acc_NZ;
2303 /* Constant 1:Bit */
2304 if (e->tag == Iex_Const) {
2305 HReg r;
2306 vassert(e->Iex.Const.con->tag == Ico_U1);
2307 vassert(e->Iex.Const.con->Ico.U1 == True
2308 || e->Iex.Const.con->Ico.U1 == False);
2309 r = newVRegI(env);
2310 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2311 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
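      /* The xor of r with itself is here only to set ZF.  With ZF known
         to be 1, Acc_Z is a condition that always holds and Acc_NZ one
         that never does, so return whichever matches the constant. */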
2312 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2315 /* Not1(...) */
2316 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2317 /* Generate code for the arg, and negate the test condition */
2318 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2321 /* --- patterns rooted at: 64to1 --- */
2323 /* 64to1 */
2324 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2325 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2326 addInstr(env, AMD64Instr_Test64(1,reg));
2327 return Acc_NZ;
2330 /* --- patterns rooted at: 32to1 --- */
2332 /* 32to1 */
2333 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2334 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2335 addInstr(env, AMD64Instr_Test64(1,reg));
2336 return Acc_NZ;
2339 /* --- patterns rooted at: CmpNEZ8 --- */
2341 /* CmpNEZ8(x) */
2342 if (e->tag == Iex_Unop
2343 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2344 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2345 addInstr(env, AMD64Instr_Test64(0xFF,r));
2346 return Acc_NZ;
2349 /* --- patterns rooted at: CmpNEZ16 --- */
2351 /* CmpNEZ16(x) */
2352 if (e->tag == Iex_Unop
2353 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2354 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2355 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2356 return Acc_NZ;
2359 /* --- patterns rooted at: CmpNEZ32 --- */
2361 if (e->tag == Iex_Unop
2362 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2363 IRExpr* arg = e->Iex.Unop.arg;
2364 if (arg->tag == Iex_Binop
2365 && (arg->Iex.Binop.op == Iop_Or32
2366 || arg->Iex.Binop.op == Iop_And32)) {
2367 /* CmpNEZ32(Or32(x,y)) */
2368 /* CmpNEZ32(And32(x,y)) */
2369 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2370 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2371 HReg tmp = newVRegI(env);
2372 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2373 addInstr(env, AMD64Instr_Alu32R(
2374 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2375 rmi1, tmp));
2376 return Acc_NZ;
2378 /* CmpNEZ32(x) */
2379 HReg r1 = iselIntExpr_R(env, arg);
2380 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2381 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2382 return Acc_NZ;
2385 /* --- patterns rooted at: CmpNEZ64 --- */
2387 if (e->tag == Iex_Unop
2388 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2389 IRExpr* arg = e->Iex.Unop.arg;
2390 if (arg->tag == Iex_Binop
2391 && (arg->Iex.Binop.op == Iop_Or64
2392 || arg->Iex.Binop.op == Iop_And64)) {
2393 /* CmpNEZ64(Or64(x,y)) */
2394 /* CmpNEZ64(And64(x,y)) */
2395 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2396 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2397 HReg tmp = newVRegI(env);
2398 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2399 addInstr(env, AMD64Instr_Alu64R(
2400 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2401 rmi1, tmp));
2402 return Acc_NZ;
2404 /* CmpNEZ64(x) */
2405 HReg r1 = iselIntExpr_R(env, arg);
2406 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2407 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2408 return Acc_NZ;
2411 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2413 /* CmpEQ8 / CmpNE8 */
2414 if (e->tag == Iex_Binop
2415 && (e->Iex.Binop.op == Iop_CmpEQ8
2416 || e->Iex.Binop.op == Iop_CmpNE8
2417 || e->Iex.Binop.op == Iop_CasCmpEQ8
2418 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2419 if (isZeroU8(e->Iex.Binop.arg2)) {
2420 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2421 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2422 switch (e->Iex.Binop.op) {
2423 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2424 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2425 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2427 } else {
2428 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2429 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2430 HReg r = newVRegI(env);
2431 addInstr(env, mk_iMOVsd_RR(r1,r));
2432 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2433 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
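            /* After the xor, r is zero exactly when the two operands
               agree in their low 8 bits; the AND with 0xFF throws away
               the irrelevant upper bits and sets the flags that the
               Z/NZ test below inspects. */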
2434 switch (e->Iex.Binop.op) {
2435 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2436 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2437 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2442 /* CmpEQ16 / CmpNE16 */
2443 if (e->tag == Iex_Binop
2444 && (e->Iex.Binop.op == Iop_CmpEQ16
2445 || e->Iex.Binop.op == Iop_CmpNE16
2446 || e->Iex.Binop.op == Iop_CasCmpEQ16
2447 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2448 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2449 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2450 HReg r = newVRegI(env);
2451 addInstr(env, mk_iMOVsd_RR(r1,r));
2452 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2453 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2454 switch (e->Iex.Binop.op) {
2455 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2456 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2457 default: vpanic("iselCondCode(amd64): CmpXX16");
2461 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2462 Saves a "movq %rax, %tmp" compared to the default route. */
2463 if (e->tag == Iex_Binop
2464 && e->Iex.Binop.op == Iop_CmpNE64
2465 && e->Iex.Binop.arg1->tag == Iex_CCall
2466 && e->Iex.Binop.arg2->tag == Iex_Const) {
2467 IRExpr* cal = e->Iex.Binop.arg1;
2468 IRExpr* con = e->Iex.Binop.arg2;
2469 HReg tmp = newVRegI(env);
2470 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2471 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2472 vassert(con->Iex.Const.con->tag == Ico_U64);
2473 /* Marshal args, do the call. */
2474 UInt addToSp = 0;
2475 RetLoc rloc = mk_RetLoc_INVALID();
2476 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2477 cal->Iex.CCall.cee,
2478 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2479 vassert(is_sane_RetLoc(rloc));
2480 vassert(rloc.pri == RLPri_Int);
2481 vassert(addToSp == 0);
2482 /* */
2483 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2484 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2485 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2486 return Acc_NZ;
2489 /* Cmp*64*(x,y) */
2490 if (e->tag == Iex_Binop
2491 && (e->Iex.Binop.op == Iop_CmpEQ64
2492 || e->Iex.Binop.op == Iop_CmpNE64
2493 || e->Iex.Binop.op == Iop_CmpLT64S
2494 || e->Iex.Binop.op == Iop_CmpLT64U
2495 || e->Iex.Binop.op == Iop_CmpLE64S
2496 || e->Iex.Binop.op == Iop_CmpLE64U
2497 || e->Iex.Binop.op == Iop_CasCmpEQ64
2498 || e->Iex.Binop.op == Iop_CasCmpNE64
2499 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2500 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2501 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2502 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2503 switch (e->Iex.Binop.op) {
2504 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2505 case Iop_CmpNE64:
2506 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2507 case Iop_CmpLT64S: return Acc_L;
2508 case Iop_CmpLT64U: return Acc_B;
2509 case Iop_CmpLE64S: return Acc_LE;
2510 case Iop_CmpLE64U: return Acc_BE;
2511 default: vpanic("iselCondCode(amd64): CmpXX64");
2515 /* Cmp*32*(x,y) */
2516 if (e->tag == Iex_Binop
2517 && (e->Iex.Binop.op == Iop_CmpEQ32
2518 || e->Iex.Binop.op == Iop_CmpNE32
2519 || e->Iex.Binop.op == Iop_CmpLT32S
2520 || e->Iex.Binop.op == Iop_CmpLT32U
2521 || e->Iex.Binop.op == Iop_CmpLE32S
2522 || e->Iex.Binop.op == Iop_CmpLE32U
2523 || e->Iex.Binop.op == Iop_CasCmpEQ32
2524 || e->Iex.Binop.op == Iop_CasCmpNE32
2525 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2526 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2527 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2528 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2529 switch (e->Iex.Binop.op) {
2530 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2531 case Iop_CmpNE32:
2532 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2533 case Iop_CmpLT32S: return Acc_L;
2534 case Iop_CmpLT32U: return Acc_B;
2535 case Iop_CmpLE32S: return Acc_LE;
2536 case Iop_CmpLE32U: return Acc_BE;
2537 default: vpanic("iselCondCode(amd64): CmpXX32");
2541 ppIRExpr(e);
2542 vpanic("iselCondCode(amd64)");
2546 /*---------------------------------------------------------*/
2547 /*--- ISEL: Integer expressions (128 bit) ---*/
2548 /*---------------------------------------------------------*/
2550 /* Compute a 128-bit value into a register pair, which is returned as
2551 the first two parameters. As with iselIntExpr_R, these may be
2552 either real or virtual regs; in any case they must not be changed
2553 by subsequent code emitted by the caller. */
2555 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2556 ISelEnv* env, const IRExpr* e )
2558 iselInt128Expr_wrk(rHi, rLo, env, e);
2559 # if 0
2560 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2561 # endif
2562 vassert(hregClass(*rHi) == HRcInt64);
2563 vassert(hregIsVirtual(*rHi));
2564 vassert(hregClass(*rLo) == HRcInt64);
2565 vassert(hregIsVirtual(*rLo));
2568 /* DO NOT CALL THIS DIRECTLY ! */
2569 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2570 ISelEnv* env, const IRExpr* e )
2572 vassert(e);
2573 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2575 /* read 128-bit IRTemp */
2576 if (e->tag == Iex_RdTmp) {
2577 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2578 return;
2581 /* --------- BINARY ops --------- */
2582 if (e->tag == Iex_Binop) {
2583 switch (e->Iex.Binop.op) {
2584 /* 64 x 64 -> 128 multiply */
2585 case Iop_MullU64:
2586 case Iop_MullS64: {
2587 /* get one operand into %rax, and the other into a R/M.
2588 Need to make an educated guess about which is better in
2589 which. */
2590 HReg tLo = newVRegI(env);
2591 HReg tHi = newVRegI(env);
2592 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2593 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2594 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2595 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2596 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2597 /* Result is now in RDX:RAX. Tell the caller. */
2598 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2599 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2600 *rHi = tHi;
2601 *rLo = tLo;
2602 return;
2605 /* 128 x 64 -> (64(rem),64(div)) division */
2606 case Iop_DivModU128to64:
2607 case Iop_DivModS128to64: {
2608 /* Get the 128-bit operand into rdx:rax, and the other into
2609 any old R/M. */
2610 HReg sHi, sLo;
2611 HReg tLo = newVRegI(env);
2612 HReg tHi = newVRegI(env);
2613 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2614 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2615 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2616 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2617 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2618 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2619 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2620 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2621 *rHi = tHi;
2622 *rLo = tLo;
2623 return;
2626 /* 64HLto128(e1,e2) */
2627 case Iop_64HLto128:
2628 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2629 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2630 return;
2632 default:
2633 break;
2635 } /* if (e->tag == Iex_Binop) */
2637 ppIRExpr(e);
2638 vpanic("iselInt128Expr");
2642 /*---------------------------------------------------------*/
2643 /*--- ISEL: Floating point expressions (32 bit) ---*/
2644 /*---------------------------------------------------------*/
2646 /* Nothing interesting here; really just wrappers for
2647 64-bit stuff. */
2649 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2651 HReg r = iselFltExpr_wrk( env, e );
2652 # if 0
2653 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2654 # endif
2655 vassert(hregClass(r) == HRcVec128);
2656 vassert(hregIsVirtual(r));
2657 return r;
2660 /* DO NOT CALL THIS DIRECTLY */
2661 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2663 IRType ty = typeOfIRExpr(env->type_env,e);
2664 vassert(ty == Ity_F32);
2666 if (e->tag == Iex_RdTmp) {
2667 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2670 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2671 AMD64AMode* am;
2672 HReg res = newVRegV(env);
2673 vassert(e->Iex.Load.ty == Ity_F32);
2674 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2675 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2676 return res;
2679 if (e->tag == Iex_Binop
2680 && e->Iex.Binop.op == Iop_F64toF32) {
2681 /* Although the result is still held in a standard SSE register,
2682 we need to round it to reflect the loss of accuracy/range
2683 entailed in casting it to a 32-bit float. */
2684 HReg dst = newVRegV(env);
2685 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2686 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2687 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2688 set_SSE_rounding_default( env );
2689 return dst;
2692 if (e->tag == Iex_Get) {
2693 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2694 hregAMD64_RBP() );
2695 HReg res = newVRegV(env);
2696 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2697 return res;
2700 if (e->tag == Iex_Unop
2701 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2702 /* Given an I32, produce an IEEE754 float with the same bit
2703 pattern. */
2704 HReg dst = newVRegV(env);
2705 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2706 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2707 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2708 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2709 return dst;
2712 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2713 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2714 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2715 HReg dst = newVRegV(env);
2717 /* arg now holds the value to be rounded. The first thing to do
2718 is set the FPU's rounding mode accordingly. */
2720 /* Set host x87 rounding mode */
2721 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2723 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2724 addInstr(env, AMD64Instr_A87Free(1));
2725 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2726 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2727 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2728 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2730 /* Restore default x87 rounding. */
2731 set_FPU_rounding_default( env );
2733 return dst;
2736 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2737 /* Sigh ... very rough code. Could do much better. */
2738 /* Get the 128-bit literal 00---0 10---0 into a register
2739 and xor it with the value to be negated. */
2740 HReg r1 = newVRegI(env);
2741 HReg dst = newVRegV(env);
2742 HReg tmp = newVRegV(env);
2743 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2744 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2745 addInstr(env, mk_vMOVsd_RR(src,tmp));
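      /* The two pushes below build a 16-byte literal at (%rsp) with only
         bit 31 set; xor-ing that into the value flips just the sign bit
         of the low F32 lane. */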
2746 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2747 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2748 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2749 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2750 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2751 add_to_rsp(env, 16);
2752 return dst;
2755 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2756 IRQop *qop = e->Iex.Qop.details;
2757 HReg dst = newVRegV(env);
2758 HReg argX = iselFltExpr(env, qop->arg2);
2759 HReg argY = iselFltExpr(env, qop->arg3);
2760 HReg argZ = iselFltExpr(env, qop->arg4);
2761 /* XXXROUNDINGFIXME */
2762 /* set roundingmode here */
2763 /* subq $16, %rsp -- make a space*/
2764 sub_from_rsp(env, 16);
2765 /* Prepare 4 arg regs:
2766 leaq 0(%rsp), %rdi
2767 leaq 4(%rsp), %rsi
2768 leaq 8(%rsp), %rdx
2769 leaq 12(%rsp), %rcx */
2771 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2772 hregAMD64_RDI()));
2773 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2774 hregAMD64_RSI()));
2775 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2776 hregAMD64_RDX()));
2777 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2778 hregAMD64_RCX()));
2779 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2780 movss %argX, 0(%rsi)
2781 movss %argY, 0(%rdx)
2782 movss %argZ, 0(%rcx) */
2784 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2785 AMD64AMode_IR(0, hregAMD64_RSI())));
2786 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2787 AMD64AMode_IR(0, hregAMD64_RDX())));
2788 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2789 AMD64AMode_IR(0, hregAMD64_RCX())));
2790 /* call the helper */
2791 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2792 (ULong)(HWord)h_generic_calc_MAddF32,
2793 4, mk_RetLoc_simple(RLPri_None) ));
2794 /* fetch the result back from 0(%rsp), the slot that %rdi was
2795 pointed at before the call. */
2796 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2797 AMD64AMode_IR(0, hregAMD64_RSP())));
2798 /* and finally, clear the space */
2799 add_to_rsp(env, 16);
2800 return dst;
2803 ppIRExpr(e);
2804 vpanic("iselFltExpr_wrk");
2808 /*---------------------------------------------------------*/
2809 /*--- ISEL: Floating point expressions (64 bit) ---*/
2810 /*---------------------------------------------------------*/
2812 /* Compute a 64-bit floating point value into the lower half of an xmm
2813 register, the identity of which is returned. As with
2814 iselIntExpr_R, the returned reg will be virtual, and it must not be
2815 changed by subsequent code emitted by the caller. */
2818 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2820 Type S (1 bit) E (11 bits) F (52 bits)
2821 ---- --------- ----------- -----------
2822 signalling NaN u 2047 (max) .0uuuuu---u
2823 (with at least
2824 one 1 bit)
2825 quiet NaN u 2047 (max) .1uuuuu---u
2827 negative infinity 1 2047 (max) .000000---0
2829 positive infinity 0 2047 (max) .000000---0
2831 negative zero 1 0 .000000---0
2833 positive zero 0 0 .000000---0 */
2836 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2838 HReg r = iselDblExpr_wrk( env, e );
2839 # if 0
2840 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2841 # endif
2842 vassert(hregClass(r) == HRcVec128);
2843 vassert(hregIsVirtual(r));
2844 return r;
2847 /* DO NOT CALL THIS DIRECTLY */
2848 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2850 IRType ty = typeOfIRExpr(env->type_env,e);
2851 vassert(e);
2852 vassert(ty == Ity_F64);
2854 if (e->tag == Iex_RdTmp) {
2855 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2858 if (e->tag == Iex_Const) {
2859 union { ULong u64; Double f64; } u;
2860 HReg res = newVRegV(env);
2861 HReg tmp = newVRegI(env);
2862 vassert(sizeof(u) == 8);
2863 vassert(sizeof(u.u64) == 8);
2864 vassert(sizeof(u.f64) == 8);
2866 if (e->Iex.Const.con->tag == Ico_F64) {
2867 u.f64 = e->Iex.Const.con->Ico.F64;
2869 else if (e->Iex.Const.con->tag == Ico_F64i) {
2870 u.u64 = e->Iex.Const.con->Ico.F64i;
2872 else
2873 vpanic("iselDblExpr(amd64): const");
2875 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2876 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2877 addInstr(env, AMD64Instr_SseLdSt(
2878 True/*load*/, 8, res,
2879 AMD64AMode_IR(0, hregAMD64_RSP())
2881 add_to_rsp(env, 8);
2882 return res;
2885 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2886 AMD64AMode* am;
2887 HReg res = newVRegV(env);
2888 vassert(e->Iex.Load.ty == Ity_F64);
2889 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2890 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2891 return res;
2894 if (e->tag == Iex_Get) {
2895 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2896 hregAMD64_RBP() );
2897 HReg res = newVRegV(env);
2898 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2899 return res;
2902 if (e->tag == Iex_GetI) {
2903 AMD64AMode* am
2904 = genGuestArrayOffset(
2905 env, e->Iex.GetI.descr,
2906 e->Iex.GetI.ix, e->Iex.GetI.bias );
2907 HReg res = newVRegV(env);
2908 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2909 return res;
2912 if (e->tag == Iex_Triop) {
2913 IRTriop *triop = e->Iex.Triop.details;
2914 AMD64SseOp op = Asse_INVALID;
2915 switch (triop->op) {
2916 case Iop_AddF64: op = Asse_ADDF; break;
2917 case Iop_SubF64: op = Asse_SUBF; break;
2918 case Iop_MulF64: op = Asse_MULF; break;
2919 case Iop_DivF64: op = Asse_DIVF; break;
2920 default: break;
2922 if (op != Asse_INVALID) {
2923 HReg dst = newVRegV(env);
2924 HReg argL = iselDblExpr(env, triop->arg2);
2925 HReg argR = iselDblExpr(env, triop->arg3);
2926 addInstr(env, mk_vMOVsd_RR(argL, dst));
2927 /* XXXROUNDINGFIXME */
2928 /* set roundingmode here */
2929 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2930 return dst;
2934 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2935 IRQop *qop = e->Iex.Qop.details;
2936 HReg dst = newVRegV(env);
2937 HReg argX = iselDblExpr(env, qop->arg2);
2938 HReg argY = iselDblExpr(env, qop->arg3);
2939 HReg argZ = iselDblExpr(env, qop->arg4);
2940 /* XXXROUNDINGFIXME */
2941 /* set roundingmode here */
2942 /* subq $32, %rsp -- make a space*/
2943 sub_from_rsp(env, 32);
2944 /* Prepare 4 arg regs:
2945 leaq 0(%rsp), %rdi
2946 leaq 8(%rsp), %rsi
2947 leaq 16(%rsp), %rdx
2948 leaq 24(%rsp), %rcx */
2950 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2951 hregAMD64_RDI()));
2952 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2953 hregAMD64_RSI()));
2954 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2955 hregAMD64_RDX()));
2956 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2957 hregAMD64_RCX()));
2958 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2959 movsd %argX, 0(%rsi)
2960 movsd %argY, 0(%rdx)
2961 movsd %argZ, 0(%rcx) */
2963 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2964 AMD64AMode_IR(0, hregAMD64_RSI())));
2965 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2966 AMD64AMode_IR(0, hregAMD64_RDX())));
2967 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2968 AMD64AMode_IR(0, hregAMD64_RCX())));
2969 /* call the helper */
2970 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2971 (ULong)(HWord)h_generic_calc_MAddF64,
2972 4, mk_RetLoc_simple(RLPri_None) ));
2973 /* fetch the result back from 0(%rsp), the slot that %rdi was
2974 pointed at before the call. */
2975 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2976 AMD64AMode_IR(0, hregAMD64_RSP())));
2977 /* and finally, clear the space */
2978 add_to_rsp(env, 32);
2979 return dst;
2982 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2983 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2984 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
2985 HReg dst = newVRegV(env);
2987 /* arg now holds the value to be rounded. The first thing to do
2988 is set the FPU's rounding mode accordingly. */
2990 /* Set host x87 rounding mode */
2991 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2993 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2994 addInstr(env, AMD64Instr_A87Free(1));
2995 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2996 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2997 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2998 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3000 /* Restore default x87 rounding. */
3001 set_FPU_rounding_default( env );
3003 return dst;
3006 IRTriop *triop = e->Iex.Triop.details;
3007 if (e->tag == Iex_Triop
3008 && (triop->op == Iop_ScaleF64
3009 || triop->op == Iop_AtanF64
3010 || triop->op == Iop_Yl2xF64
3011 || triop->op == Iop_Yl2xp1F64
3012 || triop->op == Iop_PRemF64
3013 || triop->op == Iop_PRem1F64)
3015 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3016 HReg arg1 = iselDblExpr(env, triop->arg2);
3017 HReg arg2 = iselDblExpr(env, triop->arg3);
3018 HReg dst = newVRegV(env);
3019 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3020 || triop->op == Iop_PRemF64
3021 || triop->op == Iop_PRem1F64);
3022 addInstr(env, AMD64Instr_A87Free(2));
3024 /* one arg -> top of x87 stack */
3025 addInstr(env, AMD64Instr_SseLdSt(
3026 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3027 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3029 /* other arg -> top of x87 stack */
3030 addInstr(env, AMD64Instr_SseLdSt(
3031 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3032 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3034 /* do it */
3035 /* XXXROUNDINGFIXME */
3036 /* set roundingmode here */
3037 switch (triop->op) {
3038 case Iop_ScaleF64:
3039 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3040 break;
3041 case Iop_AtanF64:
3042 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3043 break;
3044 case Iop_Yl2xF64:
3045 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3046 break;
3047 case Iop_Yl2xp1F64:
3048 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3049 break;
3050 case Iop_PRemF64:
3051 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3052 break;
3053 case Iop_PRem1F64:
3054 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3055 break;
3056 default:
3057 vassert(0);
3060 /* save result */
3061 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3062 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3063 return dst;
3066 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3067 HReg dst = newVRegV(env);
3068 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3069 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3070 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3071 set_SSE_rounding_default( env );
3072 return dst;
3075 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3076 HReg dst = newVRegV(env);
3077 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3078 set_SSE_rounding_default( env );
3079 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3080 return dst;
3083 if (e->tag == Iex_Unop
3084 && (e->Iex.Unop.op == Iop_NegF64
3085 || e->Iex.Unop.op == Iop_AbsF64)) {
3086 /* Sigh ... very rough code. Could do much better. */
3087 /* Get the 128-bit literal 00---0 10---0 into a register
3088 and xor / and-not it with the value to be negated. */
3089 HReg r1 = newVRegI(env);
3090 HReg dst = newVRegV(env);
3091 HReg tmp = newVRegV(env);
3092 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3093 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3094 addInstr(env, mk_vMOVsd_RR(src,tmp));
3095 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3096 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3097 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3098 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
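      /* dst now holds a 128-bit literal with only bit 63 set: XOR with
         it flips the sign of the low F64 lane (negation), while ANDN
         clears that bit instead (absolute value). */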
3100 if (e->Iex.Unop.op == Iop_NegF64)
3101 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3102 else
3103 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3105 add_to_rsp(env, 16);
3106 return dst;
3109 if (e->tag == Iex_Binop) {
3110 A87FpOp fpop = Afp_INVALID;
3111 switch (e->Iex.Binop.op) {
3112 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3113 case Iop_SinF64: fpop = Afp_SIN; break;
3114 case Iop_CosF64: fpop = Afp_COS; break;
3115 case Iop_TanF64: fpop = Afp_TAN; break;
3116 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3117 default: break;
3119 if (fpop != Afp_INVALID) {
3120 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3121 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3122 HReg dst = newVRegV(env);
3123 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3124 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3125 addInstr(env, AMD64Instr_A87Free(nNeeded));
3126 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3127 /* XXXROUNDINGFIXME */
3128 /* set roundingmode here */
3129 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3130 codes. I don't think that matters, since this insn
3131 selector never generates such an instruction intervening
3132 between a flag-setting instruction and a flag-using
3133 instruction. */
3134 addInstr(env, AMD64Instr_A87FpOp(fpop));
3135 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3136 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3137 return dst;
3141 if (e->tag == Iex_Unop) {
3142 switch (e->Iex.Unop.op) {
3143 //.. case Iop_I32toF64: {
3144 //.. HReg dst = newVRegF(env);
3145 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3146 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3147 //.. set_FPU_rounding_default(env);
3148 //.. addInstr(env, X86Instr_FpLdStI(
3149 //.. True/*load*/, 4, dst,
3150 //.. X86AMode_IR(0, hregX86_ESP())));
3151 //.. add_to_esp(env, 4);
3152 //.. return dst;
3153 //.. }
3154 case Iop_ReinterpI64asF64: {
3155 /* Given an I64, produce an IEEE754 double with the same
3156 bit pattern. */
3157 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3158 HReg dst = newVRegV(env);
3159 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3160 /* paranoia */
3161 set_SSE_rounding_default(env);
3162 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3163 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3164 return dst;
3166 case Iop_F32toF64: {
3167 HReg f32;
3168 HReg f64 = newVRegV(env);
3169 /* this shouldn't be necessary, but be paranoid ... */
3170 set_SSE_rounding_default(env);
3171 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3172 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3173 return f64;
3175 default:
3176 break;
3180 /* --------- MULTIPLEX --------- */
3181 if (e->tag == Iex_ITE) { // VFD
3182 HReg r1, r0, dst;
3183 vassert(ty == Ity_F64);
3184 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3185 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3186 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3187 dst = newVRegV(env);
3188 addInstr(env, mk_vMOVsd_RR(r1,dst));
3189 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3190 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3191 return dst;
3194 ppIRExpr(e);
3195 vpanic("iselDblExpr_wrk");
3199 /*---------------------------------------------------------*/
3200 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3201 /*---------------------------------------------------------*/
3203 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3205 HReg r = iselVecExpr_wrk( env, e );
3206 # if 0
3207 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3208 # endif
3209 vassert(hregClass(r) == HRcVec128);
3210 vassert(hregIsVirtual(r));
3211 return r;
3215 /* DO NOT CALL THIS DIRECTLY */
3216 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3218 HWord fn = 0; /* address of helper fn, if required */
3219 Bool arg1isEReg = False;
3220 AMD64SseOp op = Asse_INVALID;
3221 vassert(e);
3222 IRType ty = typeOfIRExpr(env->type_env, e);
3223 vassert(ty == Ity_V128);
3224 UInt laneBits = 0;
3226 if (e->tag == Iex_RdTmp) {
3227 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3230 if (e->tag == Iex_Get) {
3231 HReg dst = newVRegV(env);
3232 addInstr(env, AMD64Instr_SseLdSt(
3233 True/*load*/,
3234 16,
3235 dst,
3236 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3239 return dst;
3242 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3243 HReg dst = newVRegV(env);
3244 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3245 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3246 return dst;
3249 if (e->tag == Iex_Const) {
3250 HReg dst = newVRegV(env);
3251 vassert(e->Iex.Const.con->tag == Ico_V128);
3252 switch (e->Iex.Const.con->Ico.V128) {
3253 case 0x0000:
3254 dst = generate_zeroes_V128(env);
3255 break;
3256 case 0xFFFF:
3257 dst = generate_ones_V128(env);
3258 break;
3259 default: {
3260 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3261 /* do push_uimm64 twice, first time for the high-order half. */
3262 push_uimm64(env, bitmask8_to_bytemask64(
3263 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3265 push_uimm64(env, bitmask8_to_bytemask64(
3266 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3268 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3269 add_to_rsp(env, 16);
3270 break;
3273 return dst;
3276 if (e->tag == Iex_Unop) {
3277 switch (e->Iex.Unop.op) {
3279 case Iop_NotV128: {
3280 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3281 return do_sse_NotV128(env, arg);
3284 case Iop_CmpNEZ64x2: {
3285 /* We can use SSE2 instructions for this. */
3286 /* Ideally, we want to do a 64Ix2 comparison against zero of
3287 the operand. Problem is no such insn exists. Solution
3288 therefore is to do a 32Ix4 comparison instead, and bitwise-
3289 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3290 let the not'd result of this initial comparison be a:b:c:d.
3291 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3292 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3293 giving the required result.
3295 The required selection sequence is 2,3,0,1, which
3296 according to Intel's documentation means the pshufd
3297 literal value is 0xB1, that is,
3298 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */
3300 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3301 HReg tmp = generate_zeroes_V128(env);
3302 HReg dst = newVRegV(env);
3303 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3304 tmp = do_sse_NotV128(env, tmp);
3305 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3306 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3307 return dst;
3310 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3311 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3312 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3313 do_CmpNEZ_vector:
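      /* Lane-wise compare-equal against zero makes each zero lane
         all-ones; inverting that result gives the required CmpNEZ
         semantics. */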
3315 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3316 HReg tmp = newVRegV(env);
3317 HReg zero = generate_zeroes_V128(env);
3318 HReg dst;
3319 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3320 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3321 dst = do_sse_NotV128(env, tmp);
3322 return dst;
3325 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3326 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3327 do_32Fx4_unary:
3329 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3330 HReg dst = newVRegV(env);
3331 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3332 return dst;
3335 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3336 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3337 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3338 do_32F0x4_unary:
3340 /* A bit subtle. We have to copy the arg to the result
3341 register first, because actually doing the SSE scalar insn
3342 leaves the upper 3/4 of the destination register
3343 unchanged. Whereas the required semantics of these
3344 primops is that the upper 3/4 is simply copied in from the
3345 argument. */
3346 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3347 HReg dst = newVRegV(env);
3348 addInstr(env, mk_vMOVsd_RR(arg, dst));
3349 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3350 return dst;
3353 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3354 do_64F0x2_unary:
3356 /* A bit subtle. We have to copy the arg to the result
3357 register first, because actually doing the SSE scalar insn
3358 leaves the upper half of the destination register
3359 unchanged. Whereas the required semantics of these
3360 primops is that the upper half is simply copied in from the
3361 argument. */
3362 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3363 HReg dst = newVRegV(env);
3364 addInstr(env, mk_vMOVsd_RR(arg, dst));
3365 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3366 return dst;
3369 case Iop_32UtoV128: {
3370 // FIXME maybe just use MOVQ here?
3371 HReg dst = newVRegV(env);
3372 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3373 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3374 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3375 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3376 return dst;
3379 case Iop_64UtoV128: {
3380 // FIXME maybe just use MOVQ here?
3381 HReg dst = newVRegV(env);
3382 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3383 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3384 addInstr(env, AMD64Instr_Push(rmi));
3385 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3386 add_to_rsp(env, 8);
3387 return dst;
3390 case Iop_V256toV128_0:
3391 case Iop_V256toV128_1: {
3392 HReg vHi, vLo;
3393 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3394 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3397 case Iop_F16toF32x4: {
3398 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3399 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3400 HReg dst = newVRegV(env);
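            /* The four F16 lanes arrive as one 64-bit scalar: MOVQ parks
               them in the low half of an xmm, and the F16C conversion
               (Asse_F16toF32, i.e. vcvtph2ps) widens them to four F32
               lanes. */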
3401 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3402 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3403 return dst;
3405 break;
3408 default:
3409 break;
3410 } /* switch (e->Iex.Unop.op) */
3411 } /* if (e->tag == Iex_Unop) */
3413 if (e->tag == Iex_Binop) {
3414 switch (e->Iex.Binop.op) {
3416 case Iop_Sqrt64Fx2:
3417 case Iop_Sqrt32Fx4: {
3418 /* :: (rmode, vec) -> vec */
3419 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3420 HReg dst = newVRegV(env);
3421 /* XXXROUNDINGFIXME */
3422 /* set roundingmode here */
3423 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3424 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3425 (Asse_SQRTF, arg, dst));
3426 return dst;
3429 /* FIXME: could we generate MOVQ here? */
3430 case Iop_SetV128lo64: {
3431 HReg dst = newVRegV(env);
3432 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3433 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3434 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3435 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3436 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3437 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3438 return dst;
3441 /* FIXME: could we generate MOVD here? */
3442 case Iop_SetV128lo32: {
3443 HReg dst = newVRegV(env);
3444 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3445 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3446 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3447 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3448 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3449 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3450 return dst;
3453 case Iop_64HLtoV128: {
3454 const IRExpr* arg1 = e->Iex.Binop.arg1;
3455 const IRExpr* arg2 = e->Iex.Binop.arg2;
3456 HReg dst = newVRegV(env);
3457 HReg tmp = newVRegV(env);
3458 HReg qHi = iselIntExpr_R(env, arg1);
3459 // If the args are trivially the same (tmp or const), use the same
3460 // source register for both, and only one movq since those are
3461 // (relatively) expensive.
3462 if (areAtomsAndEqual(arg1, arg2)) {
3463 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3464 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3465 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3466 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3467 } else {
3468 HReg qLo = iselIntExpr_R(env, arg2);
3469 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3470 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3471 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3472 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
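/* In this general case dst is assembled entirely in registers,
   roughly:
      movq qHi -> dst; shift dst left 64 bits; movq qLo -> tmp;
      OR tmp into dst
   so dst ends up as arg1 (bits 127:64) : arg2 (bits 63:0). */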
3474 return dst;
3477 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3478 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3479 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3480 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3481 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3482 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3483 do_32Fx4:
3485 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3486 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3487 HReg dst = newVRegV(env);
3488 addInstr(env, mk_vMOVsd_RR(argL, dst));
3489 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3490 return dst;
3493 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3494 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3495 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3496 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3497 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3498 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3499 do_64Fx2:
3501 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3502 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3503 HReg dst = newVRegV(env);
3504 addInstr(env, mk_vMOVsd_RR(argL, dst));
3505 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3506 return dst;
3509 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3510 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3511 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3512 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3513 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3514 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3515 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3516 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3517 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3518 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3519 do_32F0x4: {
3520 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3521 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3522 HReg dst = newVRegV(env);
3523 addInstr(env, mk_vMOVsd_RR(argL, dst));
3524 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3525 return dst;
3528 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3529 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3530 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3531 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3532 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3533 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3534 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3535 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3536 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3537 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3538 do_64F0x2: {
3539 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3540 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3541 HReg dst = newVRegV(env);
3542 addInstr(env, mk_vMOVsd_RR(argL, dst));
3543 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3544 return dst;
3547 case Iop_PermOrZero8x16:
3548 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3549 op = Asse_PSHUFB;
3550 goto do_SseReRg;
3552 // Otherwise we'll have to generate a call to
3553 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3554 // host which doesn't have SSSE3, in which case we don't expect this
3555 // IROp to enter the compilation pipeline in the first place.
3556 break;
3558 case Iop_PwExtUSMulQAdd8x16:
3559 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3560 op = Asse_PMADDUBSW;
3561 goto do_SseReRg;
3563 break;
3565 case Iop_QNarrowBin32Sto16Sx8:
3566 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3567 case Iop_QNarrowBin16Sto8Sx16:
3568 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3569 case Iop_QNarrowBin16Sto8Ux16:
3570 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3572 case Iop_InterleaveHI8x16:
3573 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3574 case Iop_InterleaveHI16x8:
3575 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3576 case Iop_InterleaveHI32x4:
3577 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3578 case Iop_InterleaveHI64x2:
3579 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3581 case Iop_InterleaveLO8x16:
3582 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3583 case Iop_InterleaveLO16x8:
3584 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3585 case Iop_InterleaveLO32x4:
3586 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3587 case Iop_InterleaveLO64x2:
3588 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3590 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3591 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3592 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3593 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3594 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3595 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3596 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3597 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3598 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3599 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3600 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3601 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3602 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3603 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3604 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3605 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3606 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3607 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3608 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3609 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3610 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3611 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3612 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3613 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3614 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3615 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3616 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3617 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3618 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3619 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3620 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3621 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3622 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3623 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3624 do_SseReRg: {
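/* All of these map to a destructive two-operand SSE2 op,
   dst := dst `op` E-reg, so one argument is copied into dst first
   and the other supplied as the E operand.  For the cases tagged
   arg1isEReg the IR operand order is (seemingly) the reverse of
   the instruction's G/E order, so arg2 is copied into dst and
   arg1 presented as E; otherwise it's the other way round. */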
3625 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3626 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3627 HReg dst = newVRegV(env);
3628 if (arg1isEReg) {
3629 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3630 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3631 } else {
3632 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3633 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3635 return dst;
3638 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3639 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3640 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3641 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3642 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3643 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3644 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3645 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3646 do_SseShift: {
3647 HReg dst = newVRegV(env);
3648 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3649 /* If it's a shift by an in-range immediate, generate a single
3650 instruction. */
3651 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3652 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3653 vassert(c->tag == Ico_U8);
3654 UInt shift = c->Ico.U8;
3655 if (shift < laneBits) {
3656 addInstr(env, mk_vMOVsd_RR(greg, dst));
3657 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3658 return dst;
3661 /* Otherwise we have to do it the longwinded way. */
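/* That is: build a 16-byte value on the stack whose low 64 bits are
   the (possibly non-constant) shift amount and whose high 64 bits
   are zero, load it into an XMM register, and use the register form
   of the shift, which takes its count from the low quadword of the
   E operand.  The two pushes below give exactly that layout: count
   at (%rsp), zero at 8(%rsp). */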
3662 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3663 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3664 HReg ereg = newVRegV(env);
3665 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3666 addInstr(env, AMD64Instr_Push(rmi));
3667 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3668 addInstr(env, mk_vMOVsd_RR(greg, dst));
3669 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3670 add_to_rsp(env, 16);
3671 return dst;
3674 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3675 goto do_SseAssistedBinary;
3676 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3677 goto do_SseAssistedBinary;
3678 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3679 goto do_SseAssistedBinary;
3680 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3681 goto do_SseAssistedBinary;
3682 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3683 goto do_SseAssistedBinary;
3684 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3685 goto do_SseAssistedBinary;
3686 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3687 goto do_SseAssistedBinary;
3688 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3689 goto do_SseAssistedBinary;
3690 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3691 goto do_SseAssistedBinary;
3692 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3693 goto do_SseAssistedBinary;
3694 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3695 goto do_SseAssistedBinary;
3696 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3697 goto do_SseAssistedBinary;
3698 case Iop_QNarrowBin32Sto16Ux8:
3699 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3700 goto do_SseAssistedBinary;
3701 case Iop_NarrowBin16to8x16:
3702 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3703 goto do_SseAssistedBinary;
3704 case Iop_NarrowBin32to16x8:
3705 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3706 goto do_SseAssistedBinary;
3707 do_SseAssistedBinary: {
3708 /* RRRufff! RRRufff code is what we're generating here. Oh
3709 well. */
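/* Scheme: carve a 16-aligned scratch block out of the stack and
   call the generic helper with three pointers in the SysV argument
   registers %rdi/%rsi/%rdx, evidently following the convention
   fn(result, argL, argR): result at argp+0, argL at argp+16, argR
   at argp+32.  The operands are spilled to memory, the helper
   writes the result slot, and the result is reloaded into an XMM
   register afterwards. */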
3710 vassert(fn != 0);
3711 HReg dst = newVRegV(env);
3712 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3713 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3714 HReg argp = newVRegI(env);
3715 /* subq $112, %rsp -- make a space */
3716 sub_from_rsp(env, 112);
3717 /* leaq 48(%rsp), %r_argp -- point into it */
3718 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3719 argp));
3720 /* andq $-16, %r_argp -- 16-align the pointer */
3721 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3722 AMD64RMI_Imm( ~(UInt)15 ),
3723 argp));
3724 /* Prepare 3 arg regs:
3725 leaq 0(%r_argp), %rdi
3726 leaq 16(%r_argp), %rsi
3727 leaq 32(%r_argp), %rdx
3728 */
3729 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3730 hregAMD64_RDI()));
3731 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3732 hregAMD64_RSI()));
3733 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3734 hregAMD64_RDX()));
3735 /* Store the two args, at (%rsi) and (%rdx):
3736 movupd %argL, 0(%rsi)
3737 movupd %argR, 0(%rdx)
3738 */
3739 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3740 AMD64AMode_IR(0, hregAMD64_RSI())));
3741 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3742 AMD64AMode_IR(0, hregAMD64_RDX())));
3743 /* call the helper */
3744 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3745 3, mk_RetLoc_simple(RLPri_None) ));
3746 /* fetch the result from memory, using %r_argp, which the
3747 register allocator will keep alive across the call. */
3748 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3749 AMD64AMode_IR(0, argp)));
3750 /* and finally, clear the space */
3751 add_to_rsp(env, 112);
3752 return dst;
3755 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3756 goto do_SseAssistedVectorAndScalar;
3757 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3758 goto do_SseAssistedVectorAndScalar;
3759 do_SseAssistedVectorAndScalar: {
3760 /* RRRufff! RRRufff code is what we're generating here. Oh
3761 well. */
3762 vassert(fn != 0);
3763 HReg dst = newVRegV(env);
3764 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3765 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3766 HReg argp = newVRegI(env);
3767 /* subq $112, %rsp -- make a space */
3768 sub_from_rsp(env, 112);
3769 /* leaq 48(%rsp), %r_argp -- point into it */
3770 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3771 argp));
3772 /* andq $-16, %r_argp -- 16-align the pointer */
3773 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3774 AMD64RMI_Imm( ~(UInt)15 ),
3775 argp));
3776 /* Prepare 2 vector arg regs:
3777 leaq 0(%r_argp), %rdi
3778 leaq 16(%r_argp), %rsi
3779 */
3780 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3781 hregAMD64_RDI()));
3782 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3783 hregAMD64_RSI()));
3784 /* Store the vector arg, at (%rsi):
3785 movupd %argL, 0(%rsi)
3786 */
3787 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3788 AMD64AMode_IR(0, hregAMD64_RSI())));
3789 /* And get the scalar value into rdx */
3790 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3792 /* call the helper */
3793 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3794 3, mk_RetLoc_simple(RLPri_None) ));
3795 /* fetch the result from memory, using %r_argp, which the
3796 register allocator will keep alive across the call. */
3797 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3798 AMD64AMode_IR(0, argp)));
3799 /* and finally, clear the space */
3800 add_to_rsp(env, 112);
3801 return dst;
3804 case Iop_I32StoF32x4:
3805 case Iop_F32toI32Sx4: {
3806 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3807 HReg dst = newVRegV(env);
3808 AMD64SseOp mop
3809 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3810 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3811 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3812 set_SSE_rounding_default(env);
3813 return dst;
3816 // Half-float vector conversion
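// The F32toF16x8 case below converts each 128-bit half of the V256
// source with the F16C op (vcvtps2ph-style), leaving 4 half-precision
// results in bits 63:0 of each destination register.  The shift/OR
// sequence afterwards packs those two 64-bit chunks into one V128:
// the hi chunk is shifted up to bits 127:64 and OR-ed over the lo
// chunk (whose upper half is explicitly cleared first).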
3817 case Iop_F32toF16x8: {
3818 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3819 HReg srcHi, srcLo;
3820 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3821 HReg dstHi = newVRegV(env);
3822 HReg dstLo = newVRegV(env);
3823 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3824 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3825 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3826 set_SSE_rounding_default(env);
3827 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3828 // need to compact all that into one register. There's probably a
3829 // more elegant way to do this, but ..
3830 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3831 // dstHi is now 127:64 = useful data, 63:0 = zero
3832 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3833 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3834 // dstLo is now 127:64 = zero, 63:0 = useful data
3835 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3836 return dstLo;
3838 break;
3841 default:
3842 break;
3843 } /* switch (e->Iex.Binop.op) */
3844 } /* if (e->tag == Iex_Binop) */
3846 if (e->tag == Iex_Triop) {
3847 IRTriop *triop = e->Iex.Triop.details;
3848 switch (triop->op) {
3850 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3851 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3852 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3853 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3854 do_64Fx2_w_rm:
3856 HReg argL = iselVecExpr(env, triop->arg2);
3857 HReg argR = iselVecExpr(env, triop->arg3);
3858 HReg dst = newVRegV(env);
3859 addInstr(env, mk_vMOVsd_RR(argL, dst));
3860 /* XXXROUNDINGFIXME */
3861 /* set roundingmode here */
3862 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3863 return dst;
3866 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3867 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3868 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3869 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3870 do_32Fx4_w_rm:
3872 HReg argL = iselVecExpr(env, triop->arg2);
3873 HReg argR = iselVecExpr(env, triop->arg3);
3874 HReg dst = newVRegV(env);
3875 addInstr(env, mk_vMOVsd_RR(argL, dst));
3876 /* XXXROUNDINGFIXME */
3877 /* set roundingmode here */
3878 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3879 return dst;
3882 default:
3883 break;
3884 } /* switch (triop->op) */
3885 } /* if (e->tag == Iex_Triop) */
3887 if (e->tag == Iex_ITE) { // VFD
3888 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3889 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3890 HReg dst = newVRegV(env);
3891 addInstr(env, mk_vMOVsd_RR(r1,dst));
3892 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3893 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
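/* The condition codes follow the usual x86 encoding, in which
   flipping bit 0 negates a condition; so SseCMov with (cc ^ 1)
   overwrites the preloaded iftrue value with r0 exactly when the
   ITE condition is false. */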
3894 return dst;
3897 //vec_fail:
3898 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3899 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3900 ppIRExpr(e);
3901 vpanic("iselVecExpr_wrk");
3905 /*---------------------------------------------------------*/
3906 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3907 /*---------------------------------------------------------*/
3909 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3910 ISelEnv* env, const IRExpr* e )
3912 iselDVecExpr_wrk( rHi, rLo, env, e );
3913 # if 0
3914 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3915 # endif
3916 vassert(hregClass(*rHi) == HRcVec128);
3917 vassert(hregClass(*rLo) == HRcVec128);
3918 vassert(hregIsVirtual(*rHi));
3919 vassert(hregIsVirtual(*rLo));
3923 /* DO NOT CALL THIS DIRECTLY */
3924 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3925 ISelEnv* env, const IRExpr* e )
3927 HWord fn = 0; /* address of helper fn, if required */
3928 vassert(e);
3929 IRType ty = typeOfIRExpr(env->type_env, e);
3930 vassert(ty == Ity_V256);
3931 UInt laneBits = 0;
3933 AMD64SseOp op = Asse_INVALID;
3935 /* read 256-bit IRTemp */
3936 if (e->tag == Iex_RdTmp) {
3937 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3938 return;
3941 if (e->tag == Iex_Get) {
3942 HReg vHi = newVRegV(env);
3943 HReg vLo = newVRegV(env);
3944 HReg rbp = hregAMD64_RBP();
3945 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3946 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3947 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3948 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3949 *rHi = vHi;
3950 *rLo = vLo;
3951 return;
3954 if (e->tag == Iex_Load) {
3955 HReg vHi = newVRegV(env);
3956 HReg vLo = newVRegV(env);
3957 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3958 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3959 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3960 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3961 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3962 *rHi = vHi;
3963 *rLo = vLo;
3964 return;
3967 if (e->tag == Iex_Const) {
3968 vassert(e->Iex.Const.con->tag == Ico_V256);
3969 switch (e->Iex.Const.con->Ico.V256) {
3970 case 0x00000000: {
3971 HReg vHi = generate_zeroes_V128(env);
3972 HReg vLo = newVRegV(env);
3973 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3974 *rHi = vHi;
3975 *rLo = vLo;
3976 return;
3978 default:
3979 break; /* give up, until such time as handling other constants becomes necessary. */
3983 if (e->tag == Iex_Unop) {
3984 switch (e->Iex.Unop.op) {
3986 case Iop_NotV256: {
3987 HReg argHi, argLo;
3988 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3989 *rHi = do_sse_NotV128(env, argHi);
3990 *rLo = do_sse_NotV128(env, argLo);
3991 return;
3994 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
3995 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
3996 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3997 do_32Fx8_unary:
3999 HReg argHi, argLo;
4000 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4001 HReg dstHi = newVRegV(env);
4002 HReg dstLo = newVRegV(env);
4003 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4004 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4005 *rHi = dstHi;
4006 *rLo = dstLo;
4007 return;
4010 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4011 do_64Fx4_unary:
4013 HReg argHi, argLo;
4014 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4015 HReg dstHi = newVRegV(env);
4016 HReg dstLo = newVRegV(env);
4017 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4018 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4019 *rHi = dstHi;
4020 *rLo = dstLo;
4021 return;
4024 case Iop_CmpNEZ64x4: {
4025 /* We can use SSE2 instructions for this. */
4026 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4027 (obviously). See comment on Iop_CmpNEZ64x2 for
4028 explanation of what's going on here. */
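/* In short: compare every 32-bit lane against zero, invert, and
   then OR each lane with its partner (the 0xB1 shuffle swaps the
   two 32-bit halves of every 64-bit lane), so each 64-bit lane of
   the result is all ones iff either of its halves was nonzero. */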
4029 HReg argHi, argLo;
4030 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4031 HReg tmpHi = generate_zeroes_V128(env);
4032 HReg tmpLo = newVRegV(env);
4033 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4034 HReg dstHi = newVRegV(env);
4035 HReg dstLo = newVRegV(env);
4036 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4037 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4038 tmpHi = do_sse_NotV128(env, tmpHi);
4039 tmpLo = do_sse_NotV128(env, tmpLo);
4040 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4041 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4042 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4043 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4044 *rHi = dstHi;
4045 *rLo = dstLo;
4046 return;
4049 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4050 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4051 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4052 do_CmpNEZ_vector:
4054 HReg argHi, argLo;
4055 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4056 HReg tmpHi = newVRegV(env);
4057 HReg tmpLo = newVRegV(env);
4058 HReg zero = generate_zeroes_V128(env);
4059 HReg dstHi, dstLo;
4060 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4061 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4062 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4063 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4064 dstHi = do_sse_NotV128(env, tmpHi);
4065 dstLo = do_sse_NotV128(env, tmpLo);
4066 *rHi = dstHi;
4067 *rLo = dstLo;
4068 return;
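/* F16toF32x8: with F16C, the 8 half-precision values in the V128
   source are converted in two goes -- the low four (bits 63:0)
   straight into dstLo, then the source copy is shifted right by 64
   bits so the high four can be converted into dstHi the same way. */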
4071 case Iop_F16toF32x8: {
4072 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4073 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4074 HReg srcCopy = newVRegV(env);
4075 HReg dstHi = newVRegV(env);
4076 HReg dstLo = newVRegV(env);
4077 // Copy src, since we'll need to modify it.
4078 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4079 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4080 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4081 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4082 *rHi = dstHi;
4083 *rLo = dstLo;
4084 return;
4086 break;
4089 default:
4090 break;
4091 } /* switch (e->Iex.Unop.op) */
4092 } /* if (e->tag == Iex_Unop) */
4094 if (e->tag == Iex_Binop) {
4095 switch (e->Iex.Binop.op) {
4097 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4098 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4099 do_64Fx4:
4101 HReg argLhi, argLlo, argRhi, argRlo;
4102 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4103 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4104 HReg dstHi = newVRegV(env);
4105 HReg dstLo = newVRegV(env);
4106 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4107 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4108 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4109 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4110 *rHi = dstHi;
4111 *rLo = dstLo;
4112 return;
4115 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4116 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4117 do_32Fx8:
4119 HReg argLhi, argLlo, argRhi, argRlo;
4120 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4121 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4122 HReg dstHi = newVRegV(env);
4123 HReg dstLo = newVRegV(env);
4124 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4125 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4126 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4127 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4128 *rHi = dstHi;
4129 *rLo = dstLo;
4130 return;
4133 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4134 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4135 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4136 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4137 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4138 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4139 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4140 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4141 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4142 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4143 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4144 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4145 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4146 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4147 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4148 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4149 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4150 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4151 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4152 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4153 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4154 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4155 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4156 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4157 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4158 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4159 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4160 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4161 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4162 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4163 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4164 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4165 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4166 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4167 do_SseReRg:
4169 HReg argLhi, argLlo, argRhi, argRlo;
4170 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4171 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4172 HReg dstHi = newVRegV(env);
4173 HReg dstLo = newVRegV(env);
4174 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4175 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4176 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4177 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4178 *rHi = dstHi;
4179 *rLo = dstLo;
4180 return;
4183 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4184 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4185 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4186 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4187 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4188 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4189 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4190 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4191 do_SseShift: {
4192 HReg dstHi = newVRegV(env);
4193 HReg dstLo = newVRegV(env);
4194 HReg gregHi, gregLo;
4195 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4196 /* If it's a shift by an in-range immediate, generate a single
4197    instruction for each half. */
4198 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4199 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4200 vassert(c->tag == Ico_U8);
4201 UInt shift = c->Ico.U8;
4202 if (shift < laneBits) {
4203 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4204 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4205 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4206 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4207 *rHi = dstHi;
4208 *rLo = dstLo;
4209 return;
4212 /* Otherwise we have to do it the longwinded way. */
4213 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4214 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4215 HReg ereg = newVRegV(env);
4216 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4217 addInstr(env, AMD64Instr_Push(rmi));
4218 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4219 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4220 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4221 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4222 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4223 add_to_rsp(env, 16);
4224 *rHi = dstHi;
4225 *rLo = dstLo;
4226 return;
4229 case Iop_V128HLtoV256: {
4230 // Curiously, there doesn't seem to be any benefit to be had here by
4231 // checking whether arg1 and arg2 are the same, in the style of how
4232 // (eg) 64HLtoV128 is handled elsewhere in this file.
4233 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4234 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4235 return;
4238 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4239 goto do_SseAssistedBinary;
4240 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4241 goto do_SseAssistedBinary;
4242 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4243 goto do_SseAssistedBinary;
4244 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4245 goto do_SseAssistedBinary;
4246 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4247 goto do_SseAssistedBinary;
4248 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4249 goto do_SseAssistedBinary;
4250 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4251 goto do_SseAssistedBinary;
4252 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4253 goto do_SseAssistedBinary;
4254 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4255 goto do_SseAssistedBinary;
4256 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4257 goto do_SseAssistedBinary;
4258 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4259 goto do_SseAssistedBinary;
4260 do_SseAssistedBinary: {
4261 /* RRRufff! RRRufff code is what we're generating here. Oh
4262 well. */
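/* Same scheme as the 128-bit do_SseAssistedBinary above, except
   that the 256-bit op is lowered as two independent calls to the
   same 128-bit helper: the first call uses the block at argp+0,
   +16, +32 for the high halves, the second the block at argp+48,
   +64, +80 for the low halves. */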
4263 vassert(fn != 0);
4264 HReg dstHi = newVRegV(env);
4265 HReg dstLo = newVRegV(env);
4266 HReg argLhi, argLlo, argRhi, argRlo;
4267 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4268 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4269 HReg argp = newVRegI(env);
4270 /* subq $160, %rsp -- make a space */
4271 sub_from_rsp(env, 160);
4272 /* leaq 48(%rsp), %r_argp -- point into it */
4273 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4274 argp));
4275 /* andq $-16, %r_argp -- 16-align the pointer */
4276 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4277 AMD64RMI_Imm( ~(UInt)15 ),
4278 argp));
4279 /* Prepare 3 arg regs:
4280 leaq 0(%r_argp), %rdi
4281 leaq 16(%r_argp), %rsi
4282 leaq 32(%r_argp), %rdx
4283 */
4284 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4285 hregAMD64_RDI()));
4286 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4287 hregAMD64_RSI()));
4288 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4289 hregAMD64_RDX()));
4290 /* Store the two high args, at (%rsi) and (%rdx):
4291 movupd %argLhi, 0(%rsi)
4292 movupd %argRhi, 0(%rdx)
4293 */
4294 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4295 AMD64AMode_IR(0, hregAMD64_RSI())));
4296 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4297 AMD64AMode_IR(0, hregAMD64_RDX())));
4298 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4299 movupd %argLlo, 48(%rsi)
4300 movupd %argRlo, 48(%rdx)
4301 */
4302 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4303 AMD64AMode_IR(48, hregAMD64_RSI())));
4304 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4305 AMD64AMode_IR(48, hregAMD64_RDX())));
4306 /* call the helper */
4307 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4308 mk_RetLoc_simple(RLPri_None) ));
4309 /* Prepare 3 arg regs:
4310 leaq 48(%r_argp), %rdi
4311 leaq 64(%r_argp), %rsi
4312 leaq 80(%r_argp), %rdx
4313 */
4314 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4315 hregAMD64_RDI()));
4316 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4317 hregAMD64_RSI()));
4318 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4319 hregAMD64_RDX()));
4320 /* call the helper */
4321 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4322 mk_RetLoc_simple(RLPri_None) ));
4323 /* fetch the result from memory, using %r_argp, which the
4324 register allocator will keep alive across the call. */
4325 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4326 AMD64AMode_IR(0, argp)));
4327 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4328 AMD64AMode_IR(48, argp)));
4329 /* and finally, clear the space */
4330 add_to_rsp(env, 160);
4331 *rHi = dstHi;
4332 *rLo = dstLo;
4333 return;
4336 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4337 goto do_SseAssistedBinary256;
4338 do_SseAssistedBinary256: {
4339 /* RRRufff! RRRufff code is what we're generating here. Oh
4340 well. */
4341 vassert(fn != 0);
4342 HReg dstHi = newVRegV(env);
4343 HReg dstLo = newVRegV(env);
4344 HReg argLhi, argLlo, argRhi, argRlo;
4345 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4346 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4347 HReg argp = newVRegI(env);
4348 /* subq $160, %rsp -- make a space */
4349 sub_from_rsp(env, 160);
4350 /* leaq 48(%rsp), %r_argp -- point into it */
4351 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4352 argp));
4353 /* andq $-16, %r_argp -- 16-align the pointer */
4354 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4355 AMD64RMI_Imm( ~(UInt)15 ),
4356 argp));
4357 /* Prepare 3 arg regs:
4358 leaq 0(%r_argp), %rdi
4359 leaq 32(%r_argp), %rsi
4360 leaq 64(%r_argp), %rdx
4361 */
4362 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4363 hregAMD64_RDI()));
4364 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4365 hregAMD64_RSI()));
4366 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4367 hregAMD64_RDX()));
4368 /* Store the two args, at (%rsi) and (%rdx):
4369 movupd %argLlo, 0(%rsi)
4370 movupd %argLhi, 16(%rsi)
4371 movupd %argRlo, 0(%rdx)
4372 movupd %argRhi, 16(%rdx)
4373 */
4374 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4375 AMD64AMode_IR(0, hregAMD64_RSI())));
4376 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4377 AMD64AMode_IR(16, hregAMD64_RSI())));
4378 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4379 AMD64AMode_IR(0, hregAMD64_RDX())));
4380 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4381 AMD64AMode_IR(16, hregAMD64_RDX())));
4382 /* call the helper */
4383 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4384 mk_RetLoc_simple(RLPri_None) ));
4385 /* fetch the result from memory, using %r_argp, which the
4386 register allocator will keep alive across the call. */
4387 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4388 AMD64AMode_IR(0, argp)));
4389 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4390 AMD64AMode_IR(16, argp)));
4391 /* and finally, clear the space */
4392 add_to_rsp(env, 160);
4393 *rHi = dstHi;
4394 *rLo = dstLo;
4395 return;
4398 case Iop_I32StoF32x8:
4399 case Iop_F32toI32Sx8: {
4400 HReg argHi, argLo;
4401 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4402 HReg dstHi = newVRegV(env);
4403 HReg dstLo = newVRegV(env);
4404 AMD64SseOp mop
4405 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4406 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4407 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4408 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4409 set_SSE_rounding_default(env);
4410 *rHi = dstHi;
4411 *rLo = dstLo;
4412 return;
4415 default:
4416 break;
4417 } /* switch (e->Iex.Binop.op) */
4418 } /* if (e->tag == Iex_Binop) */
4420 if (e->tag == Iex_Triop) {
4421 IRTriop *triop = e->Iex.Triop.details;
4422 switch (triop->op) {
4424 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4425 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4426 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4427 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4428 do_64Fx4_w_rm:
4430 HReg argLhi, argLlo, argRhi, argRlo;
4431 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4432 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4433 HReg dstHi = newVRegV(env);
4434 HReg dstLo = newVRegV(env);
4435 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4436 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4437 /* XXXROUNDINGFIXME */
4438 /* set roundingmode here */
4439 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4440 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4441 *rHi = dstHi;
4442 *rLo = dstLo;
4443 return;
4446 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4447 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4448 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4449 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4450 do_32Fx8_w_rm:
4452 HReg argLhi, argLlo, argRhi, argRlo;
4453 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4454 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4455 HReg dstHi = newVRegV(env);
4456 HReg dstLo = newVRegV(env);
4457 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4458 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4459 /* XXXROUNDINGFIXME */
4460 /* set roundingmode here */
4461 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4462 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4463 *rHi = dstHi;
4464 *rLo = dstLo;
4465 return;
4468 default:
4469 break;
4470 } /* switch (triop->op) */
4471 } /* if (e->tag == Iex_Triop) */
4474 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4475 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4476 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4477 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4478 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4479 // If the args are trivially the same (tmp or const), use the same
4480 // source register for all four, and only one movq since those are
4481 // (relatively) expensive.
4482 if (areAtomsAndEqual(arg1, arg2)
4483 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4484 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4485 HReg tmp = newVRegV(env);
4486 HReg dst = newVRegV(env);
4487 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4488 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4489 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4490 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4491 *rHi = dst;
4492 *rLo = dst;
4493 } else {
4494 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4495 HReg q3 = iselIntExpr_R(env, arg1);
4496 HReg q2 = iselIntExpr_R(env, arg2);
4497 HReg q1 = iselIntExpr_R(env, arg3);
4498 HReg q0 = iselIntExpr_R(env, arg4);
4499 HReg tmp = newVRegV(env);
4500 HReg dstHi = newVRegV(env);
4501 HReg dstLo = newVRegV(env);
4502 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4503 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4504 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4505 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4506 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4507 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4508 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4509 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4510 *rHi = dstHi;
4511 *rLo = dstLo;
4513 return;
4516 if (e->tag == Iex_ITE) {
4517 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4518 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4519 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4520 HReg dstHi = newVRegV(env);
4521 HReg dstLo = newVRegV(env);
4522 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4523 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4524 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4525 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4526 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4527 *rHi = dstHi;
4528 *rLo = dstLo;
4529 return;
4532 //avx_fail:
4533 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4534 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4535 ppIRExpr(e);
4536 vpanic("iselDVecExpr_wrk");
4540 /*---------------------------------------------------------*/
4541 /*--- ISEL: Statements ---*/
4542 /*---------------------------------------------------------*/
4544 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4546 if (vex_traceflags & VEX_TRACE_VCODE) {
4547 vex_printf("\n-- ");
4548 ppIRStmt(stmt);
4549 vex_printf("\n");
4552 switch (stmt->tag) {
4554 /* --------- LOADG (guarded load) --------- */
4555 case Ist_LoadG: {
4556 IRLoadG* lg = stmt->Ist.LoadG.details;
4557 if (lg->end != Iend_LE)
4558 goto stmt_fail;
4560 UChar szB = 0; /* invalid */
4561 switch (lg->cvt) {
4562 case ILGop_Ident32: szB = 4; break;
4563 case ILGop_Ident64: szB = 8; break;
4564 case ILGop_IdentV128: szB = 16; break;
4565 default: break;
4567 if (szB == 0)
4568 goto stmt_fail;
4570 AMD64AMode* amAddr
4571 = iselIntExpr_AMode(env, lg->addr);
4572 HReg rAlt
4573 = szB == 16 ? iselVecExpr(env, lg->alt)
4574 : iselIntExpr_R(env, lg->alt);
4575 HReg rDst
4576 = lookupIRTemp(env, lg->dst);
4578 /* Get the alt value into the dst. We'll do a conditional load
4579 which overwrites it -- or not -- with loaded data. */
4580 if (szB == 16) {
4581 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4582 } else {
4583 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4585 AMD64CondCode cc = iselCondCode(env, lg->guard);
4586 if (szB == 16) {
4587 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4588 } else {
4589 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4591 return;
4594 /* --------- STOREG (guarded store) --------- */
4595 case Ist_StoreG: {
4596 IRStoreG* sg = stmt->Ist.StoreG.details;
4597 if (sg->end != Iend_LE)
4598 goto stmt_fail;
4600 UChar szB = 0; /* invalid */
4601 switch (typeOfIRExpr(env->type_env, sg->data)) {
4602 case Ity_I32: szB = 4; break;
4603 case Ity_I64: szB = 8; break;
4604 case Ity_V128: szB = 16; break;
4605 default: break;
4607 if (szB == 0)
4608 goto stmt_fail;
4610 AMD64AMode* amAddr
4611 = iselIntExpr_AMode(env, sg->addr);
4612 HReg rSrc
4613 = szB == 16 ? iselVecExpr(env, sg->data)
4614 : iselIntExpr_R(env, sg->data);
4615 AMD64CondCode cc
4616 = iselCondCode(env, sg->guard);
4617 if (szB == 16) {
4618 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4619 } else {
4620 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4622 return;
4625 /* --------- STORE --------- */
4626 case Ist_Store: {
4627 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4628 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4629 IREndness end = stmt->Ist.Store.end;
4631 if (tya != Ity_I64 || end != Iend_LE)
4632 goto stmt_fail;
4634 if (tyd == Ity_I64) {
4635 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4636 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4637 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4638 return;
4640 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4641 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4642 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4643 addInstr(env, AMD64Instr_Store(
4644 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4645 r,am));
4646 return;
4648 if (tyd == Ity_F64) {
4649 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4650 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4651 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4652 return;
4654 if (tyd == Ity_F32) {
4655 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4656 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4657 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4658 return;
4660 if (tyd == Ity_V128) {
4661 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4662 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4663 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4664 return;
4666 if (tyd == Ity_V256) {
4667 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4668 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4669 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4670 HReg vHi, vLo;
4671 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4672 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4673 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4674 return;
4676 break;
4679 /* --------- PUT --------- */
4680 case Ist_Put: {
4681 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4682 if (ty == Ity_I64) {
4683 /* We're going to write to memory, so compute the RHS into an
4684 AMD64RI. */
4685 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4686 addInstr(env,
4687 AMD64Instr_Alu64M(
4688 Aalu_MOV,
4689 ri,
4690 AMD64AMode_IR(stmt->Ist.Put.offset,
4691 hregAMD64_RBP())
4692 ));
4693 return;
4695 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4696 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4697 addInstr(env, AMD64Instr_Store(
4698 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4699 r,
4700 AMD64AMode_IR(stmt->Ist.Put.offset,
4701 hregAMD64_RBP())));
4702 return;
4704 if (ty == Ity_F32) {
4705 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4706 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4707 set_SSE_rounding_default(env); /* paranoia */
4708 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4709 return;
4711 if (ty == Ity_F64) {
4712 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4713 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4714 hregAMD64_RBP() );
4715 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4716 return;
4718 if (ty == Ity_V128) {
4719 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4720 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4721 hregAMD64_RBP());
4722 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4723 return;
4725 if (ty == Ity_V256) {
4726 HReg vHi, vLo;
4727 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4728 HReg rbp = hregAMD64_RBP();
4729 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4730 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4731 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4732 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4733 return;
4735 break;
4738 /* --------- Indexed PUT --------- */
4739 case Ist_PutI: {
4740 IRPutI *puti = stmt->Ist.PutI.details;
4742 AMD64AMode* am
4743 = genGuestArrayOffset(
4744 env, puti->descr,
4745 puti->ix, puti->bias );
4747 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4748 if (ty == Ity_F64) {
4749 HReg val = iselDblExpr(env, puti->data);
4750 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4751 return;
4753 if (ty == Ity_I8) {
4754 HReg r = iselIntExpr_R(env, puti->data);
4755 addInstr(env, AMD64Instr_Store( 1, r, am ));
4756 return;
4758 if (ty == Ity_I64) {
4759 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4760 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4761 return;
4763 break;
4766 /* --------- TMP --------- */
4767 case Ist_WrTmp: {
4768 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4769 IRType ty = typeOfIRTemp(env->type_env, tmp);
4771 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4772 compute it into an AMode and then use LEA. This usually
4773 produces fewer instructions, often because (for memcheck-
4774 created IR) we get t = address-expression, with t later used
4775 twice, and so doing this naturally turns the address expression
4776 back into an AMD64 amode. */
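/* For instance an expression like Add64(t1, Shl64(t2, 3)) can
   typically be selected as a single "leaq (%t1,%t2,8), %dst"
   instead of a shift followed by an add. */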
4777 if (ty == Ity_I64
4778 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4779 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4780 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4781 HReg dst = lookupIRTemp(env, tmp);
4782 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4783 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4784 value into a register. Just emit a normal reg-reg move
4785 so reg-alloc can coalesce it away in the usual way. */
4786 HReg src = am->Aam.IR.reg;
4787 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4788 } else {
4789 addInstr(env, AMD64Instr_Lea64(am,dst));
4791 return;
4794 if (ty == Ity_I64 || ty == Ity_I32
4795 || ty == Ity_I16 || ty == Ity_I8) {
4796 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4797 HReg dst = lookupIRTemp(env, tmp);
4798 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4799 return;
4801 if (ty == Ity_I128) {
4802 HReg rHi, rLo, dstHi, dstLo;
4803 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4804 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4805 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4806 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4807 return;
4809 if (ty == Ity_I1) {
4810 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4811 HReg dst = lookupIRTemp(env, tmp);
4812 addInstr(env, AMD64Instr_Set64(cond, dst));
4813 return;
4815 if (ty == Ity_F64) {
4816 HReg dst = lookupIRTemp(env, tmp);
4817 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4818 addInstr(env, mk_vMOVsd_RR(src, dst));
4819 return;
4821 if (ty == Ity_F32) {
4822 HReg dst = lookupIRTemp(env, tmp);
4823 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4824 addInstr(env, mk_vMOVsd_RR(src, dst));
4825 return;
4827 if (ty == Ity_V128) {
4828 HReg dst = lookupIRTemp(env, tmp);
4829 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4830 addInstr(env, mk_vMOVsd_RR(src, dst));
4831 return;
4833 if (ty == Ity_V256) {
4834 HReg rHi, rLo, dstHi, dstLo;
4835 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4836 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4837 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4838 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4839 return;
4841 break;
4844 /* --------- Call to DIRTY helper --------- */
4845 case Ist_Dirty: {
4846 IRDirty* d = stmt->Ist.Dirty.details;
4848 /* Figure out the return type, if any. */
4849 IRType retty = Ity_INVALID;
4850 if (d->tmp != IRTemp_INVALID)
4851 retty = typeOfIRTemp(env->type_env, d->tmp);
4853 /* Throw out any return types we don't know about. */
4854 Bool retty_ok = False;
4855 switch (retty) {
4856 case Ity_INVALID: /* function doesn't return anything */
4857 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4858 case Ity_V128: case Ity_V256:
4859 retty_ok = True; break;
4860 default:
4861 break;
4863 if (!retty_ok)
4864 break; /* will go to stmt_fail: */
4866 /* Marshal args, do the call, and set the return value to
4867 0x555..555 if this is a conditional call that returns a value
4868 and the call is skipped. */
4869 UInt addToSp = 0;
4870 RetLoc rloc = mk_RetLoc_INVALID();
4871 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4872 vassert(is_sane_RetLoc(rloc));
4874 /* Now figure out what to do with the returned value, if any. */
4875 switch (retty) {
4876 case Ity_INVALID: {
4877 /* No return value. Nothing to do. */
4878 vassert(d->tmp == IRTemp_INVALID);
4879 vassert(rloc.pri == RLPri_None);
4880 vassert(addToSp == 0);
4881 return;
4883 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4884 /* The returned value is in %rax. Park it in the register
4885 associated with tmp. */
4886 vassert(rloc.pri == RLPri_Int);
4887 vassert(addToSp == 0);
4888 HReg dst = lookupIRTemp(env, d->tmp);
4889 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4890 return;
4892 case Ity_V128: {
4893 /* The returned value is on the stack, and rloc.spOff
4894 tells us where. Fish it off the stack and then move
4895 the stack pointer upwards to clear it, as directed by
4896 doHelperCall. */
4897 vassert(rloc.pri == RLPri_V128SpRel);
4898 vassert(addToSp >= 16);
4899 HReg dst = lookupIRTemp(env, d->tmp);
4900 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4901 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4902 add_to_rsp(env, addToSp);
4903 return;
4905 case Ity_V256: {
4906 /* See comments for Ity_V128. */
4907 vassert(rloc.pri == RLPri_V256SpRel);
4908 vassert(addToSp >= 32);
4909 HReg dstLo, dstHi;
4910 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4911 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4912 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4913 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4914 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4915 add_to_rsp(env, addToSp);
4916 return;
4918 default:
4919 /*NOTREACHED*/
4920 vassert(0);
4922 break;
4925 /* --------- MEM FENCE --------- */
4926 case Ist_MBE:
4927 switch (stmt->Ist.MBE.event) {
4928 case Imbe_Fence:
4929 addInstr(env, AMD64Instr_MFence());
4930 return;
4931 default:
4932 break;
4934 break;
4936 /* --------- ACAS --------- */
4937 case Ist_CAS:
4938 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4939 /* "normal" singleton CAS */
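/* The singleton CAS is lowered around an x86 compare-and-swap
   (presumably emitted by ACAS as a lock cmpxchg): the expected
   value is placed in %rax (and copied to rOld), the new value in
   %rbx.  If the swap fails (ZF clear) the value actually found in
   memory is left in %rax, and the CMov64 below copies it into
   rOld, so rOld always ends up holding the old memory value, as
   the IR semantics require. */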
4940 UChar sz;
4941 IRCAS* cas = stmt->Ist.CAS.details;
4942 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4943 /* get: cas->expd into %rax, and cas->data into %rbx */
4944 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4945 HReg rData = iselIntExpr_R(env, cas->dataLo);
4946 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4947 HReg rOld = lookupIRTemp(env, cas->oldLo);
4948 vassert(cas->expdHi == NULL);
4949 vassert(cas->dataHi == NULL);
4950 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4951 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4952 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4953 switch (ty) {
4954 case Ity_I64: sz = 8; break;
4955 case Ity_I32: sz = 4; break;
4956 case Ity_I16: sz = 2; break;
4957 case Ity_I8: sz = 1; break;
4958 default: goto unhandled_cas;
4960 addInstr(env, AMD64Instr_ACAS(am, sz));
4961 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4962 return;
4963 } else {
4964 /* double CAS */
4965 UChar sz;
4966 IRCAS* cas = stmt->Ist.CAS.details;
4967 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4968 /* only 32-bit and 64-bit allowed in this case */
4969 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4970 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4971 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4972 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4973 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4974 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4975 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4976 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4977 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4978 switch (ty) {
4979 case Ity_I64:
4980 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4981 goto unhandled_cas; /* we'd have to generate
4982 cmpxchg16b, but the host
4983 doesn't support that */
4984 sz = 8;
4985 break;
4986 case Ity_I32:
4987 sz = 4;
4988 break;
4989 default:
4990 goto unhandled_cas;
4992 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4993 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4994 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4995 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4996 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4997 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4998 addInstr(env, AMD64Instr_DACAS(am, sz));
4999 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5000 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5001 return;
5002 }
5003 unhandled_cas:
5004 break;
5006 /* --------- INSTR MARK --------- */
5007 /* Doesn't generate any executable code ... */
5008 case Ist_IMark:
5009 return;
5011 /* --------- ABI HINT --------- */
5012 /* These have no meaning (no denotation) in the IR, and so we ignore
5013 them ... if any actually made it this far. */
5014 case Ist_AbiHint:
5015 return;
5017 /* --------- NO-OP --------- */
5018 case Ist_NoOp:
5019 return;
5021 /* --------- EXIT --------- */
5022 case Ist_Exit: {
5023 if (stmt->Ist.Exit.dst->tag != Ico_U64)
5024 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5026 AMD64CondCode cc = iselCondCode(env, stmt->Ist.Exit.guard);
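/* (Editorial note: the guard is lowered to an amd64 condition code, and
   every exit instruction emitted below, XDirect or XAssisted, is made
   conditional on 'cc', so a false guard simply falls through to the
   next statement of the block.) */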
5027 AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
5028 hregAMD64_RBP());
5030 /* Case: boring transfer to known address */
5031 if (stmt->Ist.Exit.jk == Ijk_Boring) {
5032 if (env->chainingAllowed) {
5033 /* .. almost always true .. */
5034 /* Skip the event check at the dst if this is a forwards
5035 edge. */
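/* (Editorial note on why this is safe, as I understand the chaining
   scheme: event checks exist only to bound how long generated code can
   run before the dispatcher regains control, and any loop in guest
   code must contain at least one backwards edge.  So it suffices to
   perform the check on backwards edges, and a jump to an address above
   max_ga, being a forwards edge, may enter at the fast entry point,
   which skips the check.) */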
5036 Bool toFastEP
5037 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
5038 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
5039 addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
5040 amRIP, cc, toFastEP));
5041 } else {
5042 /* .. very occasionally .. */
5043 /* We can't use chaining, so ask for an assisted transfer,
5044 as that's the only alternative that is allowable. */
5045 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5046 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
5047 }
5048 return;
5049 }
5051 /* Case: assisted transfer to arbitrary address */
5052 switch (stmt->Ist.Exit.jk) {
5053 /* Keep this list in sync with that in iselNext below */
5054 case Ijk_ClientReq:
5055 case Ijk_EmWarn:
5056 case Ijk_NoDecode:
5057 case Ijk_NoRedir:
5058 case Ijk_SigSEGV:
5059 case Ijk_SigTRAP:
5060 case Ijk_Sys_syscall:
5061 case Ijk_Sys_int210:
5062 case Ijk_InvalICache:
5063 case Ijk_Yield:
5064 {
5065 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5066 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
5067 return;
5068 }
5069 default:
5070 break;
5071 }
5073 /* Do we ever expect to see any other kind? */
5074 goto stmt_fail;
5075 }
5077 default: break;
5078 }
5079 stmt_fail:
5080 ppIRStmt(stmt);
5081 vpanic("iselStmt(amd64)");
5082 }
5085 /*---------------------------------------------------------*/
5086 /*--- ISEL: Basic block terminators (Nexts) ---*/
5087 /*---------------------------------------------------------*/
5089 static void iselNext ( ISelEnv* env,
5090 IRExpr* next, IRJumpKind jk, Int offsIP )
5091 {
5092 if (vex_traceflags & VEX_TRACE_VCODE) {
5093 vex_printf( "\n-- PUT(%d) = ", offsIP);
5094 ppIRExpr( next );
5095 vex_printf( "; exit-");
5096 ppIRJumpKind(jk);
5097 vex_printf( "\n");
5098 }
5100 /* Case: boring transfer to known address */
5101 if (next->tag == Iex_Const) {
5102 IRConst* cdst = next->Iex.Const.con;
5103 vassert(cdst->tag == Ico_U64);
5104 if (jk == Ijk_Boring || jk == Ijk_Call) {
5105 /* Boring transfer to known address */
5106 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5107 if (env->chainingAllowed) {
5108 /* .. almost always true .. */
5109 /* Skip the event check at the dst if this is a forwards
5110 edge. */
5111 Bool toFastEP
5112 = ((Addr64)cdst->Ico.U64) > env->max_ga;
5113 if (0) vex_printf("%s", toFastEP ? "X" : ".");
5114 addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5115 amRIP, Acc_ALWAYS,
5116 toFastEP));
5117 } else {
5118 /* .. very occasionally .. */
5119 /* We can't use chaining, so ask for an assisted transfer,
5120 as that's the only alternative that is
5121 allowable. */
5122 HReg r = iselIntExpr_R(env, next);
5123 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5124 Ijk_Boring));
5125 }
5126 return;
5127 }
5128 }
5130 /* Case: call/return (==boring) transfer to any address */
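/* (Editorial note, reflecting my understanding of the two forms used
   below: XIndir is an unassisted indirect transfer that lets the
   dispatcher try its fast translation-cache lookup, whereas XAssisted
   always returns to the scheduler together with a reason, the
   IRJumpKind.  XIndir is therefore preferred whenever chaining is
   allowed.) */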
5131 switch (jk) {
5132 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5133 HReg r = iselIntExpr_R(env, next);
5134 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5135 if (env->chainingAllowed) {
5136 addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5137 } else {
5138 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5139 Ijk_Boring));
5140 }
5141 return;
5142 }
5143 default:
5144 break;
5145 }
5147 /* Case: assisted transfer to arbitrary address */
5148 switch (jk) {
5149 /* Keep this list in sync with that for Ist_Exit above */
5150 case Ijk_ClientReq:
5151 case Ijk_EmWarn:
5152 case Ijk_NoDecode:
5153 case Ijk_NoRedir:
5154 case Ijk_SigSEGV:
5155 case Ijk_SigTRAP:
5156 case Ijk_Sys_syscall:
5157 case Ijk_Sys_int210:
5158 case Ijk_InvalICache:
5159 case Ijk_Yield: {
5160 HReg r = iselIntExpr_R(env, next);
5161 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5162 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5163 return;
5164 }
5165 default:
5166 break;
5167 }
5169 vex_printf( "\n-- PUT(%d) = ", offsIP);
5170 ppIRExpr( next );
5171 vex_printf( "; exit-");
5172 ppIRJumpKind(jk);
5173 vex_printf( "\n");
5174 vassert(0); // are we expecting any other kind?
5175 }
5178 /*---------------------------------------------------------*/
5179 /*--- Insn selector top-level ---*/
5180 /*---------------------------------------------------------*/
5182 /* Translate an entire SB to amd64 code. */
5184 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5185 VexArch arch_host,
5186 const VexArchInfo* archinfo_host,
5187 const VexAbiInfo* vbi/*UNUSED*/,
5188 Int offs_Host_EvC_Counter,
5189 Int offs_Host_EvC_FailAddr,
5190 Bool chainingAllowed,
5191 Bool addProfInc,
5192 Addr max_ga )
5193 {
5194 Int i, j;
5195 HReg hreg, hregHI;
5196 ISelEnv* env;
5197 UInt hwcaps_host = archinfo_host->hwcaps;
5198 AMD64AMode *amCounter, *amFailAddr;
5200 /* sanity ... */
5201 vassert(arch_host == VexArchAMD64);
5202 vassert(0 == (hwcaps_host
5203 & ~(VEX_HWCAPS_AMD64_SSE3
5204 | VEX_HWCAPS_AMD64_SSSE3
5205 | VEX_HWCAPS_AMD64_CX16
5206 | VEX_HWCAPS_AMD64_LZCNT
5207 | VEX_HWCAPS_AMD64_AVX
5208 | VEX_HWCAPS_AMD64_RDTSCP
5209 | VEX_HWCAPS_AMD64_BMI
5210 | VEX_HWCAPS_AMD64_AVX2
5211 | VEX_HWCAPS_AMD64_F16C
5212 | VEX_HWCAPS_AMD64_RDRAND)));
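/* (Editorial note: this assertion simply rejects any hwcaps bits that
   this instruction selector does not know about, so a newly introduced
   VEX_HWCAPS_AMD64_* flag must be added to the list above before
   translations can be requested with it set.) */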
5214 /* Check that the host's endianness is as expected. */
5215 vassert(archinfo_host->endness == VexEndnessLE);
5217 /* Make up an initial environment to use. */
5218 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5219 env->vreg_ctr = 0;
5221 /* Set up output code array. */
5222 env->code = newHInstrArray();
5224 /* Copy BB's type env. */
5225 env->type_env = bb->tyenv;
5227 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
5228 change as we go along. */
5229 env->n_vregmap = bb->tyenv->types_used;
5230 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5231 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5233 /* and finally ... */
5234 env->chainingAllowed = chainingAllowed;
5235 env->hwcaps = hwcaps_host;
5236 env->max_ga = max_ga;
5238 /* For each IR temporary, allocate a suitably-kinded virtual
5239 register. */
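/* (Editorial note, inferred from the switch below: I128 and V256
   temporaries do not fit in a single host register, so they get a pair
   of vregs, the low half in vregmap[] and the high half in
   vregmapHI[]; lookupIRTempPair elsewhere in this file depends on that
   layout.  Note also that F32 and F64 temporaries are carried in
   128-bit vector vregs, since this backend does scalar floating point
   in the SSE registers rather than on the x87 stack.) */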
5240 j = 0;
5241 for (i = 0; i < env->n_vregmap; i++) {
5242 hregHI = hreg = INVALID_HREG;
5243 switch (bb->tyenv->types[i]) {
5244 case Ity_I1:
5245 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5246 hreg = mkHReg(True, HRcInt64, 0, j++);
5247 break;
5248 case Ity_I128:
5249 hreg = mkHReg(True, HRcInt64, 0, j++);
5250 hregHI = mkHReg(True, HRcInt64, 0, j++);
5251 break;
5252 case Ity_F32:
5253 case Ity_F64:
5254 case Ity_V128:
5255 hreg = mkHReg(True, HRcVec128, 0, j++);
5256 break;
5257 case Ity_V256:
5258 hreg = mkHReg(True, HRcVec128, 0, j++);
5259 hregHI = mkHReg(True, HRcVec128, 0, j++);
5260 break;
5261 default:
5262 ppIRType(bb->tyenv->types[i]);
5263 vpanic("iselBB(amd64): IRTemp type");
5264 }
5265 env->vregmap[i] = hreg;
5266 env->vregmapHI[i] = hregHI;
5267 }
5268 env->vreg_ctr = j;
5270 /* The very first instruction must be an event check. */
5271 amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
5272 amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5273 addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
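/* (Editorial note, describing my understanding of EvCheck: it
   decrements the counter found at amCounter and, if the result goes
   negative, jumps to the address stored at amFailAddr, handing control
   back to the scheduler.  Making it the first instruction means every
   entry through the slow entry point performs the check, while chained
   jumps to the fast entry point bypass it.) */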
5275 /* Possibly a block counter increment (for profiling). At this
5276 point we don't know the address of the counter, so just pretend
5277 it is zero. It will have to be patched later, but before this
5278 translation is used, by a call to LibVEX_patchProfCtr. */
5279 if (addProfInc) {
5280 addInstr(env, AMD64Instr_ProfInc());
5281 }
5283 /* Ok, finally we can iterate over the statements. */
5284 for (i = 0; i < bb->stmts_used; i++)
5285 if (bb->stmts[i])
5286 iselStmt(env, bb->stmts[i]);
5288 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5290 /* record the number of vregs we used. */
5291 env->code->n_vregs = env->vreg_ctr;
5292 return env->code;
5293 }
5296 /*---------------------------------------------------------------*/
5297 /*--- end host_amd64_isel.c ---*/
5298 /*---------------------------------------------------------------*/