2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_ir.h"
36 #include "libvex.h"
38 #include "ir_match.h"
39 #include "main_util.h"
40 #include "main_globals.h"
41 #include "host_generic_regs.h"
42 #include "host_generic_simd64.h"
43 #include "host_generic_simd128.h"
44 #include "host_generic_simd256.h"
45 #include "host_generic_maddf.h"
46 #include "host_amd64_defs.h"
49 /*---------------------------------------------------------*/
50 /*--- x87/SSE control word stuff ---*/
51 /*---------------------------------------------------------*/
53 /* Vex-generated code expects to run with the FPU set as follows: all
54 exceptions masked, round-to-nearest, precision = 53 bits. This
55 corresponds to an FPU control word value of 0x027F.
57 Similarly the SSE control word (%mxcsr) should be 0x1F80.
59 %fpucw and %mxcsr should have these values on entry to
60 Vex-generated code, and those values should be
61 unchanged at exit.
64 #define DEFAULT_FPUCW 0x027F
66 #define DEFAULT_MXCSR 0x1F80
68 /* debugging only, do not use */
69 /* define DEFAULT_FPUCW 0x037F */
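/* For reference: 0x027F masks all six x87 exceptions (low bits),
   selects 53-bit precision (PC field, bits 9:8 = 10b) and
   round-to-nearest (RC field, bits 11:10 = 00b); the debug value
   0x037F differs only in selecting 64-bit precision.  0x1F80 in
   %mxcsr masks all six SSE exceptions (bits 12:7) and leaves the
   rounding-control field (bits 14:13) at 00b, round-to-nearest. */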
72 /*---------------------------------------------------------*/
73 /*--- misc helpers ---*/
74 /*---------------------------------------------------------*/
76 /* These are duplicated in guest-amd64/toIR.c */
77 static IRExpr* unop ( IROp op, IRExpr* a )
79 return IRExpr_Unop(op, a);
82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
84 return IRExpr_Binop(op, a1, a2);
87 static IRExpr* bind ( Int binder )
89 return IRExpr_Binder(binder);
92 static Bool isZeroU8 ( const IRExpr* e )
94 return e->tag == Iex_Const
95 && e->Iex.Const.con->tag == Ico_U8
96 && e->Iex.Const.con->Ico.U8 == 0;
100 /*---------------------------------------------------------*/
101 /*--- ISelEnv ---*/
102 /*---------------------------------------------------------*/
104 /* This carries around:
106 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
107 might encounter. This is computed before insn selection starts,
108 and does not change.
110 - A mapping from IRTemp to HReg. This tells the insn selector
111 which virtual register is associated with each IRTemp
112 temporary. This is computed before insn selection starts, and
113 does not change. We expect this mapping to map precisely the
114 same set of IRTemps as the type mapping does.
116 - vregmap holds the primary register for the IRTemp.
117 - vregmapHI is only used for 128-bit integer-typed
118 IRTemps. It holds the identity of a second
119 64-bit virtual HReg, which holds the high half
120 of the value.
122 - The host subarchitecture we are selecting insns for.
123 This is set at the start and does not change.
125 - The code array, that is, the insns selected so far.
127 - A counter, for generating new virtual registers.
129 - A Bool for indicating whether we may generate chain-me
130 instructions for control flow transfers, or whether we must use
131 XAssisted.
133 - The maximum guest address of any guest insn in this block.
134 Actually, the address of the highest-addressed byte from any insn
135 in this block. It is set at the start and does not change. This is
136 used for detecting jumps which are definitely forward-edges from
137 this block, and therefore can be made (chained) to the fast entry
138 point of the destination, thereby avoiding the destination's
139 event check.
141 Note, this is all host-independent. (JRS 20050201: well, kinda
142 ... not completely. Compare with ISelEnv for X86.)
145 typedef
146 struct {
147 /* Constant -- these are set at the start and do not change. */
148 IRTypeEnv* type_env;
150 HReg* vregmap;
151 HReg* vregmapHI;
152 Int n_vregmap;
154 UInt hwcaps;
156 Bool chainingAllowed;
157 Addr64 max_ga;
159 /* These are modified as we go along. */
160 HInstrArray* code;
161 Int vreg_ctr;
163 ISelEnv;
166 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
168 vassert(tmp >= 0);
169 vassert(tmp < env->n_vregmap);
170 return env->vregmap[tmp];
173 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
174 ISelEnv* env, IRTemp tmp )
176 vassert(tmp >= 0);
177 vassert(tmp < env->n_vregmap);
178 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
179 *vrLO = env->vregmap[tmp];
180 *vrHI = env->vregmapHI[tmp];
183 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
185 addHInstr(env->code, instr);
186 if (vex_traceflags & VEX_TRACE_VCODE) {
187 ppAMD64Instr(instr, True);
188 vex_printf("\n");
192 static HReg newVRegI ( ISelEnv* env )
194 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
195 env->vreg_ctr++;
196 return reg;
199 static HReg newVRegV ( ISelEnv* env )
201 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
202 env->vreg_ctr++;
203 return reg;
207 /*---------------------------------------------------------*/
208 /*--- ISEL: Forward declarations ---*/
209 /*---------------------------------------------------------*/
211 /* These are organised as iselXXX and iselXXX_wrk pairs. The
212 iselXXX_wrk do the real work, but are not to be called directly.
213 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
214 checks that all returned registers are virtual. You should not
215 call the _wrk version directly.
217 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
218 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
220 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
221 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
223 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
224 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
226 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
227 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
229 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
230 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
232 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
233 ISelEnv* env, const IRExpr* e );
234 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, const IRExpr* e );
237 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
238 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
240 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
241 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
243 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
244 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
246 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
247 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
249 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
250 ISelEnv* env, const IRExpr* e );
251 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, const IRExpr* e );
255 /*---------------------------------------------------------*/
256 /*--- ISEL: Misc helpers ---*/
257 /*---------------------------------------------------------*/
259 static Bool sane_AMode ( AMD64AMode* am )
261 switch (am->tag) {
262 case Aam_IR:
263 return
264 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
265 && (hregIsVirtual(am->Aam.IR.reg)
266 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
267 case Aam_IRRS:
268 return
269 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
270 && hregIsVirtual(am->Aam.IRRS.base)
271 && hregClass(am->Aam.IRRS.index) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.index) );
273 default:
274 vpanic("sane_AMode: unknown amd64 amode tag");
279 /* Can the lower 32 bits be signedly widened to produce the whole
280 64-bit value? In other words, are the top 33 bits either all 0 or
281 all 1 ? */
282 static Bool fitsIn32Bits ( ULong x )
284 Long y1;
285 y1 = x << 32;
286 y1 >>=/*s*/ 32;
287 return toBool(x == y1);
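/* For example, 0xFFFFFFFF80000000ULL fits, being the sign-extension
   of 0x80000000, whereas 0x0000000080000000ULL does not: its top 33
   bits are neither all zero nor all one. */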
290 /* Is this a 64-bit zero expression? */
292 static Bool isZeroU64 ( const IRExpr* e )
294 return e->tag == Iex_Const
295 && e->Iex.Const.con->tag == Ico_U64
296 && e->Iex.Const.con->Ico.U64 == 0ULL;
299 static Bool isZeroU32 ( const IRExpr* e )
301 return e->tag == Iex_Const
302 && e->Iex.Const.con->tag == Ico_U32
303 && e->Iex.Const.con->Ico.U32 == 0;
306 /* Are both args atoms and the same? This is a copy of eqIRAtom
307 that omits the assertions that the args are indeed atoms. */
309 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
311 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
312 return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
313 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
314 return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
315 return False;
318 /* Make an int reg-reg move. */
320 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
322 vassert(hregClass(src) == HRcInt64);
323 vassert(hregClass(dst) == HRcInt64);
324 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
327 /* Make a vector (128 bit) reg-reg move. */
329 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
331 vassert(hregClass(src) == HRcVec128);
332 vassert(hregClass(dst) == HRcVec128);
333 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
336 /* Advance/retreat %rsp by n. */
338 static void add_to_rsp ( ISelEnv* env, Int n )
340 vassert(n > 0 && n < 256 && (n%8) == 0);
341 addInstr(env,
342 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
343 hregAMD64_RSP()));
346 static void sub_from_rsp ( ISelEnv* env, Int n )
348 vassert(n > 0 && n < 256 && (n%8) == 0);
349 addInstr(env,
350 AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
351 hregAMD64_RSP()));
354 /* Push a 64-bit constant on the stack. */
355 static void push_uimm64( ISelEnv* env, ULong uimm64 )
357 /* If uimm64 can be expressed as the sign extension of its
358 lower 32 bits, we can do it the easy way. */
359 Long simm64 = (Long)uimm64;
360 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
361 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
362 } else {
363 HReg tmp = newVRegI(env);
364 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
365 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
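/* For instance, push_uimm64(env, 0x12345678) becomes a single
   pushq $0x12345678, since the value is sign-extendable from 32
   bits, whereas push_uimm64(env, 0x1122334455667788) needs a
   64-bit immediate load into a scratch register followed by a
   pushq of that register. */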
370 /* Used only in doHelperCall. If possible, produce a single
371 instruction which computes 'e' into 'dst'. If not possible, return
372 NULL. */
374 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
375 HReg dst,
376 IRExpr* e )
378 /* Per comments in doHelperCall below, appearance of
379 Iex_VECRET implies ill-formed IR. */
380 vassert(e->tag != Iex_VECRET);
382 /* In this case we give out a copy of the BaseBlock pointer. */
383 if (UNLIKELY(e->tag == Iex_GSPTR)) {
384 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
387 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
389 if (e->tag == Iex_Const) {
390 vassert(e->Iex.Const.con->tag == Ico_U64);
391 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
392 return AMD64Instr_Alu64R(
393 Aalu_MOV,
394 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
395 dst
396 );
397 } else {
398 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
402 if (e->tag == Iex_RdTmp) {
403 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
404 return mk_iMOVsd_RR(src, dst);
407 if (e->tag == Iex_Get) {
408 vassert(e->Iex.Get.ty == Ity_I64);
409 return AMD64Instr_Alu64R(
410 Aalu_MOV,
411 AMD64RMI_Mem(
412 AMD64AMode_IR(e->Iex.Get.offset,
413 hregAMD64_RBP())),
414 dst);
417 if (e->tag == Iex_Unop
418 && e->Iex.Unop.op == Iop_32Uto64
419 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
420 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
421 return AMD64Instr_MovxLQ(False, src, dst);
424 if (0) { ppIRExpr(e); vex_printf("\n"); }
426 return NULL;
430 /* Do a complete function call. |guard| is a Ity_Bit expression
431 indicating whether or not the call happens. If guard==NULL, the
432 call is unconditional. |retloc| is set to indicate where the
433 return value is after the call. The caller (of this fn) must
434 generate code to add |stackAdjustAfterCall| to the stack pointer
435 after the call is done. */
437 static
438 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
439 /*OUT*/RetLoc* retloc,
440 ISelEnv* env,
441 IRExpr* guard,
442 IRCallee* cee, IRType retTy, IRExpr** args )
444 AMD64CondCode cc;
445 HReg argregs[6];
446 HReg tmpregs[6];
447 AMD64Instr* fastinstrs[6];
448 UInt n_args, i;
450 /* Set default returns. We'll update them later if needed. */
451 *stackAdjustAfterCall = 0;
452 *retloc = mk_RetLoc_INVALID();
454 /* These are used for cross-checking that IR-level constraints on
455 the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
456 UInt nVECRETs = 0;
457 UInt nGSPTRs = 0;
459 /* Marshal args for a call and do the call.
461 This function only deals with a tiny set of possibilities, which
462 cover all helpers in practice. The restrictions are that only
463 arguments in registers are supported, hence only 6x64 integer
464 bits in total can be passed. In fact the only supported arg
465 type is I64.
467 The return type can be I{64,32,16,8} or V{128,256}. In the
468 latter two cases, it is expected that |args| will contain the
469 special node IRExpr_VECRET(), in which case this routine
470 generates code to allocate space on the stack for the vector
471 return value. Since we are not passing any scalars on the
472 stack, it is enough to preallocate the return space before
473 marshalling any arguments, in this case.
475 |args| may also contain IRExpr_GSPTR(), in which case the
476 value in %rbp is passed as the corresponding argument.
478 Generating code which is both efficient and correct when
479 parameters are to be passed in registers is difficult, for the
480 reasons elaborated in detail in comments attached to
481 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
482 of the method described in those comments.
484 The problem is split into two cases: the fast scheme and the
485 slow scheme. In the fast scheme, arguments are computed
486 directly into the target (real) registers. This is only safe
487 when we can be sure that computation of each argument will not
488 trash any real registers set by computation of any other
489 argument.
491 In the slow scheme, all args are first computed into vregs, and
492 once they are all done, they are moved to the relevant real
493 regs. This always gives correct code, but it also gives a bunch
494 of vreg-to-rreg moves which are usually redundant but are hard
495 for the register allocator to get rid of.
497 To decide which scheme to use, all argument expressions are
498 first examined. If they are all so simple that it is clear they
499 will be evaluated without use of any fixed registers, use the
500 fast scheme, else use the slow scheme. Note also that only
501 unconditional calls may use the fast scheme, since having to
502 compute a condition expression could itself trash real
503 registers. Note that for simplicity, in the case where
504 IRExpr_VECRET() is present, we use the slow scheme. This is
505 motivated by the desire to avoid any possible complexity
506 w.r.t. nested calls.
508 Note this requires being able to examine an expression and
509 determine whether or not evaluation of it might use a fixed
510 register. That requires knowledge of how the rest of this insn
511 selector works. Currently just the following 3 are regarded as
512 safe -- hopefully they cover the majority of arguments in
513 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
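/* Illustrative example (names invented): a call whose args are a
   RdTmp, a 64-bit Get and a small constant qualifies for the fast
   scheme, since each can be computed straight into %rdi/%rsi/%rdx
   with one instruction (see iselIntExpr_single_instruction above).
   An arg such as Add64(t1,t2) forces the slow scheme, because it
   cannot be computed into its target register in one instruction. */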
516 /* Note that the cee->regparms field is meaningless on AMD64 host
517 (since there is only one calling convention) and so we always
518 ignore it. */
519 n_args = 0;
520 for (i = 0; args[i]; i++)
521 n_args++;
523 if (n_args > 6)
524 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
526 argregs[0] = hregAMD64_RDI();
527 argregs[1] = hregAMD64_RSI();
528 argregs[2] = hregAMD64_RDX();
529 argregs[3] = hregAMD64_RCX();
530 argregs[4] = hregAMD64_R8();
531 argregs[5] = hregAMD64_R9();
533 tmpregs[0] = tmpregs[1] = tmpregs[2] =
534 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
536 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
537 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
539 /* First decide which scheme (slow or fast) is to be used. First
540 assume the fast scheme, and select slow if any contraindications
541 (wow) appear. */
543 /* We'll need space on the stack for the return value. Avoid
544 possible complications with nested calls by using the slow
545 scheme. */
546 if (retTy == Ity_V128 || retTy == Ity_V256)
547 goto slowscheme;
549 if (guard) {
550 if (guard->tag == Iex_Const
551 && guard->Iex.Const.con->tag == Ico_U1
552 && guard->Iex.Const.con->Ico.U1 == True) {
553 /* unconditional */
554 } else {
555 /* Not manifestly unconditional -- be conservative. */
556 goto slowscheme;
560 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
561 use the slow scheme. Because this is tentative, we can't call
562 addInstr (that is, commit to) any instructions until we've
563 handled all the arguments. So park the resulting instructions
564 in a buffer and emit that if we're successful. */
566 /* FAST SCHEME */
567 /* In this loop, we process args that can be computed into the
568 destination (real) register with a single instruction, without
569 using any fixed regs. That also includes IRExpr_GSPTR(), but
570 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
571 never see IRExpr_VECRET() at this point, since the return-type
572 check above should ensure all those cases use the slow scheme
573 instead. */
574 vassert(n_args >= 0 && n_args <= 6);
575 for (i = 0; i < n_args; i++) {
576 IRExpr* arg = args[i];
577 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
578 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
580 fastinstrs[i]
581 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
582 if (fastinstrs[i] == NULL)
583 goto slowscheme;
586 /* Looks like we're in luck. Emit the accumulated instructions and
587 move on to doing the call itself. */
588 for (i = 0; i < n_args; i++)
589 addInstr(env, fastinstrs[i]);
591 /* Fast scheme only applies for unconditional calls. Hence: */
592 cc = Acc_ALWAYS;
594 goto handle_call;
597 /* SLOW SCHEME; move via temporaries */
598 slowscheme:
600 # if 0 /* debug only */
601 if (n_args > 0) {for (i = 0; args[i]; i++) {
602 ppIRExpr(args[i]); vex_printf(" "); }
603 vex_printf("\n");}
604 # endif
606 /* If we have a vector return type, allocate a place for it on the
607 stack and record its address. */
608 HReg r_vecRetAddr = INVALID_HREG;
609 if (retTy == Ity_V128) {
610 r_vecRetAddr = newVRegI(env);
611 sub_from_rsp(env, 16);
612 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
614 else if (retTy == Ity_V256) {
615 r_vecRetAddr = newVRegI(env);
616 sub_from_rsp(env, 32);
617 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
620 vassert(n_args >= 0 && n_args <= 6);
621 for (i = 0; i < n_args; i++) {
622 IRExpr* arg = args[i];
623 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
624 tmpregs[i] = newVRegI(env);
625 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
626 nGSPTRs++;
628 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
629 /* We stashed the address of the return slot earlier, so just
630 retrieve it now. */
631 vassert(!hregIsInvalid(r_vecRetAddr));
632 tmpregs[i] = r_vecRetAddr;
633 nVECRETs++;
635 else {
636 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
637 tmpregs[i] = iselIntExpr_R(env, args[i]);
641 /* Now we can compute the condition. We can't do it earlier
642 because the argument computations could trash the condition
643 codes. Be a bit clever to handle the common case where the
644 guard is 1:Bit. */
645 cc = Acc_ALWAYS;
646 if (guard) {
647 if (guard->tag == Iex_Const
648 && guard->Iex.Const.con->tag == Ico_U1
649 && guard->Iex.Const.con->Ico.U1 == True) {
650 /* unconditional -- do nothing */
651 } else {
652 cc = iselCondCode( env, guard );
656 /* Move the args to their final destinations. */
657 for (i = 0; i < n_args; i++) {
658 /* None of these insns, including any spill code that might
659 be generated, may alter the condition codes. */
660 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
664 /* Do final checks, set the return values, and generate the call
665 instruction proper. */
666 handle_call:
668 if (retTy == Ity_V128 || retTy == Ity_V256) {
669 vassert(nVECRETs == 1);
670 } else {
671 vassert(nVECRETs == 0);
674 vassert(nGSPTRs == 0 || nGSPTRs == 1);
676 vassert(*stackAdjustAfterCall == 0);
677 vassert(is_RetLoc_INVALID(*retloc));
678 switch (retTy) {
679 case Ity_INVALID:
680 /* Function doesn't return a value. */
681 *retloc = mk_RetLoc_simple(RLPri_None);
682 break;
683 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
684 *retloc = mk_RetLoc_simple(RLPri_Int);
685 break;
686 case Ity_V128:
687 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
688 *stackAdjustAfterCall = 16;
689 break;
690 case Ity_V256:
691 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
692 *stackAdjustAfterCall = 32;
693 break;
694 default:
695 /* IR can denote other possible return types, but we don't
696 handle those here. */
697 vassert(0);
700 /* Finally, generate the call itself. This needs the *retloc value
701 set in the switch above, which is why it's at the end. */
702 addInstr(env,
703 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
707 /* Given a guest-state array descriptor, an index expression and a
708 bias, generate an AMD64AMode holding the relevant guest state
709 offset. */
711 static
712 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
713 IRExpr* off, Int bias )
715 HReg tmp, roff;
716 Int elemSz = sizeofIRType(descr->elemTy);
717 Int nElems = descr->nElems;
719 /* Throw out any cases not generated by an amd64 front end. In
720 theory there might be a day where we need to handle them -- if
721 we ever run non-amd64-guest on amd64 host. */
723 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
724 vpanic("genGuestArrayOffset(amd64 host)");
726 /* Compute off into a reg, %off. Then return:
728 movq %off, %tmp
729 addq $bias, %tmp (if bias != 0)
730 andq $7, %tmp
731 ... base(%rbp, %tmp, shift) ...
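/* For example, with nElems == 8, elemSz == 8, off evaluating to 5
   and bias == 0, %tmp ends up holding 5 and the amode returned is
   base(%rbp,%tmp,8), i.e. guest state offset descr->base + 5*8
   relative to %rbp. */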
733 tmp = newVRegI(env);
734 roff = iselIntExpr_R(env, off);
735 addInstr(env, mk_iMOVsd_RR(roff, tmp));
736 if (bias != 0) {
737 /* Make sure the bias is sane, in the sense that there are
738 no significant bits above bit 30 in it. */
739 vassert(-10000 < bias && bias < 10000);
740 addInstr(env,
741 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
743 addInstr(env,
744 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
745 vassert(elemSz == 1 || elemSz == 8);
746 return
747 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
748 elemSz==8 ? 3 : 0);
752 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
753 static
754 void set_SSE_rounding_default ( ISelEnv* env )
756 /* pushq $DEFAULT_MXCSR
757 ldmxcsr 0(%rsp)
758 addq $8, %rsp
760 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
761 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
762 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
763 add_to_rsp(env, 8);
766 /* Mess with the FPU's rounding mode: set to the default rounding mode
767 (DEFAULT_FPUCW). */
768 static
769 void set_FPU_rounding_default ( ISelEnv* env )
771 /* movq $DEFAULT_FPUCW, -8(%rsp)
772 fldcw -8(%rsp)
774 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
775 addInstr(env, AMD64Instr_Alu64M(
776 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
777 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
781 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
782 expression denoting a value in the range 0 .. 3, indicating a round
783 mode encoded as per type IRRoundingMode. Set the SSE machinery to
784 have the same rounding.
786 static
787 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
789 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
790 both rounding bits == 0. If that wasn't the case, we couldn't
791 create a new rounding field simply by ORing the new value into
792 place. */
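/* A further point: this only works because IRRoundingMode uses the
   same 2-bit encoding as the hardware RC field -- 0 = nearest,
   1 = -infinity, 2 = +infinity, 3 = toward zero -- so the IR value
   can be shifted directly into %mxcsr bits 14:13. */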
794 /* movq $3, %reg
795 andq [[mode]], %reg -- shouldn't be needed; paranoia
796 shlq $13, %reg
797 orq $DEFAULT_MXCSR, %reg
798 pushq %reg
799 ldmxcsr 0(%rsp)
800 addq $8, %rsp
802 HReg reg = newVRegI(env);
803 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
804 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
805 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
806 iselIntExpr_RMI(env, mode), reg));
807 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
808 addInstr(env, AMD64Instr_Alu64R(
809 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
810 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
811 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
812 add_to_rsp(env, 8);
816 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
817 expression denoting a value in the range 0 .. 3, indicating a round
818 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
819 the same rounding.
821 static
822 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
824 HReg rrm = iselIntExpr_R(env, mode);
825 HReg rrm2 = newVRegI(env);
826 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
828 /* movq %rrm, %rrm2
829 andq $3, %rrm2 -- shouldn't be needed; paranoia
830 shlq $10, %rrm2
831 orq $DEFAULT_FPUCW, %rrm2
832 movq %rrm2, -8(%rsp)
833 fldcw -8(%rsp)
835 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
836 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
837 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
838 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
839 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
840 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
841 AMD64RI_Reg(rrm2), m8_rsp));
842 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
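/* As with set_SSE_rounding_mode above, this relies on IRRoundingMode
   matching the hardware RC encoding; the only difference is that the
   x87 RC field sits at bits 11:10 of the control word, hence the
   shift by 10 rather than 13. */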
846 /* Generate all-zeroes into a new vector register.
848 static HReg generate_zeroes_V128 ( ISelEnv* env )
850 HReg dst = newVRegV(env);
851 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
852 return dst;
855 /* Generate all-ones into a new vector register.
857 static HReg generate_ones_V128 ( ISelEnv* env )
859 HReg dst = newVRegV(env);
860 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
861 return dst;
865 /* Generate !src into a new vector register. Amazing that there isn't
866 a less crappy way to do this.
868 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
870 HReg dst = generate_ones_V128(env);
871 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
872 return dst;
876 /* Expand the given byte into a 64-bit word, by cloning each bit
877 8 times. */
878 static ULong bitmask8_to_bytemask64 ( UShort w8 )
880 vassert(w8 == (w8 & 0xFF));
881 ULong w64 = 0;
882 Int i;
883 for (i = 0; i < 8; i++) {
884 if (w8 & (1<<i))
885 w64 |= (0xFFULL << (8 * i));
887 return w64;
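/* For example, bitmask8_to_bytemask64(0x81) == 0xFF000000000000FFULL:
   bits 0 and 7 of the input select bytes 0 and 7 of the result. */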
891 /*---------------------------------------------------------*/
892 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
893 /*---------------------------------------------------------*/
895 /* Select insns for an integer-typed expression, and add them to the
896 code list. Return a reg holding the result. This reg will be a
897 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
898 want to modify it, ask for a new vreg, copy it in there, and modify
899 the copy. The register allocator will do its best to map both
900 vregs to the same real register, so the copies will often disappear
901 later in the game.
903 This should handle expressions of 64, 32, 16 and 8-bit type. All
904 results are returned in a 64-bit register. For 32-, 16- and 8-bit
905 expressions, the upper 32/48/56 bits are arbitrary, so you should
906 mask or sign extend partial values if necessary.
909 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
911 HReg r = iselIntExpr_R_wrk(env, e);
912 /* sanity checks ... */
913 # if 0
914 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
915 # endif
916 vassert(hregClass(r) == HRcInt64);
917 vassert(hregIsVirtual(r));
918 return r;
921 /* DO NOT CALL THIS DIRECTLY ! */
922 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
924 MatchInfo mi;
925 DECLARE_PATTERN(p_1Uto8_64to1);
926 DECLARE_PATTERN(p_LDle8_then_8Uto64);
927 DECLARE_PATTERN(p_LDle16_then_16Uto64);
929 IRType ty = typeOfIRExpr(env->type_env,e);
930 switch (ty) {
931 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
932 default: vassert(0);
935 switch (e->tag) {
937 /* --------- TEMP --------- */
938 case Iex_RdTmp: {
939 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
942 /* --------- LOAD --------- */
943 case Iex_Load: {
944 HReg dst = newVRegI(env);
945 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
947 /* We can't handle big-endian loads, nor load-linked. */
948 if (e->Iex.Load.end != Iend_LE)
949 goto irreducible;
951 if (ty == Ity_I64) {
952 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
953 AMD64RMI_Mem(amode), dst) );
954 return dst;
956 if (ty == Ity_I32) {
957 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
958 return dst;
960 if (ty == Ity_I16) {
961 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
962 return dst;
964 if (ty == Ity_I8) {
965 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
966 return dst;
968 break;
971 /* --------- BINARY OP --------- */
972 case Iex_Binop: {
973 AMD64AluOp aluOp;
974 AMD64ShiftOp shOp;
976 /* Pattern: Sub64(0,x) */
977 /* and: Sub32(0,x) */
978 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
979 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
980 HReg dst = newVRegI(env);
981 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
982 addInstr(env, mk_iMOVsd_RR(reg,dst));
983 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
984 return dst;
987 /* Is it an addition or logical style op? */
988 switch (e->Iex.Binop.op) {
989 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
990 aluOp = Aalu_ADD; break;
991 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
992 aluOp = Aalu_SUB; break;
993 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
994 aluOp = Aalu_AND; break;
995 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
996 aluOp = Aalu_OR; break;
997 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
998 aluOp = Aalu_XOR; break;
999 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1000 aluOp = Aalu_MUL; break;
1001 default:
1002 aluOp = Aalu_INVALID; break;
1004 /* For commutative ops we assume any literal
1005 values are on the second operand. */
1006 if (aluOp != Aalu_INVALID) {
1007 HReg dst = newVRegI(env);
1008 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1009 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1010 addInstr(env, mk_iMOVsd_RR(reg,dst));
1011 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1012 return dst;
1015 /* Perhaps a shift op? */
1016 switch (e->Iex.Binop.op) {
1017 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1018 shOp = Ash_SHL; break;
1019 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1020 shOp = Ash_SHR; break;
1021 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1022 shOp = Ash_SAR; break;
1023 default:
1024 shOp = Ash_INVALID; break;
1026 if (shOp != Ash_INVALID) {
1027 HReg dst = newVRegI(env);
1029 /* regL = the value to be shifted */
1030 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1031 addInstr(env, mk_iMOVsd_RR(regL,dst));
1033 /* Do any necessary widening for 32/16/8 bit operands */
1034 switch (e->Iex.Binop.op) {
1035 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1036 break;
1037 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1038 break;
1039 case Iop_Shr8:
1040 addInstr(env, AMD64Instr_Alu64R(
1041 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1042 break;
1043 case Iop_Shr16:
1044 addInstr(env, AMD64Instr_Alu64R(
1045 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1046 break;
1047 case Iop_Shr32:
1048 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1049 break;
1050 case Iop_Sar8:
1051 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1052 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1053 break;
1054 case Iop_Sar16:
1055 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1056 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1057 break;
1058 case Iop_Sar32:
1059 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1060 break;
1061 default:
1062 ppIROp(e->Iex.Binop.op);
1063 vassert(0);
1066 /* Now consider the shift amount. If it's a literal, we
1067 can do a much better job than the general case. */
1068 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1069 /* assert that the IR is well-typed */
1070 Int nshift;
1071 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1072 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1073 vassert(nshift >= 0);
1074 if (nshift > 0)
1075 /* Can't allow nshift==0 since that means %cl */
1076 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1077 } else {
1078 /* General case; we have to force the amount into %cl. */
1079 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1080 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1081 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1083 return dst;
1086 /* Handle misc other scalar ops. */
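/* Max32U is done with a 32-bit compare plus a conditional move: dst
   starts out as src1; the CMP sets the carry flag iff src1 <u src2,
   in which case the cmov (Acc_B) overwrites dst with src2, leaving
   dst = max(src1, src2) unsigned. */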
1087 if (e->Iex.Binop.op == Iop_Max32U) {
1088 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1089 HReg dst = newVRegI(env);
1090 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1091 addInstr(env, mk_iMOVsd_RR(src1, dst));
1092 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1093 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1094 return dst;
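/* For DivMod{S,U}64to32 below, the packed result is laid out as
   (remainder << 32) | quotient, each truncated to 32 bits; e.g.
   DivModU64to32(7, 2) produces 0x0000000100000003. */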
1097 if (e->Iex.Binop.op == Iop_DivModS64to32
1098 || e->Iex.Binop.op == Iop_DivModU64to32) {
1099 /* 64 x 32 -> (32(rem),32(div)) division */
1100 /* Get the 64-bit operand into edx:eax, and the other into
1101 any old R/M. */
1102 HReg rax = hregAMD64_RAX();
1103 HReg rdx = hregAMD64_RDX();
1104 HReg dst = newVRegI(env);
1105 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1106 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1107 /* Compute the left operand into a reg, and then
1108 put the top half in edx and the bottom in eax. */
1109 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1110 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1111 addInstr(env, mk_iMOVsd_RR(left64, rax));
1112 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1113 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1114 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1115 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1116 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1117 addInstr(env, mk_iMOVsd_RR(rax, dst));
1118 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1119 return dst;
1122 if (e->Iex.Binop.op == Iop_32HLto64) {
1123 HReg hi32 = newVRegI(env);
1124 HReg lo32 = newVRegI(env);
1125 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1126 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1127 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1128 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1129 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1130 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1131 addInstr(env, AMD64Instr_Alu64R(
1132 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1133 return hi32;
1136 if (e->Iex.Binop.op == Iop_16HLto32) {
1137 HReg hi16 = newVRegI(env);
1138 HReg lo16 = newVRegI(env);
1139 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1140 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1141 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1142 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1143 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1144 addInstr(env, AMD64Instr_Alu64R(
1145 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1148 return hi16;
1151 if (e->Iex.Binop.op == Iop_8HLto16) {
1152 HReg hi8 = newVRegI(env);
1153 HReg lo8 = newVRegI(env);
1154 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1155 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1156 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1157 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1158 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1159 addInstr(env, AMD64Instr_Alu64R(
1160 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1163 return hi8;
1166 if (e->Iex.Binop.op == Iop_MullS32
1167 || e->Iex.Binop.op == Iop_MullS16
1168 || e->Iex.Binop.op == Iop_MullS8
1169 || e->Iex.Binop.op == Iop_MullU32
1170 || e->Iex.Binop.op == Iop_MullU16
1171 || e->Iex.Binop.op == Iop_MullU8) {
1172 HReg a32 = newVRegI(env);
1173 HReg b32 = newVRegI(env);
1174 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1175 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1176 Int shift = 0;
1177 AMD64ShiftOp shr_op = Ash_SHR;
1178 switch (e->Iex.Binop.op) {
1179 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1180 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1181 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1182 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1183 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1184 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1185 default: vassert(0);
1188 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1189 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1190 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1191 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1192 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1194 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1195 return b32;
1198 if (e->Iex.Binop.op == Iop_CmpF64) {
1199 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1200 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1201 HReg dst = newVRegI(env);
1202 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1203 /* Mask out irrelevant parts of the result so as to conform
1204 to the CmpF64 definition. */
1205 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1206 return dst;
1209 if (e->Iex.Binop.op == Iop_F64toI32S
1210 || e->Iex.Binop.op == Iop_F64toI64S) {
1211 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1212 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1213 HReg dst = newVRegI(env);
1214 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1215 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1216 set_SSE_rounding_default(env);
1217 return dst;
1220 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1221 by using the equivalent 128-bit operation and ignoring the upper half
1222 of the result. */
1223 AMD64SseOp op = Asse_INVALID;
1224 Bool arg1isEReg = False;
1225 Bool preShift32R = False;
1226 switch (e->Iex.Binop.op) {
1227 // The following 3 could be done with 128 bit insns too, but
1228 // first require the inputs to be reformatted.
1229 //case Iop_QNarrowBin32Sto16Sx4:
1230 //op = Asse_PACKSSD; arg1isEReg = True; break;
1231 //case Iop_QNarrowBin16Sto8Sx8:
1232 //op = Asse_PACKSSW; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Ux8:
1234 //op = Asse_PACKUSW; arg1isEReg = True; break;
1236 case Iop_InterleaveHI8x8:
1237 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1238 break;
1239 case Iop_InterleaveHI16x4:
1240 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1241 break;
1242 case Iop_InterleaveHI32x2:
1243 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1244 break;
1245 case Iop_InterleaveLO8x8:
1246 op = Asse_UNPCKLB; arg1isEReg = True;
1247 break;
1248 case Iop_InterleaveLO16x4:
1249 op = Asse_UNPCKLW; arg1isEReg = True;
1250 break;
1251 case Iop_InterleaveLO32x2:
1252 op = Asse_UNPCKLD; arg1isEReg = True;
1253 break;
1255 case Iop_Add8x8: op = Asse_ADD8; break;
1256 case Iop_Add16x4: op = Asse_ADD16; break;
1257 case Iop_Add32x2: op = Asse_ADD32; break;
1258 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1259 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1260 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1261 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1262 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1263 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1264 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1265 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1266 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1267 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1268 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1269 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1270 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1271 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1272 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1273 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1274 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1275 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1276 case Iop_Mul16x4: op = Asse_MUL16; break;
1277 case Iop_Sub8x8: op = Asse_SUB8; break;
1278 case Iop_Sub16x4: op = Asse_SUB16; break;
1279 case Iop_Sub32x2: op = Asse_SUB32; break;
1280 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1281 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1282 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1283 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1284 default: break;
1286 if (op != Asse_INVALID) {
1287 /* This isn't pretty, but .. move each arg to the low half of an XMM
1288 register, do the operation on the whole register, and move the
1289 result back to an integer register. */
1290 const IRExpr* arg1 = e->Iex.Binop.arg1;
1291 const IRExpr* arg2 = e->Iex.Binop.arg2;
1292 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1293 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1294 HReg iarg1 = iselIntExpr_R(env, arg1);
1295 HReg iarg2 = iselIntExpr_R(env, arg2);
1296 HReg varg1 = newVRegV(env);
1297 HReg varg2 = newVRegV(env);
1298 HReg idst = newVRegI(env);
1299 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1300 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1301 if (arg1isEReg) {
1302 if (preShift32R) {
1303 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1304 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1306 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1307 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1308 } else {
1309 vassert(!preShift32R);
1310 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1311 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1313 return idst;
1316 UInt laneBits = 0;
1317 op = Asse_INVALID;
1318 switch (e->Iex.Binop.op) {
1319 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1320 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1321 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1322 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1323 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1324 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1325 default: break;
1327 if (op != Asse_INVALID) {
1328 const IRExpr* arg1 = e->Iex.Binop.arg1;
1329 const IRExpr* arg2 = e->Iex.Binop.arg2;
1330 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1331 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1332 HReg igreg = iselIntExpr_R(env, arg1);
1333 HReg vgreg = newVRegV(env);
1334 HReg idst = newVRegI(env);
1335 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1336 /* If it's a shift by an in-range immediate, generate a single
1337 instruction. */
1338 if (arg2->tag == Iex_Const) {
1339 IRConst* c = arg2->Iex.Const.con;
1340 vassert(c->tag == Ico_U8);
1341 UInt shift = c->Ico.U8;
1342 if (shift < laneBits) {
1343 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1344 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1345 return idst;
1348 /* Otherwise we have to do it the longwinded way. */
1349 HReg ishift = iselIntExpr_R(env, arg2);
1350 HReg vshift = newVRegV(env);
1351 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1352 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1353 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1354 return idst;
1357 if (e->Iex.Binop.op == Iop_Mul32x2) {
1358 const IRExpr* arg1 = e->Iex.Binop.arg1;
1359 const IRExpr* arg2 = e->Iex.Binop.arg2;
1360 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1361 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1362 HReg s1 = iselIntExpr_R(env, arg1);
1363 HReg s2 = iselIntExpr_R(env, arg2);
1364 HReg resLo = newVRegI(env);
1365 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1366 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1367 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1368 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1370 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1371 HReg resHi = newVRegI(env);
1372 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1373 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1374 HReg tmp = newVRegI(env);
1375 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1376 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1377 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1380 // final result = resHi | resLo
1381 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1382 return resLo;
1385 // A few remaining SIMD64 ops require helper functions, at least for
1386 // now.
1387 Bool second_is_UInt = False;
1388 HWord fn = 0;
1389 switch (e->Iex.Binop.op) {
1390 case Iop_CatOddLanes16x4:
1391 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1392 case Iop_CatEvenLanes16x4:
1393 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1394 case Iop_PermOrZero8x8:
1395 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1397 case Iop_QNarrowBin32Sto16Sx4:
1398 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1399 case Iop_QNarrowBin16Sto8Sx8:
1400 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1401 case Iop_QNarrowBin16Sto8Ux8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1404 case Iop_NarrowBin16to8x8:
1405 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1406 case Iop_NarrowBin32to16x4:
1407 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1409 case Iop_SarN8x8:
1410 fn = (HWord)h_generic_calc_SarN8x8;
1411 second_is_UInt = True;
1412 break;
1414 default:
1415 fn = (HWord)0; break;
1417 if (fn != (HWord)0) {
1418 /* Note: the following assumes all helpers are of signature
1419 ULong fn ( ULong, ULong ), and they are
1420 not marked as regparm functions.
1422 HReg dst = newVRegI(env);
1423 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1424 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1425 if (second_is_UInt)
1426 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1427 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1428 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1429 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1430 mk_RetLoc_simple(RLPri_Int) ));
1431 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1432 return dst;
1435 // Half-float vector conversion
1436 if (e->Iex.Binop.op == Iop_F32toF16x4
1437 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1438 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1439 HReg dstV = newVRegV(env);
1440 HReg dstI = newVRegI(env);
1441 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1442 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1443 set_SSE_rounding_default(env);
1444 addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1445 return dstI;
1448 break;
1451 /* --------- UNARY OP --------- */
1452 case Iex_Unop: {
1454 /* 1Uto8(64to1(expr64)) */
1456 DEFINE_PATTERN( p_1Uto8_64to1,
1457 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1458 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1459 const IRExpr* expr64 = mi.bindee[0];
1460 HReg dst = newVRegI(env);
1461 HReg src = iselIntExpr_R(env, expr64);
1462 addInstr(env, mk_iMOVsd_RR(src,dst) );
1463 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1464 AMD64RMI_Imm(1), dst));
1465 return dst;
1469 /* 8Uto64(LDle(expr64)) */
1471 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1472 unop(Iop_8Uto64,
1473 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1474 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1475 HReg dst = newVRegI(env);
1476 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1477 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1478 return dst;
1482 /* 16Uto64(LDle(expr64)) */
1484 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1485 unop(Iop_16Uto64,
1486 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1487 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1488 HReg dst = newVRegI(env);
1489 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1490 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1491 return dst;
1495 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1496 Use 32 bit arithmetic and let the default zero-extend rule
1497 do the 32Uto64 for free. */
1498 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1499 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1500 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1501 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1502 AMD64AluOp aluOp = Aalu_INVALID;
1503 switch (opi) {
1504 case Iop_Add32: aluOp = Aalu_ADD; break;
1505 case Iop_Sub32: aluOp = Aalu_SUB; break;
1506 case Iop_And32: aluOp = Aalu_AND; break;
1507 case Iop_Or32: aluOp = Aalu_OR; break;
1508 case Iop_Xor32: aluOp = Aalu_XOR; break;
1509 default: break;
1511 if (aluOp != Aalu_INVALID) {
1512 /* For commutative ops we assume any literal values are on
1513 the second operand. */
1514 HReg dst = newVRegI(env);
1515 HReg reg = iselIntExpr_R(env, argL);
1516 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1517 addInstr(env, mk_iMOVsd_RR(reg,dst));
1518 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1519 return dst;
1521 /* just fall through to normal handling for Iop_32Uto64 */
1524 /* Fallback cases */
1525 switch (e->Iex.Unop.op) {
1526 case Iop_32Uto64:
1527 case Iop_32Sto64: {
1528 HReg dst = newVRegI(env);
1529 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1530 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1531 src, dst) );
1532 return dst;
1534 case Iop_128HIto64: {
1535 HReg rHi, rLo;
1536 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1537 return rHi; /* and abandon rLo */
1539 case Iop_128to64: {
1540 HReg rHi, rLo;
1541 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1542 return rLo; /* and abandon rHi */
1544 case Iop_8Uto16:
1545 case Iop_8Uto32:
1546 case Iop_8Uto64:
1547 case Iop_16Uto64:
1548 case Iop_16Uto32: {
1549 HReg dst = newVRegI(env);
1550 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1551 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1552 || e->Iex.Unop.op==Iop_16Uto64 );
1553 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1554 addInstr(env, mk_iMOVsd_RR(src,dst) );
1555 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1556 AMD64RMI_Imm(mask), dst));
1557 return dst;
1559 case Iop_8Sto16:
1560 case Iop_8Sto64:
1561 case Iop_8Sto32:
1562 case Iop_16Sto32:
1563 case Iop_16Sto64: {
1564 HReg dst = newVRegI(env);
1565 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1566 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1567 || e->Iex.Unop.op==Iop_16Sto64 );
1568 UInt amt = srcIs16 ? 48 : 56;
1569 addInstr(env, mk_iMOVsd_RR(src,dst) );
1570 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1571 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1572 return dst;
1574 case Iop_Not8:
1575 case Iop_Not16:
1576 case Iop_Not32:
1577 case Iop_Not64: {
1578 HReg dst = newVRegI(env);
1579 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1580 addInstr(env, mk_iMOVsd_RR(src,dst) );
1581 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1582 return dst;
1584 case Iop_16HIto8:
1585 case Iop_32HIto16:
1586 case Iop_64HIto32: {
1587 HReg dst = newVRegI(env);
1588 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1589 Int shift = 0;
1590 switch (e->Iex.Unop.op) {
1591 case Iop_16HIto8: shift = 8; break;
1592 case Iop_32HIto16: shift = 16; break;
1593 case Iop_64HIto32: shift = 32; break;
1594 default: vassert(0);
1596 addInstr(env, mk_iMOVsd_RR(src,dst) );
1597 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1598 return dst;
1600 case Iop_1Uto64:
1601 case Iop_1Uto32:
1602 case Iop_1Uto8: {
1603 HReg dst = newVRegI(env);
1604 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1605 addInstr(env, AMD64Instr_Set64(cond,dst));
1606 return dst;
1608 case Iop_1Sto8:
1609 case Iop_1Sto16:
1610 case Iop_1Sto32:
1611 case Iop_1Sto64: {
1612 /* could do better than this, but for now ... */
1613 HReg dst = newVRegI(env);
1614 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1615 addInstr(env, AMD64Instr_Set64(cond,dst));
1616 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1617 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1618 return dst;
1620 case Iop_Ctz64: {
1621 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1622 HReg dst = newVRegI(env);
1623 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1624 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1625 return dst;
1627 case Iop_Clz64: {
1628 /* Count leading zeroes. Do 'bsrq' to establish the index
1629 of the highest set bit, and subtract that value from
1630 63. */
1631 HReg tmp = newVRegI(env);
1632 HReg dst = newVRegI(env);
1633 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1635 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1636 AMD64RMI_Imm(63), dst));
1637 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1638 AMD64RMI_Reg(tmp), dst));
1639 return dst;
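/* CmpwNEZ64 below computes (-x | x) >>s 63: the sign bit of (-x | x)
   is set exactly when x is nonzero, and the arithmetic shift
   broadcasts it, yielding 0 for x == 0 and all-ones otherwise. */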
1642 case Iop_CmpwNEZ64: {
1643 HReg dst = newVRegI(env);
1644 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645 addInstr(env, mk_iMOVsd_RR(src,dst));
1646 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1647 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1648 AMD64RMI_Reg(src), dst));
1649 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1650 return dst;
1653 case Iop_CmpwNEZ32: {
1654 HReg src = newVRegI(env);
1655 HReg dst = newVRegI(env);
1656 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1657 addInstr(env, mk_iMOVsd_RR(pre,src));
1658 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1659 addInstr(env, mk_iMOVsd_RR(src,dst));
1660 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1661 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1662 AMD64RMI_Reg(src), dst));
1663 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1664 return dst;
1667 case Iop_Left8:
1668 case Iop_Left16:
1669 case Iop_Left32:
1670 case Iop_Left64: {
1671 HReg dst = newVRegI(env);
1672 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1673 addInstr(env, mk_iMOVsd_RR(src, dst));
1674 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1675 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1676 return dst;
1679 case Iop_V128to32: {
1680 HReg dst = newVRegI(env);
1681 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1682 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1683 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1684 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1685 return dst;
1688 /* V128{HI}to64 */
1689 case Iop_V128to64: {
1690 HReg dst = newVRegI(env);
1691 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1692 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1693 return dst;
1695 case Iop_V128HIto64: {
1696 HReg dst = newVRegI(env);
1697 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1698 HReg vec2 = newVRegV(env);
1699 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1700 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1701 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1702 return dst;
1705 /* V256to64_{3,2,1,0} */
1706 case Iop_V256to64_0: case Iop_V256to64_1:
1707 case Iop_V256to64_2: case Iop_V256to64_3: {
1708 HReg vHi, vLo, vec;
1709 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1710 /* Do the first part of the selection by deciding which of
1711 the 128-bit registers to look at, and the second part using
1712 the same scheme as for V128{HI}to64 above. */
1713 Bool low64of128 = True;
1714 switch (e->Iex.Unop.op) {
1715 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1716 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1717 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1718 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1719 default: vassert(0);
1721 HReg dst = newVRegI(env);
1722 if (low64of128) {
1723 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1724 } else {
1725 HReg vec2 = newVRegV(env);
1726 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1727 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1728 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1730 return dst;
1733 /* ReinterpF64asI64(e) */
1734 /* Given an IEEE754 double, produce an I64 with the same bit
1735 pattern. */
1736 case Iop_ReinterpF64asI64: {
1737 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1738 HReg dst = newVRegI(env);
1739 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1740 /* paranoia */
1741 set_SSE_rounding_default(env);
1742 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1743 addInstr(env, AMD64Instr_Alu64R(
1744 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1745 return dst;
1748 /* ReinterpF32asI32(e) */
1749 /* Given an IEEE754 single, produce an I64 with the same bit
1750 pattern in the lower half. */
1751 case Iop_ReinterpF32asI32: {
1752 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1753 HReg dst = newVRegI(env);
1754 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1755 /* paranoia */
1756 set_SSE_rounding_default(env);
1757 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1758 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1759 return dst;
1762 case Iop_16to8:
1763 case Iop_32to8:
1764 case Iop_64to8:
1765 case Iop_32to16:
1766 case Iop_64to16:
1767 case Iop_64to32:
1768 /* These are no-ops. */
1769 return iselIntExpr_R(env, e->Iex.Unop.arg);
1771 case Iop_GetMSBs8x8: {
1772 /* Note: the following assumes the helper is of
1773 signature
1774 UInt fn ( ULong ), and is not a regparm fn. */
1776 HReg dst = newVRegI(env);
1777 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1778 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1779 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1780 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1781 1, mk_RetLoc_simple(RLPri_Int) ));
1782 /* MovxLQ is not exactly the right thing here. We just
1783 need to get the bottom 8 bits of RAX into dst, and zero
1784 out everything else. Assuming that the helper returns
1785 a UInt with the top 24 bits zeroed out, it'll do,
1786 though. */
1787 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1788 return dst;
1791 case Iop_GetMSBs8x16: {
1792 /* Note: the following assumes the helper is of signature
1793 UInt fn ( ULong w64hi, ULong w64Lo ),
1794 and is not a regparm fn. */
1795 HReg dst = newVRegI(env);
1796 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1797 HReg rsp = hregAMD64_RSP();
1798 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1799 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1800 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1801 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1802 16, vec, m16_rsp));
1803 /* hi 64 bits into RDI -- the first arg */
1804 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1805 AMD64RMI_Mem(m8_rsp),
1806 hregAMD64_RDI() )); /* 1st arg */
1807 /* lo 64 bits into RSI -- the 2nd arg */
1808 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1809 AMD64RMI_Mem(m16_rsp),
1810 hregAMD64_RSI() )); /* 2nd arg */
1811 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1812 2, mk_RetLoc_simple(RLPri_Int) ));
1813 /* MovxLQ is not exactly the right thing here. We just
1814 need to get the bottom 16 bits of RAX into dst, and zero
1815 out everything else. Assuming that the helper returns
1816 a UInt with the top 16 bits zeroed out, it'll do,
1817 though. */
1818 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1819 return dst;
1822 default:
1823 break;
1826 /* Deal with unary 64-bit SIMD ops. */
1827 HWord fn = 0;
1828 switch (e->Iex.Unop.op) {
1829 case Iop_CmpNEZ32x2:
1830 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1831 case Iop_CmpNEZ16x4:
1832 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1833 case Iop_CmpNEZ8x8:
1834 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1835 default:
1836 fn = (HWord)0; break;
1838 if (fn != (HWord)0) {
1839 /* Note: the following assumes all helpers are of
1840 signature
1841 ULong fn ( ULong ), and they are
1842 not marked as regparm functions. */
1844 HReg dst = newVRegI(env);
1845 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1846 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1847 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1848 mk_RetLoc_simple(RLPri_Int) ));
1849 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1850 return dst;
1853 break;
1856 /* --------- GET --------- */
1857 case Iex_Get: {
1858 if (ty == Ity_I64) {
1859 HReg dst = newVRegI(env);
1860 addInstr(env, AMD64Instr_Alu64R(
1861 Aalu_MOV,
1862 AMD64RMI_Mem(
1863 AMD64AMode_IR(e->Iex.Get.offset,
1864 hregAMD64_RBP())),
1865 dst));
1866 return dst;
1868 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1869 HReg dst = newVRegI(env);
1870 addInstr(env, AMD64Instr_LoadEX(
1871 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1872 False,
1873 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1874 dst));
1875 return dst;
1877 break;
1880 case Iex_GetI: {
1881 AMD64AMode* am
1882 = genGuestArrayOffset(
1883 env, e->Iex.GetI.descr,
1884 e->Iex.GetI.ix, e->Iex.GetI.bias );
1885 HReg dst = newVRegI(env);
1886 if (ty == Ity_I8) {
1887 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1888 return dst;
1890 if (ty == Ity_I64) {
1891 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1892 return dst;
1894 break;
1897 /* --------- CCALL --------- */
1898 case Iex_CCall: {
1899 HReg dst = newVRegI(env);
1900 vassert(ty == e->Iex.CCall.retty);
1902 /* be very restrictive for now. Only 64-bit ints allowed for
1903 args, and 64 or 32 bits for return type. */
1904 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1905 goto irreducible;
1907 /* Marshal args, do the call. */
1908 UInt addToSp = 0;
1909 RetLoc rloc = mk_RetLoc_INVALID();
1910 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1911 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1912 vassert(is_sane_RetLoc(rloc));
1913 vassert(rloc.pri == RLPri_Int);
1914 vassert(addToSp == 0);
1916 /* Move to dst, and zero out the top 32 bits if the result type is
1917 Ity_I32. Probably overkill, but still .. */
1918 if (e->Iex.CCall.retty == Ity_I64)
1919 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1920 else
1921 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1923 return dst;
1926 /* --------- LITERAL --------- */
1927 /* 64/32/16/8-bit literals */
1928 case Iex_Const:
1929 if (ty == Ity_I64) {
1930 HReg r = newVRegI(env);
1931 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1932 return r;
1933 } else {
1934 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1935 HReg r = newVRegI(env);
1936 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1937 return r;
1940 /* --------- MULTIPLEX --------- */
1941 case Iex_ITE: { // VFD
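      /* Evaluate both arms, start with the iftrue value in dst, and
         conditionally overwrite it with the iffalse value; cc ^ 1 is the
         complementary (condition-is-false) condition code. */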
1942 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1943 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1944 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1945 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1946 HReg dst = newVRegI(env);
1947 addInstr(env, mk_iMOVsd_RR(r1,dst));
1948 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1949 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1950 return dst;
1952 break;
1955 /* --------- TERNARY OP --------- */
1956 case Iex_Triop: {
1957 IRTriop *triop = e->Iex.Triop.details;
1958 /* C3210 flags following FPU partial remainder (fprem), both
1959 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1960 if (triop->op == Iop_PRemC3210F64
1961 || triop->op == Iop_PRem1C3210F64) {
1962 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1963 HReg arg1 = iselDblExpr(env, triop->arg2);
1964 HReg arg2 = iselDblExpr(env, triop->arg3);
1965 HReg dst = newVRegI(env);
1966 addInstr(env, AMD64Instr_A87Free(2));
1968 /* one arg -> top of x87 stack */
1969 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1970 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1972 /* other arg -> top of x87 stack */
1973 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1974 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1976 switch (triop->op) {
1977 case Iop_PRemC3210F64:
1978 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1979 break;
1980 case Iop_PRem1C3210F64:
1981 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1982 break;
1983 default:
1984 vassert(0);
1986 /* Ignore the result, and instead make off with the FPU's
1987 C3210 flags (in the status word). */
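         /* Store the x87 status word and mask it with 0x4700, keeping
            only C0..C2 (bits 8..10) and C3 (bit 14). */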
1988 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1989 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1990 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1991 return dst;
1993 break;
1996 default:
1997 break;
1998 } /* switch (e->tag) */
2000 /* We get here if no pattern matched. */
2001 irreducible:
2002 ppIRExpr(e);
2003 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2007 /*---------------------------------------------------------*/
2008 /*--- ISEL: Integer expression auxiliaries ---*/
2009 /*---------------------------------------------------------*/
2011 /* --------------------- AMODEs --------------------- */
2013 /* Return an AMode which computes the value of the specified
2014 expression, possibly also adding insns to the code list as a
2015 result. The expression may only be a 64-bit one. */
2018 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2020 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2021 vassert(sane_AMode(am));
2022 return am;
2025 /* DO NOT CALL THIS DIRECTLY ! */
2026 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2028 MatchInfo mi;
2029 DECLARE_PATTERN(p_complex);
2030 IRType ty = typeOfIRExpr(env->type_env,e);
2031 vassert(ty == Ity_I64);
2033 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2034 /* bind0 bind1 bind2 bind3 */
2035 DEFINE_PATTERN(p_complex,
2036 binop( Iop_Add64,
2037 binop( Iop_Add64,
2038 bind(0),
2039 binop(Iop_Shl64, bind(1), bind(2))
2041 bind(3)
2044 if (matchIRExpr(&mi, p_complex, e)) {
2045 const IRExpr* expr1 = mi.bindee[0];
2046 const IRExpr* expr2 = mi.bindee[1];
2047 const IRExpr* imm8 = mi.bindee[2];
2048 const IRExpr* simm32 = mi.bindee[3];
2049 if (imm8->tag == Iex_Const
2050 && imm8->Iex.Const.con->tag == Ico_U8
2051 && imm8->Iex.Const.con->Ico.U8 < 4
2052 /* imm8 is OK, now check simm32 */
2053 && simm32->tag == Iex_Const
2054 && simm32->Iex.Const.con->tag == Ico_U64
2055 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2056 UInt shift = imm8->Iex.Const.con->Ico.U8;
2057 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2058 HReg r1 = iselIntExpr_R(env, expr1);
2059 HReg r2 = iselIntExpr_R(env, expr2);
2060 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2061 return AMD64AMode_IRRS(offset, r1, r2, shift);
2065 /* Add64(expr1, Shl64(expr2, imm)) */
2066 if (e->tag == Iex_Binop
2067 && e->Iex.Binop.op == Iop_Add64
2068 && e->Iex.Binop.arg2->tag == Iex_Binop
2069 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2070 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2071 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2072 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2073 if (shift == 1 || shift == 2 || shift == 3) {
2074 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2075 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2076 return AMD64AMode_IRRS(0, r1, r2, shift);
2080 /* Add64(expr,i) */
2081 if (e->tag == Iex_Binop
2082 && e->Iex.Binop.op == Iop_Add64
2083 && e->Iex.Binop.arg2->tag == Iex_Const
2084 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2085 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2086 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2087 return AMD64AMode_IR(
2088 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2093 /* Doesn't match anything in particular. Generate it into
2094 a register and use that. */
2096 HReg r1 = iselIntExpr_R(env, e);
2097 return AMD64AMode_IR(0, r1);
2102 /* --------------------- RMIs --------------------- */
2104 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2105 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2107 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2109 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2110 /* sanity checks ... */
2111 switch (rmi->tag) {
2112 case Armi_Imm:
2113 return rmi;
2114 case Armi_Reg:
2115 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2116 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2117 return rmi;
2118 case Armi_Mem:
2119 vassert(sane_AMode(rmi->Armi.Mem.am));
2120 return rmi;
2121 default:
2122 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2126 /* DO NOT CALL THIS DIRECTLY ! */
2127 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2129 IRType ty = typeOfIRExpr(env->type_env,e);
2130 vassert(ty == Ity_I64 || ty == Ity_I32
2131 || ty == Ity_I16 || ty == Ity_I8);
2133 /* special case: immediate 64/32/16/8 */
2134 if (e->tag == Iex_Const) {
2135 switch (e->Iex.Const.con->tag) {
2136 case Ico_U64:
2137 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2138 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2140 break;
2141 case Ico_U32:
2142 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2143 case Ico_U16:
2144 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2145 case Ico_U8:
2146 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2147 default:
2148 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2152 /* special case: 64-bit GET */
2153 if (e->tag == Iex_Get && ty == Ity_I64) {
2154 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2155 hregAMD64_RBP()));
2158 /* special case: 64-bit load from memory */
2159 if (e->tag == Iex_Load && ty == Ity_I64
2160 && e->Iex.Load.end == Iend_LE) {
2161 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2162 return AMD64RMI_Mem(am);
2165 /* default case: calculate into a register and return that */
2167 HReg r = iselIntExpr_R ( env, e );
2168 return AMD64RMI_Reg(r);
2173 /* --------------------- RIs --------------------- */
2175 /* Calculate an expression into an AMD64RI operand. As with
2176 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2177 bits. */
2179 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2181 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2182 /* sanity checks ... */
2183 switch (ri->tag) {
2184 case Ari_Imm:
2185 return ri;
2186 case Ari_Reg:
2187 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2188 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2189 return ri;
2190 default:
2191 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2195 /* DO NOT CALL THIS DIRECTLY ! */
2196 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2198 IRType ty = typeOfIRExpr(env->type_env,e);
2199 vassert(ty == Ity_I64 || ty == Ity_I32
2200 || ty == Ity_I16 || ty == Ity_I8);
2202 /* special case: immediate */
2203 if (e->tag == Iex_Const) {
2204 switch (e->Iex.Const.con->tag) {
2205 case Ico_U64:
2206 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2207 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2209 break;
2210 case Ico_U32:
2211 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2212 case Ico_U16:
2213 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2214 case Ico_U8:
2215 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2216 default:
2217 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2221 /* default case: calculate into a register and return that */
2223 HReg r = iselIntExpr_R ( env, e );
2224 return AMD64RI_Reg(r);
2229 /* --------------------- RMs --------------------- */
2231 /* Similarly, calculate an expression into an AMD64RM operand. As
2232 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2233 bits. */
2235 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2237 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2238 /* sanity checks ... */
2239 switch (rm->tag) {
2240 case Arm_Reg:
2241 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2242 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2243 return rm;
2244 case Arm_Mem:
2245 vassert(sane_AMode(rm->Arm.Mem.am));
2246 return rm;
2247 default:
2248 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2252 /* DO NOT CALL THIS DIRECTLY ! */
2253 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2255 IRType ty = typeOfIRExpr(env->type_env,e);
2256 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2258 /* special case: 64-bit GET */
2259 if (e->tag == Iex_Get && ty == Ity_I64) {
2260 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2261 hregAMD64_RBP()));
2264 /* special case: load from memory */
2266 /* default case: calculate into a register and return that */
2268 HReg r = iselIntExpr_R ( env, e );
2269 return AMD64RM_Reg(r);
2274 /* --------------------- CONDCODE --------------------- */
2276 /* Generate code to evaluate a bit-typed expression, returning the
2277 condition code which would be set if the expression had notionally
2278 evaluated to 1. */
2280 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2282 /* Uh, there's nothing we can sanity check here, unfortunately. */
2283 return iselCondCode_wrk(env,e);
2286 /* DO NOT CALL THIS DIRECTLY ! */
2287 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2289 vassert(e);
2290 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2292 /* var */
2293 if (e->tag == Iex_RdTmp) {
2294 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2295 addInstr(env, AMD64Instr_Test64(1,r64));
2296 return Acc_NZ;
2299 /* Constant 1:Bit */
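   /* There is no always/never condition code as such, so fabricate one:
      xor r,r is guaranteed to set the Z flag, which makes Acc_Z an
      always-true condition and Acc_NZ an always-false one. */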
2300 if (e->tag == Iex_Const) {
2301 HReg r;
2302 vassert(e->Iex.Const.con->tag == Ico_U1);
2303 vassert(e->Iex.Const.con->Ico.U1 == True
2304 || e->Iex.Const.con->Ico.U1 == False);
2305 r = newVRegI(env);
2306 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2307 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2308 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2311 /* Not1(...) */
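   /* amd64 condition codes come in complementary pairs differing only in
      bit 0, so flipping that bit negates the condition. */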
2312 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2313 /* Generate code for the arg, and negate the test condition */
2314 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2317 /* --- patterns rooted at: 64to1 --- */
2319 /* 64to1 */
2320 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2321 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2322 addInstr(env, AMD64Instr_Test64(1,reg));
2323 return Acc_NZ;
2326 /* --- patterns rooted at: 32to1 --- */
2328 /* 32to1 */
2329 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2330 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2331 addInstr(env, AMD64Instr_Test64(1,reg));
2332 return Acc_NZ;
2335 /* --- patterns rooted at: CmpNEZ8 --- */
2337 /* CmpNEZ8(x) */
2338 if (e->tag == Iex_Unop
2339 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2340 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2341 addInstr(env, AMD64Instr_Test64(0xFF,r));
2342 return Acc_NZ;
2345 /* --- patterns rooted at: CmpNEZ16 --- */
2347 /* CmpNEZ16(x) */
2348 if (e->tag == Iex_Unop
2349 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2350 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2351 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2352 return Acc_NZ;
2355 /* --- patterns rooted at: CmpNEZ32 --- */
2357 if (e->tag == Iex_Unop
2358 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2359 IRExpr* arg = e->Iex.Unop.arg;
2360 if (arg->tag == Iex_Binop
2361 && (arg->Iex.Binop.op == Iop_Or32
2362 || arg->Iex.Binop.op == Iop_And32)) {
2363 /* CmpNEZ32(Or32(x,y)) */
2364 /* CmpNEZ32(And32(x,y)) */
2365 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2366 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2367 HReg tmp = newVRegI(env);
2368 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2369 addInstr(env, AMD64Instr_Alu32R(
2370 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2371 rmi1, tmp));
2372 return Acc_NZ;
2374 /* CmpNEZ32(x) */
2375 HReg r1 = iselIntExpr_R(env, arg);
2376 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2377 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2378 return Acc_NZ;
2381 /* --- patterns rooted at: CmpNEZ64 --- */
2383 if (e->tag == Iex_Unop
2384 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2385 IRExpr* arg = e->Iex.Unop.arg;
2386 if (arg->tag == Iex_Binop
2387 && (arg->Iex.Binop.op == Iop_Or64
2388 || arg->Iex.Binop.op == Iop_And64)) {
2389 /* CmpNEZ64(Or64(x,y)) */
2390 /* CmpNEZ64(And64(x,y)) */
2391 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2392 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2393 HReg tmp = newVRegI(env);
2394 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2395 addInstr(env, AMD64Instr_Alu64R(
2396 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2397 rmi1, tmp));
2398 return Acc_NZ;
2400 /* CmpNEZ64(x) */
2401 HReg r1 = iselIntExpr_R(env, arg);
2402 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2403 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2404 return Acc_NZ;
2407 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2409 /* CmpEQ8 / CmpNE8 */
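   /* A comparison against zero only needs a TEST of the low byte.
      Otherwise XOR the operands and mask to 8 bits, so that Z ends up set
      iff the low bytes were equal. */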
2410 if (e->tag == Iex_Binop
2411 && (e->Iex.Binop.op == Iop_CmpEQ8
2412 || e->Iex.Binop.op == Iop_CmpNE8
2413 || e->Iex.Binop.op == Iop_CasCmpEQ8
2414 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2415 if (isZeroU8(e->Iex.Binop.arg2)) {
2416 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2417 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2418 switch (e->Iex.Binop.op) {
2419 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2420 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2421 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2423 } else {
2424 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2425 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2426 HReg r = newVRegI(env);
2427 addInstr(env, mk_iMOVsd_RR(r1,r));
2428 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2429 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2430 switch (e->Iex.Binop.op) {
2431 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2432 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2433 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2438 /* CmpEQ16 / CmpNE16 */
2439 if (e->tag == Iex_Binop
2440 && (e->Iex.Binop.op == Iop_CmpEQ16
2441 || e->Iex.Binop.op == Iop_CmpNE16
2442 || e->Iex.Binop.op == Iop_CasCmpEQ16
2443 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2444 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2445 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2446 HReg r = newVRegI(env);
2447 addInstr(env, mk_iMOVsd_RR(r1,r));
2448 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2449 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2450 switch (e->Iex.Binop.op) {
2451 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2452 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2453 default: vpanic("iselCondCode(amd64): CmpXX16");
2457 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2458 Saves a "movq %rax, %tmp" compared to the default route. */
2459 if (e->tag == Iex_Binop
2460 && e->Iex.Binop.op == Iop_CmpNE64
2461 && e->Iex.Binop.arg1->tag == Iex_CCall
2462 && e->Iex.Binop.arg2->tag == Iex_Const) {
2463 IRExpr* cal = e->Iex.Binop.arg1;
2464 IRExpr* con = e->Iex.Binop.arg2;
2465 HReg tmp = newVRegI(env);
2466 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2467 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2468 vassert(con->Iex.Const.con->tag == Ico_U64);
2469 /* Marshal args, do the call. */
2470 UInt addToSp = 0;
2471 RetLoc rloc = mk_RetLoc_INVALID();
2472 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2473 cal->Iex.CCall.cee,
2474 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2475 vassert(is_sane_RetLoc(rloc));
2476 vassert(rloc.pri == RLPri_Int);
2477 vassert(addToSp == 0);
2478 /* */
2479 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2480 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2481 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2482 return Acc_NZ;
2485 /* Cmp*64*(x,y) */
2486 if (e->tag == Iex_Binop
2487 && (e->Iex.Binop.op == Iop_CmpEQ64
2488 || e->Iex.Binop.op == Iop_CmpNE64
2489 || e->Iex.Binop.op == Iop_CmpLT64S
2490 || e->Iex.Binop.op == Iop_CmpLT64U
2491 || e->Iex.Binop.op == Iop_CmpLE64S
2492 || e->Iex.Binop.op == Iop_CmpLE64U
2493 || e->Iex.Binop.op == Iop_CasCmpEQ64
2494 || e->Iex.Binop.op == Iop_CasCmpNE64
2495 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2496 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2497 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2498 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2499 switch (e->Iex.Binop.op) {
2500 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2501 case Iop_CmpNE64:
2502 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2503 case Iop_CmpLT64S: return Acc_L;
2504 case Iop_CmpLT64U: return Acc_B;
2505 case Iop_CmpLE64S: return Acc_LE;
2506 case Iop_CmpLE64U: return Acc_BE;
2507 default: vpanic("iselCondCode(amd64): CmpXX64");
2511 /* Cmp*32*(x,y) */
2512 if (e->tag == Iex_Binop
2513 && (e->Iex.Binop.op == Iop_CmpEQ32
2514 || e->Iex.Binop.op == Iop_CmpNE32
2515 || e->Iex.Binop.op == Iop_CmpLT32S
2516 || e->Iex.Binop.op == Iop_CmpLT32U
2517 || e->Iex.Binop.op == Iop_CmpLE32S
2518 || e->Iex.Binop.op == Iop_CmpLE32U
2519 || e->Iex.Binop.op == Iop_CasCmpEQ32
2520 || e->Iex.Binop.op == Iop_CasCmpNE32
2521 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2522 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2523 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2524 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2525 switch (e->Iex.Binop.op) {
2526 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2527 case Iop_CmpNE32:
2528 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2529 case Iop_CmpLT32S: return Acc_L;
2530 case Iop_CmpLT32U: return Acc_B;
2531 case Iop_CmpLE32S: return Acc_LE;
2532 case Iop_CmpLE32U: return Acc_BE;
2533 default: vpanic("iselCondCode(amd64): CmpXX32");
2537 /* And1(x,y), Or1(x,y) */
2538 /* FIXME: We could (and probably should) do a lot better here. If both args
2539 are in temps already then we can just emit a reg-reg And/Or directly,
2540 followed by the final Test. */
2541 if (e->tag == Iex_Binop
2542 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2543 // We could probably be cleverer about this. In the meantime ..
2544 HReg x_as_64 = newVRegI(env);
2545 AMD64CondCode cc_x = iselCondCode(env, e->Iex.Binop.arg1);
2546 addInstr(env, AMD64Instr_Set64(cc_x, x_as_64));
2547 HReg y_as_64 = newVRegI(env);
2548 AMD64CondCode cc_y = iselCondCode(env, e->Iex.Binop.arg2);
2549 addInstr(env, AMD64Instr_Set64(cc_y, y_as_64));
2550 AMD64AluOp aop = e->Iex.Binop.op == Iop_And1 ? Aalu_AND : Aalu_OR;
2551 addInstr(env, AMD64Instr_Alu64R(aop, AMD64RMI_Reg(x_as_64), y_as_64));
2552 addInstr(env, AMD64Instr_Test64(1, y_as_64));
2553 return Acc_NZ;
2556 ppIRExpr(e);
2557 vpanic("iselCondCode(amd64)");
2561 /*---------------------------------------------------------*/
2562 /*--- ISEL: Integer expressions (128 bit) ---*/
2563 /*---------------------------------------------------------*/
2565 /* Compute a 128-bit value into a register pair, which is returned as
2566 the first two parameters. As with iselIntExpr_R, these may be
2567 either real or virtual regs; in any case they must not be changed
2568 by subsequent code emitted by the caller. */
2570 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2571 ISelEnv* env, const IRExpr* e )
2573 iselInt128Expr_wrk(rHi, rLo, env, e);
2574 # if 0
2575 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2576 # endif
2577 vassert(hregClass(*rHi) == HRcInt64);
2578 vassert(hregIsVirtual(*rHi));
2579 vassert(hregClass(*rLo) == HRcInt64);
2580 vassert(hregIsVirtual(*rLo));
2583 /* DO NOT CALL THIS DIRECTLY ! */
2584 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2585 ISelEnv* env, const IRExpr* e )
2587 vassert(e);
2588 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2590 /* read 128-bit IRTemp */
2591 if (e->tag == Iex_RdTmp) {
2592 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2593 return;
2596 /* --------- BINARY ops --------- */
2597 if (e->tag == Iex_Binop) {
2598 switch (e->Iex.Binop.op) {
2599 /* 64 x 64 -> 128 multiply */
2600 case Iop_MullU64:
2601 case Iop_MullS64: {
2602 /* get one operand into %rax, and the other into a R/M.
2603 Need to make an educated guess about which operand is better
2604 placed where. */
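            /* mulq/imulq multiply %rax by the R/M operand and leave the
               128-bit product in %rdx:%rax. */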
2605 HReg tLo = newVRegI(env);
2606 HReg tHi = newVRegI(env);
2607 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2608 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2609 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2610 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2611 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2612 /* Result is now in RDX:RAX. Tell the caller. */
2613 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2614 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2615 *rHi = tHi;
2616 *rLo = tLo;
2617 return;
2620 /* 128 x 64 -> (64(rem),64(div)) division */
2621 case Iop_DivModU128to64:
2622 case Iop_DivModS128to64: {
2623 /* Get the 128-bit operand into rdx:rax, and the other into
2624 any old R/M. */
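            /* divq/idivq divide the 128-bit value in %rdx:%rax by the R/M
               operand, leaving the quotient in %rax and the remainder in
               %rdx. */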
2625 HReg sHi, sLo;
2626 HReg tLo = newVRegI(env);
2627 HReg tHi = newVRegI(env);
2628 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2629 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2630 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2631 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2632 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2633 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2634 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2635 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2636 *rHi = tHi;
2637 *rLo = tLo;
2638 return;
2641 /* 64HLto128(e1,e2) */
2642 case Iop_64HLto128:
2643 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2644 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2645 return;
2647 default:
2648 break;
2650 } /* if (e->tag == Iex_Binop) */
2652 ppIRExpr(e);
2653 vpanic("iselInt128Expr");
2657 /*---------------------------------------------------------*/
2658 /*--- ISEL: Floating point expressions (32 bit) ---*/
2659 /*---------------------------------------------------------*/
2661 /* Nothing interesting here; really just wrappers for
2662 64-bit stuff. */
2664 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2666 HReg r = iselFltExpr_wrk( env, e );
2667 # if 0
2668 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2669 # endif
2670 vassert(hregClass(r) == HRcVec128);
2671 vassert(hregIsVirtual(r));
2672 return r;
2675 /* DO NOT CALL THIS DIRECTLY */
2676 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2678 IRType ty = typeOfIRExpr(env->type_env,e);
2679 vassert(ty == Ity_F32);
2681 if (e->tag == Iex_RdTmp) {
2682 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2685 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2686 AMD64AMode* am;
2687 HReg res = newVRegV(env);
2688 vassert(e->Iex.Load.ty == Ity_F32);
2689 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2690 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2691 return res;
2694 if (e->tag == Iex_Binop
2695 && e->Iex.Binop.op == Iop_F64toF32) {
2696 /* Although the result is still held in a standard SSE register,
2697 we need to round it to reflect the loss of accuracy/range
2698 entailed in casting it to a 32-bit float. */
2699 HReg dst = newVRegV(env);
2700 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2701 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2702 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2703 set_SSE_rounding_default( env );
2704 return dst;
2707 if (e->tag == Iex_Get) {
2708 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2709 hregAMD64_RBP() );
2710 HReg res = newVRegV(env);
2711 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2712 return res;
2715 if (e->tag == Iex_Unop
2716 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2717 /* Given an I32, produce an IEEE754 float with the same bit
2718 pattern. */
2719 HReg dst = newVRegV(env);
2720 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2721 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2722 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2723 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2724 return dst;
2727 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2728 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2729 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2730 HReg dst = newVRegV(env);
2732 /* arg now holds the value to be rounded. The first thing to do
2733 is set the FPU's rounding mode accordingly. */
2735 /* Set host x87 rounding mode */
2736 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2738 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2739 addInstr(env, AMD64Instr_A87Free(1));
2740 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2741 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2742 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2743 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2745 /* Restore default x87 rounding. */
2746 set_FPU_rounding_default( env );
2748 return dst;
2751 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2752 /* Sigh ... very rough code. Could do much better. */
2753 /* Get the 128-bit literal 00---0 10---0 into a register
2754 and xor it with the value to be negated. */
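      /* Pushing 0 and then (1 << 31) builds, at (%rsp), a 128-bit constant
         with only bit 31 set; XORing it in flips just the sign bit of the
         float in lane 0. */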
2755 HReg r1 = newVRegI(env);
2756 HReg dst = newVRegV(env);
2757 HReg tmp = newVRegV(env);
2758 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2759 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2760 addInstr(env, mk_vMOVsd_RR(src,tmp));
2761 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2762 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2763 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2764 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2765 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2766 add_to_rsp(env, 16);
2767 return dst;
2770 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2771 IRQop *qop = e->Iex.Qop.details;
2772 HReg dst = newVRegV(env);
2773 HReg argX = iselFltExpr(env, qop->arg2);
2774 HReg argY = iselFltExpr(env, qop->arg3);
2775 HReg argZ = iselFltExpr(env, qop->arg4);
2776 /* XXXROUNDINGFIXME */
2777 /* set roundingmode here */
2778 /* subq $16, %rsp -- make a space*/
2779 sub_from_rsp(env, 16);
2780 /* Prepare 4 arg regs:
2781 leaq 0(%rsp), %rdi
2782 leaq 4(%rsp), %rsi
2783 leaq 8(%rsp), %rdx
2784 leaq 12(%rsp), %rcx */
2786 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2787 hregAMD64_RDI()));
2788 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2789 hregAMD64_RSI()));
2790 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2791 hregAMD64_RDX()));
2792 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2793 hregAMD64_RCX()));
2794 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2795 movss %argX, 0(%rsi)
2796 movss %argY, 0(%rdx)
2797 movss %argZ, 0(%rcx) */
2799 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2800 AMD64AMode_IR(0, hregAMD64_RSI())));
2801 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2802 AMD64AMode_IR(0, hregAMD64_RDX())));
2803 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2804 AMD64AMode_IR(0, hregAMD64_RCX())));
2805 /* call the helper */
2806 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2807 (ULong)(HWord)h_generic_calc_MAddF32,
2808 4, mk_RetLoc_simple(RLPri_None) ));
2809 /* fetch the result from memory at 0(%rsp), which is where the
2810 helper was asked to write it. */
2811 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2812 AMD64AMode_IR(0, hregAMD64_RSP())));
2813 /* and finally, clear the space */
2814 add_to_rsp(env, 16);
2815 return dst;
2818 ppIRExpr(e);
2819 vpanic("iselFltExpr_wrk");
2823 /*---------------------------------------------------------*/
2824 /*--- ISEL: Floating point expressions (64 bit) ---*/
2825 /*---------------------------------------------------------*/
2827 /* Compute a 64-bit floating point value into the lower half of an xmm
2828 register, the identity of which is returned. As with
2829 iselIntExpr_R, the returned reg will be virtual, and it must not be
2830 changed by subsequent code emitted by the caller. */
2833 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2835 Type S (1 bit) E (11 bits) F (52 bits)
2836 ---- --------- ----------- -----------
2837 signalling NaN u 2047 (max) .0uuuuu---u
2838 (with at least
2839 one 1 bit)
2840 quiet NaN u 2047 (max) .1uuuuu---u
2842 negative infinity 1 2047 (max) .000000---0
2844 positive infinity 0 2047 (max) .000000---0
2846 negative zero 1 0 .000000---0
2848 positive zero 0 0 .000000---0 */
2851 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2853 HReg r = iselDblExpr_wrk( env, e );
2854 # if 0
2855 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2856 # endif
2857 vassert(hregClass(r) == HRcVec128);
2858 vassert(hregIsVirtual(r));
2859 return r;
2862 /* DO NOT CALL THIS DIRECTLY */
2863 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2865 IRType ty = typeOfIRExpr(env->type_env,e);
2866 vassert(e);
2867 vassert(ty == Ity_F64);
2869 if (e->tag == Iex_RdTmp) {
2870 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2873 if (e->tag == Iex_Const) {
2874 union { ULong u64; Double f64; } u;
2875 HReg res = newVRegV(env);
2876 HReg tmp = newVRegI(env);
2877 vassert(sizeof(u) == 8);
2878 vassert(sizeof(u.u64) == 8);
2879 vassert(sizeof(u.f64) == 8);
2881 if (e->Iex.Const.con->tag == Ico_F64) {
2882 u.f64 = e->Iex.Const.con->Ico.F64;
2884 else if (e->Iex.Const.con->tag == Ico_F64i) {
2885 u.u64 = e->Iex.Const.con->Ico.F64i;
2887 else
2888 vpanic("iselDblExpr(amd64): const");
2890 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2891 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2892 addInstr(env, AMD64Instr_SseLdSt(
2893 True/*load*/, 8, res,
2894 AMD64AMode_IR(0, hregAMD64_RSP())
2896 add_to_rsp(env, 8);
2897 return res;
2900 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2901 AMD64AMode* am;
2902 HReg res = newVRegV(env);
2903 vassert(e->Iex.Load.ty == Ity_F64);
2904 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2905 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2906 return res;
2909 if (e->tag == Iex_Get) {
2910 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2911 hregAMD64_RBP() );
2912 HReg res = newVRegV(env);
2913 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2914 return res;
2917 if (e->tag == Iex_GetI) {
2918 AMD64AMode* am
2919 = genGuestArrayOffset(
2920 env, e->Iex.GetI.descr,
2921 e->Iex.GetI.ix, e->Iex.GetI.bias );
2922 HReg res = newVRegV(env);
2923 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2924 return res;
2927 if (e->tag == Iex_Triop) {
2928 IRTriop *triop = e->Iex.Triop.details;
2929 AMD64SseOp op = Asse_INVALID;
2930 switch (triop->op) {
2931 case Iop_AddF64: op = Asse_ADDF; break;
2932 case Iop_SubF64: op = Asse_SUBF; break;
2933 case Iop_MulF64: op = Asse_MULF; break;
2934 case Iop_DivF64: op = Asse_DIVF; break;
2935 default: break;
2937 if (op != Asse_INVALID) {
2938 HReg dst = newVRegV(env);
2939 HReg argL = iselDblExpr(env, triop->arg2);
2940 HReg argR = iselDblExpr(env, triop->arg3);
2941 addInstr(env, mk_vMOVsd_RR(argL, dst));
2942 /* XXXROUNDINGFIXME */
2943 /* set roundingmode here */
2944 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2945 return dst;
2949 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2950 IRQop *qop = e->Iex.Qop.details;
2951 HReg dst = newVRegV(env);
2952 HReg argX = iselDblExpr(env, qop->arg2);
2953 HReg argY = iselDblExpr(env, qop->arg3);
2954 HReg argZ = iselDblExpr(env, qop->arg4);
2955 /* XXXROUNDINGFIXME */
2956 /* set roundingmode here */
2957 /* subq $32, %rsp -- make a space*/
2958 sub_from_rsp(env, 32);
2959 /* Prepare 4 arg regs:
2960 leaq 0(%rsp), %rdi
2961 leaq 8(%rsp), %rsi
2962 leaq 16(%rsp), %rdx
2963 leaq 24(%rsp), %rcx */
2965 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2966 hregAMD64_RDI()));
2967 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2968 hregAMD64_RSI()));
2969 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2970 hregAMD64_RDX()));
2971 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2972 hregAMD64_RCX()));
2973 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2974 movsd %argX, 0(%rsi)
2975 movsd %argY, 0(%rdx)
2976 movsd %argZ, 0(%rcx) */
2978 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2979 AMD64AMode_IR(0, hregAMD64_RSI())));
2980 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2981 AMD64AMode_IR(0, hregAMD64_RDX())));
2982 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2983 AMD64AMode_IR(0, hregAMD64_RCX())));
2984 /* call the helper */
2985 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2986 (ULong)(HWord)h_generic_calc_MAddF64,
2987 4, mk_RetLoc_simple(RLPri_None) ));
2988 /* fetch the result from memory at 0(%rsp), which is where the
2989 helper was asked to write it. */
2990 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2991 AMD64AMode_IR(0, hregAMD64_RSP())));
2992 /* and finally, clear the space */
2993 add_to_rsp(env, 32);
2994 return dst;
2997 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2998 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2999 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3000 HReg dst = newVRegV(env);
3002 /* arg now holds the value to be rounded. The first thing to do
3003 is set the FPU's rounding mode accordingly. */
3005 /* Set host x87 rounding mode */
3006 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3008 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3009 addInstr(env, AMD64Instr_A87Free(1));
3010 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3011 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
3012 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3013 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3015 /* Restore default x87 rounding. */
3016 set_FPU_rounding_default( env );
3018 return dst;
3021 IRTriop *triop = e->Iex.Triop.details;
3022 if (e->tag == Iex_Triop
3023 && (triop->op == Iop_ScaleF64
3024 || triop->op == Iop_AtanF64
3025 || triop->op == Iop_Yl2xF64
3026 || triop->op == Iop_Yl2xp1F64
3027 || triop->op == Iop_PRemF64
3028 || triop->op == Iop_PRem1F64)
3030 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3031 HReg arg1 = iselDblExpr(env, triop->arg2);
3032 HReg arg2 = iselDblExpr(env, triop->arg3);
3033 HReg dst = newVRegV(env);
3034 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3035 || triop->op == Iop_PRemF64
3036 || triop->op == Iop_PRem1F64);
3037 addInstr(env, AMD64Instr_A87Free(2));
3039 /* one arg -> top of x87 stack */
3040 addInstr(env, AMD64Instr_SseLdSt(
3041 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3042 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3044 /* other arg -> top of x87 stack */
3045 addInstr(env, AMD64Instr_SseLdSt(
3046 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3047 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3049 /* do it */
3050 /* XXXROUNDINGFIXME */
3051 /* set roundingmode here */
3052 switch (triop->op) {
3053 case Iop_ScaleF64:
3054 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3055 break;
3056 case Iop_AtanF64:
3057 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3058 break;
3059 case Iop_Yl2xF64:
3060 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3061 break;
3062 case Iop_Yl2xp1F64:
3063 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3064 break;
3065 case Iop_PRemF64:
3066 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3067 break;
3068 case Iop_PRem1F64:
3069 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3070 break;
3071 default:
3072 vassert(0);
3075 /* save result */
3076 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3077 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3078 return dst;
3081 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3082 HReg dst = newVRegV(env);
3083 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3084 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3085 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3086 set_SSE_rounding_default( env );
3087 return dst;
3090 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3091 HReg dst = newVRegV(env);
3092 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3093 set_SSE_rounding_default( env );
3094 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3095 return dst;
3098 if (e->tag == Iex_Unop
3099 && (e->Iex.Unop.op == Iop_NegF64
3100 || e->Iex.Unop.op == Iop_AbsF64)) {
3101 /* Sigh ... very rough code. Could do much better. */
3102 /* Get the 128-bit literal 00---0 10---0 into a register
3103 and xor/nand it with the value to be negated. */
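      /* The 128-bit constant built here has only bit 63 set.  XOR with it
         flips the sign bit (negation); ANDN computes ~constant & value and
         so clears the sign bit (absolute value). */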
3104 HReg r1 = newVRegI(env);
3105 HReg dst = newVRegV(env);
3106 HReg tmp = newVRegV(env);
3107 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3108 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3109 addInstr(env, mk_vMOVsd_RR(src,tmp));
3110 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3111 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3112 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3113 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3115 if (e->Iex.Unop.op == Iop_NegF64)
3116 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3117 else
3118 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3120 add_to_rsp(env, 16);
3121 return dst;
3124 if (e->tag == Iex_Binop) {
3125 A87FpOp fpop = Afp_INVALID;
3126 switch (e->Iex.Binop.op) {
3127 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3128 case Iop_SinF64: fpop = Afp_SIN; break;
3129 case Iop_CosF64: fpop = Afp_COS; break;
3130 case Iop_TanF64: fpop = Afp_TAN; break;
3131 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3132 default: break;
3134 if (fpop != Afp_INVALID) {
3135 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3136 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3137 HReg dst = newVRegV(env);
3138 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3139 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3140 addInstr(env, AMD64Instr_A87Free(nNeeded));
3141 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3142 /* XXXROUNDINGFIXME */
3143 /* set roundingmode here */
3144 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3145 codes. I don't think that matters, since this insn
3146 selector never generates such an instruction intervening
3147 between a flag-setting instruction and a flag-using
3148 instruction. */
3149 addInstr(env, AMD64Instr_A87FpOp(fpop));
3150 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3151 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3152 return dst;
3156 if (e->tag == Iex_Unop) {
3157 switch (e->Iex.Unop.op) {
3158 //.. case Iop_I32toF64: {
3159 //.. HReg dst = newVRegF(env);
3160 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3161 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3162 //.. set_FPU_rounding_default(env);
3163 //.. addInstr(env, X86Instr_FpLdStI(
3164 //.. True/*load*/, 4, dst,
3165 //.. X86AMode_IR(0, hregX86_ESP())));
3166 //.. add_to_esp(env, 4);
3167 //.. return dst;
3168 //.. }
3169 case Iop_ReinterpI64asF64: {
3170 /* Given an I64, produce an IEEE754 double with the same
3171 bit pattern. */
3172 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3173 HReg dst = newVRegV(env);
3174 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3175 /* paranoia */
3176 set_SSE_rounding_default(env);
3177 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3178 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3179 return dst;
3181 case Iop_F32toF64: {
3182 HReg f32;
3183 HReg f64 = newVRegV(env);
3184 /* this shouldn't be necessary, but be paranoid ... */
3185 set_SSE_rounding_default(env);
3186 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3187 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3188 return f64;
3190 default:
3191 break;
3195 /* --------- MULTIPLEX --------- */
3196 if (e->tag == Iex_ITE) { // VFD
3197 HReg r1, r0, dst;
3198 vassert(ty == Ity_F64);
3199 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3200 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3201 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3202 dst = newVRegV(env);
3203 addInstr(env, mk_vMOVsd_RR(r1,dst));
3204 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3205 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3206 return dst;
3209 ppIRExpr(e);
3210 vpanic("iselDblExpr_wrk");
3214 /*---------------------------------------------------------*/
3215 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3216 /*---------------------------------------------------------*/
3218 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3220 HReg r = iselVecExpr_wrk( env, e );
3221 # if 0
3222 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3223 # endif
3224 vassert(hregClass(r) == HRcVec128);
3225 vassert(hregIsVirtual(r));
3226 return r;
3230 /* DO NOT CALL THIS DIRECTLY */
3231 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3233 HWord fn = 0; /* address of helper fn, if required */
3234 Bool arg1isEReg = False;
3235 AMD64SseOp op = Asse_INVALID;
3236 vassert(e);
3237 IRType ty = typeOfIRExpr(env->type_env, e);
3238 vassert(ty == Ity_V128);
3239 UInt laneBits = 0;
3241 if (e->tag == Iex_RdTmp) {
3242 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3245 if (e->tag == Iex_Get) {
3246 HReg dst = newVRegV(env);
3247 addInstr(env, AMD64Instr_SseLdSt(
3248 True/*load*/,
3250 dst,
3251 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3254 return dst;
3257 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3258 HReg dst = newVRegV(env);
3259 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3260 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3261 return dst;
3264 if (e->tag == Iex_Const) {
3265 HReg dst = newVRegV(env);
3266 vassert(e->Iex.Const.con->tag == Ico_V128);
3267 switch (e->Iex.Const.con->Ico.V128) {
3268 case 0x0000:
3269 dst = generate_zeroes_V128(env);
3270 break;
3271 case 0xFFFF:
3272 dst = generate_ones_V128(env);
3273 break;
3274 default: {
3275 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3276 /* do push_uimm64 twice, first time for the high-order half. */
3277 push_uimm64(env, bitmask8_to_bytemask64(
3278 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3280 push_uimm64(env, bitmask8_to_bytemask64(
3281 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3283 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3284 add_to_rsp(env, 16);
3285 break;
3288 return dst;
3291 if (e->tag == Iex_Unop) {
3292 switch (e->Iex.Unop.op) {
3294 case Iop_NotV128: {
3295 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3296 return do_sse_NotV128(env, arg);
3299 case Iop_CmpNEZ64x2: {
3300 /* We can use SSE2 instructions for this. */
3301 /* Ideally, we want to do a 64Ix2 comparison against zero of
3302 the operand. Problem is no such insn exists. Solution
3303 therefore is to do a 32Ix4 comparison instead, and bitwise-
3304 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3305 let the not'd result of this initial comparison be a:b:c:d.
3306 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3307 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3308 giving the required result.
3310 The required selection sequence is 2,3,0,1, which
3311 according to Intel's documentation means the pshufd
3312 literal value is 0xB1, that is,
3313 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */
3315 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3316 HReg tmp = generate_zeroes_V128(env);
3317 HReg dst = newVRegV(env);
3318 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3319 tmp = do_sse_NotV128(env, tmp);
3320 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3321 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3322 return dst;
3325 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3326 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3327 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3328 do_CmpNEZ_vector:
3330 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3331 HReg tmp = newVRegV(env);
3332 HReg zero = generate_zeroes_V128(env);
3333 HReg dst;
3334 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3335 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3336 dst = do_sse_NotV128(env, tmp);
3337 return dst;
3340 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3341 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3342 do_32Fx4_unary:
3344 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3345 HReg dst = newVRegV(env);
3346 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3347 return dst;
3350 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3351 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3352 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3353 do_32F0x4_unary:
3355 /* A bit subtle. We have to copy the arg to the result
3356 register first, because actually doing the SSE scalar insn
3357 leaves the upper 3/4 of the destination register
3358 unchanged. Whereas the required semantics of these
3359 primops is that the upper 3/4 is simply copied in from the
3360 argument. */
3361 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3362 HReg dst = newVRegV(env);
3363 addInstr(env, mk_vMOVsd_RR(arg, dst));
3364 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3365 return dst;
3368 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3369 do_64F0x2_unary:
3371 /* A bit subtle. We have to copy the arg to the result
3372 register first, because actually doing the SSE scalar insn
3373 leaves the upper half of the destination register
3374 unchanged. Whereas the required semantics of these
3375 primops is that the upper half is simply copied in from the
3376 argument. */
3377 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3378 HReg dst = newVRegV(env);
3379 addInstr(env, mk_vMOVsd_RR(arg, dst));
3380 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3381 return dst;
3384 case Iop_32UtoV128: {
3385 // FIXME maybe just use MOVQ here?
3386 HReg dst = newVRegV(env);
3387 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3388 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3389 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3390 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3391 return dst;
3394 case Iop_64UtoV128: {
3395 // FIXME maybe just use MOVQ here?
3396 HReg dst = newVRegV(env);
3397 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3398 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3399 addInstr(env, AMD64Instr_Push(rmi));
3400 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3401 add_to_rsp(env, 8);
3402 return dst;
3405 case Iop_V256toV128_0:
3406 case Iop_V256toV128_1: {
3407 HReg vHi, vLo;
3408 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3409 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3412 case Iop_F16toF32x4: {
3413 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3414 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3415 HReg dst = newVRegV(env);
3416 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3417 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3418 return dst;
3420 break;
3423 default:
3424 break;
3425 } /* switch (e->Iex.Unop.op) */
3426 } /* if (e->tag == Iex_Unop) */
3428 if (e->tag == Iex_Binop) {
3429 switch (e->Iex.Binop.op) {
3431 case Iop_Sqrt64Fx2:
3432 case Iop_Sqrt32Fx4: {
3433 /* :: (rmode, vec) -> vec */
3434 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3435 HReg dst = newVRegV(env);
3436 /* XXXROUNDINGFIXME */
3437 /* set roundingmode here */
3438 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3439 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3440 (Asse_SQRTF, arg, dst));
3441 return dst;
3444 /* FIXME: could we generate MOVQ here? */
3445 case Iop_SetV128lo64: {
3446 HReg dst = newVRegV(env);
3447 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3448 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3449 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3450 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3451 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3452 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3453 return dst;
3456 /* FIXME: could we generate MOVD here? */
3457 case Iop_SetV128lo32: {
3458 HReg dst = newVRegV(env);
3459 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3460 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3461 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3462 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3463 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3464 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3465 return dst;
3468 case Iop_64HLtoV128: {
3469 const IRExpr* arg1 = e->Iex.Binop.arg1;
3470 const IRExpr* arg2 = e->Iex.Binop.arg2;
3471 HReg dst = newVRegV(env);
3472 HReg tmp = newVRegV(env);
3473 HReg qHi = iselIntExpr_R(env, arg1);
3474 // If the args are trivially the same (tmp or const), use the same
3475 // source register for both, and only one movq since those are
3476 // (relatively) expensive.
3477 if (areAtomsAndEqual(arg1, arg2)) {
3478 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3479 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3480 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3481 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3482 } else {
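               /* General case: move qHi into an XMM register, shift it into
                  the upper 64 bits, then OR in qLo (staged through tmp) to
                  form the value qHi:qLo. */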
3483 HReg qLo = iselIntExpr_R(env, arg2);
3484 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3485 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3486 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3487 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3489 return dst;
3492 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3493 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3494 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3495 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3496 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3497 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3498 do_32Fx4:
3500 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3501 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3502 HReg dst = newVRegV(env);
3503 addInstr(env, mk_vMOVsd_RR(argL, dst));
3504 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3505 return dst;
3508 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3509 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3510 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3511 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3512 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3513 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3514 do_64Fx2:
3516 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3517 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3518 HReg dst = newVRegV(env);
3519 addInstr(env, mk_vMOVsd_RR(argL, dst));
3520 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3521 return dst;
3524 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3525 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3526 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3527 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3528 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3529 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3530 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3531 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3532 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3533 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3534 do_32F0x4: {
3535 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3536 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3537 HReg dst = newVRegV(env);
3538 addInstr(env, mk_vMOVsd_RR(argL, dst));
3539 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3540 return dst;
3543 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3544 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3545 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3546 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3547 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3548 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3549 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3550 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3551 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3552 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3553 do_64F0x2: {
3554 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3555 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3556 HReg dst = newVRegV(env);
3557 addInstr(env, mk_vMOVsd_RR(argL, dst));
3558 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3559 return dst;
3562 case Iop_PermOrZero8x16:
3563 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3564 op = Asse_PSHUFB;
3565 goto do_SseReRg;
3567 // Otherwise we'll have to generate a call to
3568 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3569 // host which doesn't have SSSE3, in which case we don't expect this
3570 // IROp to enter the compilation pipeline in the first place.
3571 break;
3573 case Iop_PwExtUSMulQAdd8x16:
3574 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3575 op = Asse_PMADDUBSW;
3576 goto do_SseReRg;
3578 break;
3580 case Iop_QNarrowBin32Sto16Sx8:
3581 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3582 case Iop_QNarrowBin16Sto8Sx16:
3583 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3584 case Iop_QNarrowBin16Sto8Ux16:
3585 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3587 case Iop_InterleaveHI8x16:
3588 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3589 case Iop_InterleaveHI16x8:
3590 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3591 case Iop_InterleaveHI32x4:
3592 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3593 case Iop_InterleaveHI64x2:
3594 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3596 case Iop_InterleaveLO8x16:
3597 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3598 case Iop_InterleaveLO16x8:
3599 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3600 case Iop_InterleaveLO32x4:
3601 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3602 case Iop_InterleaveLO64x2:
3603 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3605 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3606 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3607 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3608 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3609 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3610 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3611 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3612 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3613 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3614 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3615 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3616 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3617 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3618 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3619 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3620 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3621 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3622 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3623 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3624 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3625 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3626 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3627 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3628 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3629 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3630 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3631 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3632 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3633 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3634 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3635 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3636 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3637 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3638 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3639 do_SseReRg: {
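         /* arg1isEReg is set for the non-commutative ops above (packs and
            interleaves): for those, arg1 must end up as the instruction's
            source (E) operand, so arg2 is the one copied into dst. */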
3640 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3641 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3642 HReg dst = newVRegV(env);
3643 if (arg1isEReg) {
3644 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3645 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3646 } else {
3647 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3648 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3650 return dst;
3653 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3654 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3655 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3656 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3657 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3658 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3659 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3660 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3661 do_SseShift: {
3662 HReg dst = newVRegV(env);
3663 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3664 /* If it's a shift by an in-range immediate, generate a single
3665 instruction. */
3666 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3667 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3668 vassert(c->tag == Ico_U8);
3669 UInt shift = c->Ico.U8;
3670 if (shift < laneBits) {
3671 addInstr(env, mk_vMOVsd_RR(greg, dst));
3672 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3673 return dst;
3676 /* Otherwise we have to do it the longwinded way. */
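            /* The SSE shift-by-register forms take their count from the low
               64 bits of an XMM register, so materialise the count as a
               128-bit value on the stack and load it into ereg. */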
3677 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3678 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3679 HReg ereg = newVRegV(env);
3680 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3681 addInstr(env, AMD64Instr_Push(rmi));
3682 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3683 addInstr(env, mk_vMOVsd_RR(greg, dst));
3684 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3685 add_to_rsp(env, 16);
3686 return dst;
3689 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3690 goto do_SseAssistedBinary;
3691 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3692 goto do_SseAssistedBinary;
3693 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3694 goto do_SseAssistedBinary;
3695 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3696 goto do_SseAssistedBinary;
3697 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3698 goto do_SseAssistedBinary;
3699 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3700 goto do_SseAssistedBinary;
3701 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3702 goto do_SseAssistedBinary;
3703 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3704 goto do_SseAssistedBinary;
3705 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3706 goto do_SseAssistedBinary;
3707 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3708 goto do_SseAssistedBinary;
3709 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3710 goto do_SseAssistedBinary;
3711 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3712 goto do_SseAssistedBinary;
3713 case Iop_QNarrowBin32Sto16Ux8:
3714 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3715 goto do_SseAssistedBinary;
3716 case Iop_NarrowBin16to8x16:
3717 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3718 goto do_SseAssistedBinary;
3719 case Iop_NarrowBin32to16x8:
3720 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3721 goto do_SseAssistedBinary;
3722 do_SseAssistedBinary: {
3723 /* RRRufff! RRRufff code is what we're generating here. Oh
3724 well. */
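         /* Scheme: carve out a 16-aligned scratch area on the stack holding
            (result, argL, argR), pass pointers to the three slots to the
            helper in %rdi/%rsi/%rdx, and read the result back from the
            first slot after the call. */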
3725 vassert(fn != 0);
3726 HReg dst = newVRegV(env);
3727 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3728 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3729 HReg argp = newVRegI(env);
3730         /* subq $112, %rsp -- make a space */
3731 sub_from_rsp(env, 112);
3732 /* leaq 48(%rsp), %r_argp -- point into it */
3733 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3734 argp));
3735 /* andq $-16, %r_argp -- 16-align the pointer */
3736 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3737 AMD64RMI_Imm( ~(UInt)15 ),
3738 argp));
3739 /* Prepare 3 arg regs:
3740 leaq 0(%r_argp), %rdi
3741 leaq 16(%r_argp), %rsi
3742            leaq 32(%r_argp), %rdx
3743         */
3744 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3745 hregAMD64_RDI()));
3746 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3747 hregAMD64_RSI()));
3748 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3749 hregAMD64_RDX()));
3750 /* Store the two args, at (%rsi) and (%rdx):
3751 movupd %argL, 0(%rsi)
3752            movupd %argR, 0(%rdx)
3753         */
3754 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3755 AMD64AMode_IR(0, hregAMD64_RSI())));
3756 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3757 AMD64AMode_IR(0, hregAMD64_RDX())));
3758 /* call the helper */
3759 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3760 3, mk_RetLoc_simple(RLPri_None) ));
3761 /* fetch the result from memory, using %r_argp, which the
3762 register allocator will keep alive across the call. */
3763 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3764 AMD64AMode_IR(0, argp)));
3765 /* and finally, clear the space */
3766 add_to_rsp(env, 112);
3767 return dst;
3770 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3771 goto do_SseAssistedVectorAndScalar;
3772 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3773 goto do_SseAssistedVectorAndScalar;
3774 do_SseAssistedVectorAndScalar: {
3775 /* RRRufff! RRRufff code is what we're generating here. Oh
3776 well. */
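         /* Same scheme as do_SseAssistedBinary above, except the second
            operand is a scalar passed by value in %rdx rather than through
            memory. */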
3777 vassert(fn != 0);
3778 HReg dst = newVRegV(env);
3779 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3780 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3781 HReg argp = newVRegI(env);
3782         /* subq $112, %rsp -- make a space */
3783 sub_from_rsp(env, 112);
3784 /* leaq 48(%rsp), %r_argp -- point into it */
3785 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3786 argp));
3787 /* andq $-16, %r_argp -- 16-align the pointer */
3788 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3789 AMD64RMI_Imm( ~(UInt)15 ),
3790 argp));
3791 /* Prepare 2 vector arg regs:
3792 leaq 0(%r_argp), %rdi
3793            leaq 16(%r_argp), %rsi
3794         */
3795 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3796 hregAMD64_RDI()));
3797 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3798 hregAMD64_RSI()));
3799 /* Store the vector arg, at (%rsi):
3800            movupd %argL, 0(%rsi)
3801         */
3802 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3803 AMD64AMode_IR(0, hregAMD64_RSI())));
3804 /* And get the scalar value into rdx */
3805 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3807 /* call the helper */
3808 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3809 3, mk_RetLoc_simple(RLPri_None) ));
3810 /* fetch the result from memory, using %r_argp, which the
3811 register allocator will keep alive across the call. */
3812 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3813 AMD64AMode_IR(0, argp)));
3814 /* and finally, clear the space */
3815 add_to_rsp(env, 112);
3816 return dst;
3819 case Iop_I32StoF32x4:
3820 case Iop_F32toI32Sx4: {
3821 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3822 HReg dst = newVRegV(env);
3823 AMD64SseOp mop
3824 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3825 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3826 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3827 set_SSE_rounding_default(env);
3828 return dst;
3831 // Half-float vector conversion
3832 case Iop_F32toF16x8: {
3833 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3834 HReg srcHi, srcLo;
3835 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3836 HReg dstHi = newVRegV(env);
3837 HReg dstLo = newVRegV(env);
3838 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3839 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3840 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3841 set_SSE_rounding_default(env);
3842 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3843 // need to compact all that into one register. There's probably a
3844 // more elegant way to do this, but ..
3845 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3846 // dstHi is now 127:64 = useful data, 63:0 = zero
3847 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3848 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3849 // dstLo is now 127:64 = zero, 63:0 = useful data
3850 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3851 return dstLo;
3853 break;
3856 default:
3857 break;
3858 } /* switch (e->Iex.Binop.op) */
3859 } /* if (e->tag == Iex_Binop) */
3861 if (e->tag == Iex_Triop) {
3862 IRTriop *triop = e->Iex.Triop.details;
3863 switch (triop->op) {
3865 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3866 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3867 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3868 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3869 do_64Fx2_w_rm:
3871 HReg argL = iselVecExpr(env, triop->arg2);
3872 HReg argR = iselVecExpr(env, triop->arg3);
3873 HReg dst = newVRegV(env);
3874 addInstr(env, mk_vMOVsd_RR(argL, dst));
3875 /* XXXROUNDINGFIXME */
3876 /* set roundingmode here */
3877 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3878 return dst;
3881 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3882 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3883 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3884 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3885 do_32Fx4_w_rm:
3887 HReg argL = iselVecExpr(env, triop->arg2);
3888 HReg argR = iselVecExpr(env, triop->arg3);
3889 HReg dst = newVRegV(env);
3890 addInstr(env, mk_vMOVsd_RR(argL, dst));
3891 /* XXXROUNDINGFIXME */
3892 /* set roundingmode here */
3893 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3894 return dst;
3897 default:
3898 break;
3899 } /* switch (triop->op) */
3900 } /* if (e->tag == Iex_Triop) */
3902 if (e->tag == Iex_ITE) { // VFD
3903 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3904 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3905 HReg dst = newVRegV(env);
3906 addInstr(env, mk_vMOVsd_RR(r1,dst));
3907 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
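      /* cc^1 negates the condition, so dst (preloaded with the 'iftrue'
         value) is overwritten with r0 only when the condition is false. */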
3908 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3909 return dst;
3912 //vec_fail:
3913 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3914 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3915 ppIRExpr(e);
3916 vpanic("iselVecExpr_wrk");
3920 /*---------------------------------------------------------*/
3921 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3922 /*---------------------------------------------------------*/
3924 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3925 ISelEnv* env, const IRExpr* e )
3927 iselDVecExpr_wrk( rHi, rLo, env, e );
3928 # if 0
3929 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3930 # endif
3931 vassert(hregClass(*rHi) == HRcVec128);
3932 vassert(hregClass(*rLo) == HRcVec128);
3933 vassert(hregIsVirtual(*rHi));
3934 vassert(hregIsVirtual(*rLo));
3938 /* DO NOT CALL THIS DIRECTLY */
3939 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3940 ISelEnv* env, const IRExpr* e )
3942 HWord fn = 0; /* address of helper fn, if required */
3943 vassert(e);
3944 IRType ty = typeOfIRExpr(env->type_env, e);
3945 vassert(ty == Ity_V256);
3946 UInt laneBits = 0;
3948 AMD64SseOp op = Asse_INVALID;
3950 /* read 256-bit IRTemp */
3951 if (e->tag == Iex_RdTmp) {
3952 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3953 return;
3956 if (e->tag == Iex_Get) {
3957 HReg vHi = newVRegV(env);
3958 HReg vLo = newVRegV(env);
3959 HReg rbp = hregAMD64_RBP();
3960 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3961 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3962 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3963 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3964 *rHi = vHi;
3965 *rLo = vLo;
3966 return;
3969 if (e->tag == Iex_Load) {
3970 HReg vHi = newVRegV(env);
3971 HReg vLo = newVRegV(env);
3972 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3973 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3974 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3975 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3976 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3977 *rHi = vHi;
3978 *rLo = vLo;
3979 return;
3982 if (e->tag == Iex_Const) {
3983 vassert(e->Iex.Const.con->tag == Ico_V256);
3984 switch (e->Iex.Const.con->Ico.V256) {
3985 case 0x00000000: {
3986 HReg vHi = generate_zeroes_V128(env);
3987 HReg vLo = newVRegV(env);
3988 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3989 *rHi = vHi;
3990 *rLo = vLo;
3991 return;
3993 default:
3994 break; /* give up. Until such time as is necessary. */
3998 if (e->tag == Iex_Unop) {
3999 switch (e->Iex.Unop.op) {
4001 case Iop_NotV256: {
4002 HReg argHi, argLo;
4003 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4004 *rHi = do_sse_NotV128(env, argHi);
4005 *rLo = do_sse_NotV128(env, argLo);
4006 return;
4009 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
4010 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
4011 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
4012 do_32Fx8_unary:
4014 HReg argHi, argLo;
4015 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4016 HReg dstHi = newVRegV(env);
4017 HReg dstLo = newVRegV(env);
4018 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4019 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4020 *rHi = dstHi;
4021 *rLo = dstLo;
4022 return;
4025 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4026 do_64Fx4_unary:
4028 HReg argHi, argLo;
4029 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4030 HReg dstHi = newVRegV(env);
4031 HReg dstLo = newVRegV(env);
4032 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4033 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4034 *rHi = dstHi;
4035 *rLo = dstLo;
4036 return;
4039 case Iop_CmpNEZ64x4: {
4040 /* We can use SSE2 instructions for this. */
4041 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4042 (obviously). See comment on Iop_CmpNEZ64x2 for
4043 explanation of what's going on here. */
4044 HReg argHi, argLo;
4045 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4046 HReg tmpHi = generate_zeroes_V128(env);
4047 HReg tmpLo = newVRegV(env);
4048 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4049 HReg dstHi = newVRegV(env);
4050 HReg dstLo = newVRegV(env);
4051 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4052 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4053 tmpHi = do_sse_NotV128(env, tmpHi);
4054 tmpLo = do_sse_NotV128(env, tmpLo);
4055 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4056 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4057 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4058 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4059 *rHi = dstHi;
4060 *rLo = dstLo;
4061 return;
4064 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4065 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4066 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4067 do_CmpNEZ_vector:
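         /* CmpNEZ is computed as not(CmpEQ(x, 0)): compare each lane
            against zero and then invert the resulting mask. */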
4069 HReg argHi, argLo;
4070 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4071 HReg tmpHi = newVRegV(env);
4072 HReg tmpLo = newVRegV(env);
4073 HReg zero = generate_zeroes_V128(env);
4074 HReg dstHi, dstLo;
4075 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4076 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4077 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4078 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4079 dstHi = do_sse_NotV128(env, tmpHi);
4080 dstLo = do_sse_NotV128(env, tmpLo);
4081 *rHi = dstHi;
4082 *rLo = dstLo;
4083 return;
4086 case Iop_F16toF32x8: {
4087 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4088 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4089 HReg srcCopy = newVRegV(env);
4090 HReg dstHi = newVRegV(env);
4091 HReg dstLo = newVRegV(env);
4092 // Copy src, since we'll need to modify it.
4093 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
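            // Convert the low four F16 lanes into dstLo, then shift the
            // high four lanes down into the low half and convert those
            // into dstHi.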
4094 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4095 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4096 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4097 *rHi = dstHi;
4098 *rLo = dstLo;
4099 return;
4101 break;
4104 default:
4105 break;
4106 } /* switch (e->Iex.Unop.op) */
4107 } /* if (e->tag == Iex_Unop) */
4109 if (e->tag == Iex_Binop) {
4110 switch (e->Iex.Binop.op) {
4112 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4113 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4114 do_64Fx4:
4116 HReg argLhi, argLlo, argRhi, argRlo;
4117 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4118 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4119 HReg dstHi = newVRegV(env);
4120 HReg dstLo = newVRegV(env);
4121 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4122 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4123 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4124 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4125 *rHi = dstHi;
4126 *rLo = dstLo;
4127 return;
4130 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4131 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4132 do_32Fx8:
4134 HReg argLhi, argLlo, argRhi, argRlo;
4135 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4136 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4137 HReg dstHi = newVRegV(env);
4138 HReg dstLo = newVRegV(env);
4139 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4140 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4141 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4142 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4143 *rHi = dstHi;
4144 *rLo = dstLo;
4145 return;
4148 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4149 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4150 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4151 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4152 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4153 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4154 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4155 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4156 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4157 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4158 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4159 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4160 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4161 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4162 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4163 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4164 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4165 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4166 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4167 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4168 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4169 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4170 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4171 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4172 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4173 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4174 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4175 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4176 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4177 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4178 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4179 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4180 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4181 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4182 do_SseReRg:
4184 HReg argLhi, argLlo, argRhi, argRlo;
4185 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4186 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4187 HReg dstHi = newVRegV(env);
4188 HReg dstLo = newVRegV(env);
4189 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4190 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4191 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4192 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4193 *rHi = dstHi;
4194 *rLo = dstLo;
4195 return;
4198 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4199 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4200 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4201 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4202 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4203 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4204 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4205 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4206 do_SseShift: {
4207 HReg dstHi = newVRegV(env);
4208 HReg dstLo = newVRegV(env);
4209 HReg gregHi, gregLo;
4210 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4211 /* If it's a shift by an in-range immediate, generate two single
4212 instructions. */
4213 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4214 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4215 vassert(c->tag == Ico_U8);
4216 UInt shift = c->Ico.U8;
4217 if (shift < laneBits) {
4218 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4219 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4220 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4221 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4222 *rHi = dstHi;
4223 *rLo = dstLo;
4224 return;
4227 /* Otherwise we have to do it the longwinded way. */
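            /* As in the V128 do_SseShift case: the count must sit in the
               low 64 bits of an XMM register, so stage it through the
               stack into ereg and apply it to both halves. */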
4228 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4229 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4230 HReg ereg = newVRegV(env);
4231 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4232 addInstr(env, AMD64Instr_Push(rmi));
4233 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4234 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4235 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4236 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4237 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4238 add_to_rsp(env, 16);
4239 *rHi = dstHi;
4240 *rLo = dstLo;
4241 return;
4244 case Iop_V128HLtoV256: {
4245 // Curiously, there doesn't seem to be any benefit to be had here by
4246 // checking whether arg1 and arg2 are the same, in the style of how
4247 // (eg) 64HLtoV128 is handled elsewhere in this file.
4248 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4249 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4250 return;
4253 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4254 goto do_SseAssistedBinary;
4255 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4256 goto do_SseAssistedBinary;
4257 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4258 goto do_SseAssistedBinary;
4259 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4260 goto do_SseAssistedBinary;
4261 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4262 goto do_SseAssistedBinary;
4263 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4264 goto do_SseAssistedBinary;
4265 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4266 goto do_SseAssistedBinary;
4267 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4268 goto do_SseAssistedBinary;
4269 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4270 goto do_SseAssistedBinary;
4271 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4272 goto do_SseAssistedBinary;
4273 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4274 goto do_SseAssistedBinary;
4275 do_SseAssistedBinary: {
4276 /* RRRufff! RRRufff code is what we're generating here. Oh
4277 well. */
4278 vassert(fn != 0);
4279 HReg dstHi = newVRegV(env);
4280 HReg dstLo = newVRegV(env);
4281 HReg argLhi, argLlo, argRhi, argRlo;
4282 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4283 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4284 HReg argp = newVRegI(env);
4285         /* subq $160, %rsp -- make a space */
4286 sub_from_rsp(env, 160);
4287 /* leaq 48(%rsp), %r_argp -- point into it */
4288 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4289 argp));
4290 /* andq $-16, %r_argp -- 16-align the pointer */
4291 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4292 AMD64RMI_Imm( ~(UInt)15 ),
4293 argp));
4294 /* Prepare 3 arg regs:
4295 leaq 0(%r_argp), %rdi
4296 leaq 16(%r_argp), %rsi
4297            leaq 32(%r_argp), %rdx
4298         */
4299 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4300 hregAMD64_RDI()));
4301 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4302 hregAMD64_RSI()));
4303 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4304 hregAMD64_RDX()));
4305 /* Store the two high args, at (%rsi) and (%rdx):
4306 movupd %argLhi, 0(%rsi)
4307            movupd %argRhi, 0(%rdx)
4308         */
4309 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4310 AMD64AMode_IR(0, hregAMD64_RSI())));
4311 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4312 AMD64AMode_IR(0, hregAMD64_RDX())));
4313 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4314 movupd %argLlo, 48(%rsi)
4315            movupd %argRlo, 48(%rdx)
4316         */
4317 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4318 AMD64AMode_IR(48, hregAMD64_RSI())));
4319 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4320 AMD64AMode_IR(48, hregAMD64_RDX())));
4321 /* call the helper */
4322 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4323 mk_RetLoc_simple(RLPri_None) ));
4324 /* Prepare 3 arg regs:
4325 leaq 48(%r_argp), %rdi
4326 leaq 64(%r_argp), %rsi
4327            leaq 80(%r_argp), %rdx
4328         */
4329 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4330 hregAMD64_RDI()));
4331 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4332 hregAMD64_RSI()));
4333 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4334 hregAMD64_RDX()));
4335 /* call the helper */
4336 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4337 mk_RetLoc_simple(RLPri_None) ));
4338 /* fetch the result from memory, using %r_argp, which the
4339 register allocator will keep alive across the call. */
4340 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4341 AMD64AMode_IR(0, argp)));
4342 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4343 AMD64AMode_IR(48, argp)));
4344 /* and finally, clear the space */
4345 add_to_rsp(env, 160);
4346 *rHi = dstHi;
4347 *rLo = dstLo;
4348 return;
4351 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4352 goto do_SseAssistedBinary256;
4353 do_SseAssistedBinary256: {
4354 /* RRRufff! RRRufff code is what we're generating here. Oh
4355 well. */
4356 vassert(fn != 0);
4357 HReg dstHi = newVRegV(env);
4358 HReg dstLo = newVRegV(env);
4359 HReg argLhi, argLlo, argRhi, argRlo;
4360 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4361 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4362 HReg argp = newVRegI(env);
4363         /* subq $160, %rsp -- make a space */
4364 sub_from_rsp(env, 160);
4365 /* leaq 48(%rsp), %r_argp -- point into it */
4366 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4367 argp));
4368 /* andq $-16, %r_argp -- 16-align the pointer */
4369 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4370 AMD64RMI_Imm( ~(UInt)15 ),
4371 argp));
4372 /* Prepare 3 arg regs:
4373 leaq 0(%r_argp), %rdi
4374 leaq 32(%r_argp), %rsi
4375            leaq 64(%r_argp), %rdx
4376         */
4377 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4378 hregAMD64_RDI()));
4379 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4380 hregAMD64_RSI()));
4381 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4382 hregAMD64_RDX()));
4383 /* Store the two args, at (%rsi) and (%rdx):
4384 movupd %argLlo, 0(%rsi)
4385 movupd %argLhi, 16(%rsi)
4386 movupd %argRlo, 0(%rdx)
4387            movupd %argRhi, 16(%rdx)
4388         */
4389 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4390 AMD64AMode_IR(0, hregAMD64_RSI())));
4391 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4392 AMD64AMode_IR(16, hregAMD64_RSI())));
4393 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4394 AMD64AMode_IR(0, hregAMD64_RDX())));
4395 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4396 AMD64AMode_IR(16, hregAMD64_RDX())));
4397 /* call the helper */
4398 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4399 mk_RetLoc_simple(RLPri_None) ));
4400 /* fetch the result from memory, using %r_argp, which the
4401 register allocator will keep alive across the call. */
4402 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4403 AMD64AMode_IR(0, argp)));
4404 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4405 AMD64AMode_IR(16, argp)));
4406 /* and finally, clear the space */
4407 add_to_rsp(env, 160);
4408 *rHi = dstHi;
4409 *rLo = dstLo;
4410 return;
4413 case Iop_I32StoF32x8:
4414 case Iop_F32toI32Sx8: {
4415 HReg argHi, argLo;
4416 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4417 HReg dstHi = newVRegV(env);
4418 HReg dstLo = newVRegV(env);
4419 AMD64SseOp mop
4420 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4421 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4422 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4423 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4424 set_SSE_rounding_default(env);
4425 *rHi = dstHi;
4426 *rLo = dstLo;
4427 return;
4430 default:
4431 break;
4432 } /* switch (e->Iex.Binop.op) */
4433 } /* if (e->tag == Iex_Binop) */
4435 if (e->tag == Iex_Triop) {
4436 IRTriop *triop = e->Iex.Triop.details;
4437 switch (triop->op) {
4439 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4440 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4441 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4442 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4443 do_64Fx4_w_rm:
4445 HReg argLhi, argLlo, argRhi, argRlo;
4446 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4447 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4448 HReg dstHi = newVRegV(env);
4449 HReg dstLo = newVRegV(env);
4450 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4451 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4452 /* XXXROUNDINGFIXME */
4453 /* set roundingmode here */
4454 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4455 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4456 *rHi = dstHi;
4457 *rLo = dstLo;
4458 return;
4461 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4462 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4463 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4464 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4465 do_32Fx8_w_rm:
4467 HReg argLhi, argLlo, argRhi, argRlo;
4468 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4469 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4470 HReg dstHi = newVRegV(env);
4471 HReg dstLo = newVRegV(env);
4472 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4473 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4474 /* XXXROUNDINGFIXME */
4475 /* set roundingmode here */
4476 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4477 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4478 *rHi = dstHi;
4479 *rLo = dstLo;
4480 return;
4483 default:
4484 break;
4485 } /* switch (triop->op) */
4486 } /* if (e->tag == Iex_Triop) */
4489 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4490 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4491 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4492 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4493 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4494 // If the args are trivially the same (tmp or const), use the same
4495 // source register for all four, and only one movq since those are
4496 // (relatively) expensive.
4497 if (areAtomsAndEqual(arg1, arg2)
4498 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4499 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4500 HReg tmp = newVRegV(env);
4501 HReg dst = newVRegV(env);
4502 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4503 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4504 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4505 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4506 *rHi = dst;
4507 *rLo = dst;
4508 } else {
4509 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
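            /* Pack Q3:Q2 into dstHi and Q1:Q0 into dstLo, using the same
               movq / shift-left-by-64 / OR sequence as Iop_64HLtoV128. */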
4510 HReg q3 = iselIntExpr_R(env, arg1);
4511 HReg q2 = iselIntExpr_R(env, arg2);
4512 HReg q1 = iselIntExpr_R(env, arg3);
4513 HReg q0 = iselIntExpr_R(env, arg4);
4514 HReg tmp = newVRegV(env);
4515 HReg dstHi = newVRegV(env);
4516 HReg dstLo = newVRegV(env);
4517 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4518 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4519 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4520 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4521 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4522 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4523 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4524 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4525 *rHi = dstHi;
4526 *rLo = dstLo;
4528 return;
4531 if (e->tag == Iex_ITE) {
4532 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4533 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4534 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4535 HReg dstHi = newVRegV(env);
4536 HReg dstLo = newVRegV(env);
4537 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4538 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4539 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4540 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4541 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4542 *rHi = dstHi;
4543 *rLo = dstLo;
4544 return;
4547 //avx_fail:
4548 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4549 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4550 ppIRExpr(e);
4551 vpanic("iselDVecExpr_wrk");
4555 /*---------------------------------------------------------*/
4556 /*--- ISEL: Statements ---*/
4557 /*---------------------------------------------------------*/
4559 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4561 if (vex_traceflags & VEX_TRACE_VCODE) {
4562 vex_printf("\n-- ");
4563 ppIRStmt(stmt);
4564 vex_printf("\n");
4567 switch (stmt->tag) {
4569 /* --------- LOADG (guarded load) --------- */
4570 case Ist_LoadG: {
4571 IRLoadG* lg = stmt->Ist.LoadG.details;
4572 if (lg->end != Iend_LE)
4573 goto stmt_fail;
4575 UChar szB = 0; /* invalid */
4576 switch (lg->cvt) {
4577 case ILGop_Ident32: szB = 4; break;
4578 case ILGop_Ident64: szB = 8; break;
4579 case ILGop_IdentV128: szB = 16; break;
4580 default: break;
4582 if (szB == 0)
4583 goto stmt_fail;
4585 AMD64AMode* amAddr
4586 = iselIntExpr_AMode(env, lg->addr);
4587 HReg rAlt
4588 = szB == 16 ? iselVecExpr(env, lg->alt)
4589 : iselIntExpr_R(env, lg->alt);
4590 HReg rDst
4591 = lookupIRTemp(env, lg->dst);
4593 /* Get the alt value into the dst. We'll do a conditional load
4594 which overwrites it -- or not -- with loaded data. */
4595 if (szB == 16) {
4596 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4597 } else {
4598 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4600 AMD64CondCode cc = iselCondCode(env, lg->guard);
4601 if (szB == 16) {
4602 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4603 } else {
4604 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4606 return;
4609 /* --------- STOREG (guarded store) --------- */
4610 case Ist_StoreG: {
4611 IRStoreG* sg = stmt->Ist.StoreG.details;
4612 if (sg->end != Iend_LE)
4613 goto stmt_fail;
4615 UChar szB = 0; /* invalid */
4616 switch (typeOfIRExpr(env->type_env, sg->data)) {
4617 case Ity_I32: szB = 4; break;
4618 case Ity_I64: szB = 8; break;
4619 case Ity_V128: szB = 16; break;
4620 default: break;
4622 if (szB == 0)
4623 goto stmt_fail;
4625 AMD64AMode* amAddr
4626 = iselIntExpr_AMode(env, sg->addr);
4627 HReg rSrc
4628 = szB == 16 ? iselVecExpr(env, sg->data)
4629 : iselIntExpr_R(env, sg->data);
4630 AMD64CondCode cc
4631 = iselCondCode(env, sg->guard);
4632 if (szB == 16) {
4633 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4634 } else {
4635 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4637 return;
4640 /* --------- STORE --------- */
4641 case Ist_Store: {
4642 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4643 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4644 IREndness end = stmt->Ist.Store.end;
4646 if (tya != Ity_I64 || end != Iend_LE)
4647 goto stmt_fail;
4649 if (tyd == Ity_I64) {
4650 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4651 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4652 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4653 return;
4655 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4656 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4657 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4658 addInstr(env, AMD64Instr_Store(
4659 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4660 r,am));
4661 return;
4663 if (tyd == Ity_F64) {
4664 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4665 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4666 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4667 return;
4669 if (tyd == Ity_F32) {
4670 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4671 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4672 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4673 return;
4675 if (tyd == Ity_V128) {
4676 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4677 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4678 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4679 return;
4681 if (tyd == Ity_V256) {
4682 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4683 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4684 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4685 HReg vHi, vLo;
4686 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4687 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4688 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4689 return;
4691 break;
4694 /* --------- PUT --------- */
4695 case Ist_Put: {
4696 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4697 if (ty == Ity_I64) {
4698 /* We're going to write to memory, so compute the RHS into an
4699 AMD64RI. */
4700 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4701 addInstr(env,
4702 AMD64Instr_Alu64M(
4703 Aalu_MOV,
4705 AMD64AMode_IR(stmt->Ist.Put.offset,
4706 hregAMD64_RBP())
4708 return;
4710 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4711 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4712 addInstr(env, AMD64Instr_Store(
4713 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4715 AMD64AMode_IR(stmt->Ist.Put.offset,
4716 hregAMD64_RBP())));
4717 return;
4719 if (ty == Ity_F32) {
4720 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4721 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4722 set_SSE_rounding_default(env); /* paranoia */
4723 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4724 return;
4726 if (ty == Ity_F64) {
4727 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4728 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4729 hregAMD64_RBP() );
4730 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4731 return;
4733 if (ty == Ity_V128) {
4734 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4735 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4736 hregAMD64_RBP());
4737 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4738 return;
4740 if (ty == Ity_V256) {
4741 HReg vHi, vLo;
4742 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4743 HReg rbp = hregAMD64_RBP();
4744 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4745 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4746 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4747 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4748 return;
4750 break;
4753 /* --------- Indexed PUT --------- */
4754 case Ist_PutI: {
4755 IRPutI *puti = stmt->Ist.PutI.details;
4757 AMD64AMode* am
4758 = genGuestArrayOffset(
4759 env, puti->descr,
4760 puti->ix, puti->bias );
4762 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4763 if (ty == Ity_F64) {
4764 HReg val = iselDblExpr(env, puti->data);
4765 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4766 return;
4768 if (ty == Ity_I8) {
4769 HReg r = iselIntExpr_R(env, puti->data);
4770 addInstr(env, AMD64Instr_Store( 1, r, am ));
4771 return;
4773 if (ty == Ity_I64) {
4774 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4775 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4776 return;
4778 break;
4781 /* --------- TMP --------- */
4782 case Ist_WrTmp: {
4783 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4784 IRType ty = typeOfIRTemp(env->type_env, tmp);
4786      /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4787         compute it into an AMode and then use LEA.  This usually
4788         produces fewer instructions, often because (for memcheck-
4789         created IR) we get t = address-expression, with t later used
4790         twice, and doing this naturally turns the address-expression
4791         back into an AMD64 amode. */
4792 if (ty == Ity_I64
4793 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4794 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4795 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4796 HReg dst = lookupIRTemp(env, tmp);
4797 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4798 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4799 value into a register. Just emit a normal reg-reg move
4800 so reg-alloc can coalesce it away in the usual way. */
4801 HReg src = am->Aam.IR.reg;
4802 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4803 } else {
4804 addInstr(env, AMD64Instr_Lea64(am,dst));
4806 return;
4809 if (ty == Ity_I64 || ty == Ity_I32
4810 || ty == Ity_I16 || ty == Ity_I8) {
4811 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4812 HReg dst = lookupIRTemp(env, tmp);
4813 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4814 return;
4816 if (ty == Ity_I128) {
4817 HReg rHi, rLo, dstHi, dstLo;
4818 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4819 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4820 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4821 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4822 return;
4824 if (ty == Ity_I1) {
4825 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4826 HReg dst = lookupIRTemp(env, tmp);
4827 addInstr(env, AMD64Instr_Set64(cond, dst));
4828 return;
4830 if (ty == Ity_F64) {
4831 HReg dst = lookupIRTemp(env, tmp);
4832 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4833 addInstr(env, mk_vMOVsd_RR(src, dst));
4834 return;
4836 if (ty == Ity_F32) {
4837 HReg dst = lookupIRTemp(env, tmp);
4838 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4839 addInstr(env, mk_vMOVsd_RR(src, dst));
4840 return;
4842 if (ty == Ity_V128) {
4843 HReg dst = lookupIRTemp(env, tmp);
4844 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4845 addInstr(env, mk_vMOVsd_RR(src, dst));
4846 return;
4848 if (ty == Ity_V256) {
4849 HReg rHi, rLo, dstHi, dstLo;
4850 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4851 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4852 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4853 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4854 return;
4856 break;
4859 /* --------- Call to DIRTY helper --------- */
4860 case Ist_Dirty: {
4861 IRDirty* d = stmt->Ist.Dirty.details;
4863 /* Figure out the return type, if any. */
4864 IRType retty = Ity_INVALID;
4865 if (d->tmp != IRTemp_INVALID)
4866 retty = typeOfIRTemp(env->type_env, d->tmp);
4868 /* Throw out any return types we don't know about. */
4869 Bool retty_ok = False;
4870 switch (retty) {
4871 case Ity_INVALID: /* function doesn't return anything */
4872 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4873 case Ity_V128: case Ity_V256:
4874 retty_ok = True; break;
4875 default:
4876 break;
4878 if (!retty_ok)
4879 break; /* will go to stmt_fail: */
4881 /* Marshal args, do the call, and set the return value to
4882 0x555..555 if this is a conditional call that returns a value
4883 and the call is skipped. */
4884 UInt addToSp = 0;
4885 RetLoc rloc = mk_RetLoc_INVALID();
4886 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4887 vassert(is_sane_RetLoc(rloc));
4889 /* Now figure out what to do with the returned value, if any. */
4890 switch (retty) {
4891 case Ity_INVALID: {
4892 /* No return value. Nothing to do. */
4893 vassert(d->tmp == IRTemp_INVALID);
4894 vassert(rloc.pri == RLPri_None);
4895 vassert(addToSp == 0);
4896 return;
4898 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4899 /* The returned value is in %rax. Park it in the register
4900 associated with tmp. */
4901 vassert(rloc.pri == RLPri_Int);
4902 vassert(addToSp == 0);
4903 HReg dst = lookupIRTemp(env, d->tmp);
4904 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4905 return;
4907 case Ity_V128: {
4908 /* The returned value is on the stack, and rloc.spOff
4909 tells us where. Fish it off the stack and then move
4910 the stack pointer upwards to clear it, as directed by
4911 doHelperCall. */
4912 vassert(rloc.pri == RLPri_V128SpRel);
4913 vassert(addToSp >= 16);
4914 HReg dst = lookupIRTemp(env, d->tmp);
4915 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4916 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4917 add_to_rsp(env, addToSp);
4918 return;
4920 case Ity_V256: {
4921 /* See comments for Ity_V128. */
4922 vassert(rloc.pri == RLPri_V256SpRel);
4923 vassert(addToSp >= 32);
4924 HReg dstLo, dstHi;
4925 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4926 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4927 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4928 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4929 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4930 add_to_rsp(env, addToSp);
4931 return;
4933 default:
4934 /*NOTREACHED*/
4935 vassert(0);
4937 break;
4940 /* --------- MEM FENCE --------- */
4941 case Ist_MBE:
4942 switch (stmt->Ist.MBE.event) {
4943 case Imbe_Fence:
4944 addInstr(env, AMD64Instr_MFence());
4945 return;
4946 default:
4947 break;
4949 break;
4951 /* --------- ACAS --------- */
4952 case Ist_CAS:
4953 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4954 /* "normal" singleton CAS */
4955 UChar sz;
4956 IRCAS* cas = stmt->Ist.CAS.details;
4957 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4958 /* get: cas->expd into %rax, and cas->data into %rbx */
4959 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4960 HReg rData = iselIntExpr_R(env, cas->dataLo);
4961 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4962 HReg rOld = lookupIRTemp(env, cas->oldLo);
4963 vassert(cas->expdHi == NULL);
4964 vassert(cas->dataHi == NULL);
4965 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4966 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4967 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4968 switch (ty) {
4969 case Ity_I64: sz = 8; break;
4970 case Ity_I32: sz = 4; break;
4971 case Ity_I16: sz = 2; break;
4972 case Ity_I8: sz = 1; break;
4973 default: goto unhandled_cas;
4975 addInstr(env, AMD64Instr_ACAS(am, sz));
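            /* cmpxchg sets Z on success and otherwise leaves the actual old
               memory value in %rax.  rOld was preloaded with the expected
               value, so on failure (NZ) overwrite it with %rax. */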
4976 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4977 return;
4978 } else {
4979 /* double CAS */
4980 UChar sz;
4981 IRCAS* cas = stmt->Ist.CAS.details;
4982 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4983 /* only 32-bit and 64-bit allowed in this case */
4984 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4985 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4986 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4987 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4988 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4989 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4990 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4991 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4992 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4993 switch (ty) {
4994 case Ity_I64:
4995 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4996 goto unhandled_cas; /* we'd have to generate
4997 cmpxchg16b, but the host
4998 doesn't support that */
4999 sz = 8;
5000 break;
5001 case Ity_I32:
5002 sz = 4;
5003 break;
5004 default:
5005 goto unhandled_cas;
5007 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
5008 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
5009 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
5010 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
5011 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
5012 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
5013 addInstr(env, AMD64Instr_DACAS(am, sz));
5014 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5015 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5016 return;
5018 unhandled_cas:
5019 break;
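   /* Editorial sketch of the sequences the two CAS cases above aim to
      produce.  The vreg names are illustrative; %rax/%rbx/%rcx/%rdx
      are fixed by the cmpxchg family of instructions:

         singleton CAS, 64-bit case:
            movq   rExpd, rOld
            movq   rExpd, %rax
            movq   rData, %rbx
            lock; cmpxchgq %rbx, (am)       -- AMD64Instr_ACAS(am, 8)
            cmovnz %rax, rOld               -- failure: keep observed value

         double CAS, 2 x 64-bit case (needs VEX_HWCAPS_AMD64_CX16):
            movq   rExpdHi, rOldHi ; movq rExpdLo, rOldLo
            movq   rExpdHi, %rdx   ; movq rExpdLo, %rax
            movq   rDataHi, %rcx   ; movq rDataLo, %rbx
            lock; cmpxchg16b (am)           -- AMD64Instr_DACAS(am, 8)
            cmovnz %rdx, rOldHi    ; cmovnz %rax, rOldLo

      On success rOld(Lo/Hi) already holds the expected value, which
      equals the value that was in memory; the conditional moves only
      overwrite it when the compare-exchange fails. */
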
   /* --------- INSTR MARK --------- */
   /* Doesn't generate any executable code ... */
   case Ist_IMark:
       return;

   /* --------- ABI HINT --------- */
   /* These have no meaning (denotation in the IR) and so we ignore
      them ... if any actually made it this far. */
   case Ist_AbiHint:
       return;

   /* --------- NO-OP --------- */
   case Ist_NoOp:
       return;

   /* --------- EXIT --------- */
   case Ist_Exit: {
      if (stmt->Ist.Exit.dst->tag != Ico_U64)
         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");

      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
                                          hregAMD64_RBP());

      /* Case: boring transfer to known address */
      if (stmt->Ist.Exit.jk == Ijk_Boring) {
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
                                             amRIP, cc, toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
         }
         return;
      }

      /* Case: assisted transfer to arbitrary address */
      switch (stmt->Ist.Exit.jk) {
         /* Keep this list in sync with that in iselNext below */
         case Ijk_ClientReq:
         case Ijk_EmWarn:
         case Ijk_NoDecode:
         case Ijk_NoRedir:
         case Ijk_SigSEGV:
         case Ijk_SigTRAP:
         case Ijk_Sys_syscall:
         case Ijk_Sys_int210:
         case Ijk_InvalICache:
         case Ijk_Yield:
         {
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
            return;
         }
         default:
            break;
      }

      /* Do we ever expect to see any other kind? */
      goto stmt_fail;
   }

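   /* Editorial sketch of the two shapes the Ist_Exit case above can
      take (both guarded by cc; the authoritative encodings live in
      host_amd64_defs.c):

         XDirect(dstGA, amRIP, cc, toFastEP)
            -- the guest RIP (dstGA) is written to offsIP(%rbp) and a
               patchable jump slot is emitted, which LibVEX_Chain can
               later rewrite into a direct jump to the target
               translation.  toFastEP requests the target's fast
               entry point, skipping its event check; it is set only
               for forward edges (dst > env->max_ga).

         XAssisted(r, amRIP, cc, jk)
            -- the guest RIP in r is written to offsIP(%rbp) and
               control returns to the dispatcher along with the jump
               kind, so the run-time system can service syscalls,
               client requests, translation discards and so on. */
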
   default: break;
   }

  stmt_fail:
   ppIRStmt(stmt);
   vpanic("iselStmt(amd64)");
}

/*---------------------------------------------------------*/
/*--- ISEL: Basic block terminators (Nexts)             ---*/
/*---------------------------------------------------------*/

static void iselNext ( ISelEnv* env,
                       IRExpr* next, IRJumpKind jk, Int offsIP )
{
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf( "\n-- PUT(%d) = ", offsIP);
      ppIRExpr( next );
      vex_printf( "; exit-");
      ppIRJumpKind(jk);
      vex_printf( "\n");
   }

   /* Case: boring transfer to known address */
   if (next->tag == Iex_Const) {
      IRConst* cdst = next->Iex.Const.con;
      vassert(cdst->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         /* Boring transfer to known address */
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)cdst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "X" : ".");
            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
                                             amRIP, Acc_ALWAYS,
                                             toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
   }

   /* Case: call/return (==boring) transfer to any address */
   switch (jk) {
      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
         } else {
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
      default:
         break;
   }

   /* Case: assisted transfer to arbitrary address */
   switch (jk) {
      /* Keep this list in sync with that for Ist_Exit above */
      case Ijk_ClientReq:
      case Ijk_EmWarn:
      case Ijk_NoDecode:
      case Ijk_NoRedir:
      case Ijk_SigSEGV:
      case Ijk_SigTRAP:
      case Ijk_Sys_syscall:
      case Ijk_Sys_int210:
      case Ijk_InvalICache:
      case Ijk_Yield: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
         return;
      }
      default:
         break;
   }

   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0); // are we expecting any other kind?
}

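/* Editorial note on iselNext: the same XDirect/XAssisted choices as in
   Ist_Exit apply, plus one further flavour for computed targets:

      XIndir(r, amRIP, Acc_ALWAYS)
         -- used for Ijk_Boring/Ijk_Call/Ijk_Ret when chaining is
            allowed: the guest RIP in r is written to offsIP(%rbp) and
            control jumps to the dispatcher's fast indirect-transfer
            path, which looks the target up in the translation cache.

   This is a sketch; see the XDirect/XIndir/XAssisted handling in
   host_amd64_defs.c for the real emitted code. */
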
/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to amd64 code. */

HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                            VexArch      arch_host,
                            const VexArchInfo* archinfo_host,
                            const VexAbiInfo*  vbi/*UNUSED*/,
                            Int offs_Host_EvC_Counter,
                            Int offs_Host_EvC_FailAddr,
                            Bool chainingAllowed,
                            Bool addProfInc,
                            Addr max_ga )
{
   Int        i, j;
   HReg       hreg, hregHI;
   ISelEnv*   env;
   UInt       hwcaps_host = archinfo_host->hwcaps;
   AMD64AMode *amCounter, *amFailAddr;

   /* sanity ... */
   vassert(arch_host == VexArchAMD64);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_AMD64_SSE3
                     | VEX_HWCAPS_AMD64_SSSE3
                     | VEX_HWCAPS_AMD64_CX16
                     | VEX_HWCAPS_AMD64_LZCNT
                     | VEX_HWCAPS_AMD64_AVX
                     | VEX_HWCAPS_AMD64_RDTSCP
                     | VEX_HWCAPS_AMD64_BMI
                     | VEX_HWCAPS_AMD64_AVX2
                     | VEX_HWCAPS_AMD64_F16C
                     | VEX_HWCAPS_AMD64_RDRAND)));

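   /* Editorial note: the mask above is the whitelist of amd64 hwcaps
      this selector knows how to exploit; anything outside it is a
      usage error.  Individual bits are consulted where they matter,
      e.g. VEX_HWCAPS_AMD64_CX16 gates the cmpxchg16b-based double CAS
      in iselStmt above. */
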
   /* Check that the host's endianness is as expected. */
   vassert(archinfo_host->endness == VexEndnessLE);

   /* Make up an initial environment to use. */
   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
   env->vreg_ctr = 0;

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));

   /* and finally ... */
   env->chainingAllowed = chainingAllowed;
   env->hwcaps          = hwcaps_host;
   env->max_ga          = max_ga;

   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_I128:
            hreg   = mkHReg(True, HRcInt64, 0, j++);
            hregHI = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_F32:
         case Ity_F64:
         case Ity_V128:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            break;
         case Ity_V256:
            hreg   = mkHReg(True, HRcVec128, 0, j++);
            hregHI = mkHReg(True, HRcVec128, 0, j++);
            break;
         default:
            ppIRType(bb->tyenv->types[i]);
            vpanic("iselBB(amd64): IRTemp type");
      }
      env->vregmap[i]   = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;

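   /* Editorial example of the mapping just built: an Ity_I64 temp gets
      one Int64-class vreg; an Ity_I128 temp gets two (low half in
      vregmap, high half in vregmapHI); Ity_F32/Ity_F64/Ity_V128 temps
      each get one Vec128-class vreg, since this backend keeps scalar
      FP values in XMM registers; and an Ity_V256 temp gets a pair of
      Vec128-class vregs. */
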
   /* The very first instruction must be an event check. */
   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));

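   /* Roughly speaking (a sketch; see the EvCheck handling in
      host_amd64_defs.c for the real encoding), the event check
      decrements the counter held at offs_Host_EvC_Counter(%rbp) and,
      if it goes negative, jumps to the failure address stored at
      offs_Host_EvC_FailAddr(%rbp), handing control back to the
      scheduler.  Translations entered through their fast entry point
      skip this check. */
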
   /* Possibly a block counter increment (for profiling).  At this
      point we don't know the address of the counter, so just pretend
      it is zero.  It will have to be patched later, but before this
      translation is used, by a call to LibVEX_PatchProfInc. */
   if (addProfInc) {
      addInstr(env, AMD64Instr_ProfInc());
   }

   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      if (bb->stmts[i])
         iselStmt(env, bb->stmts[i]);

   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}

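/* Editorial note: the HInstrArray returned here still refers to
   virtual registers (n_vregs of them).  It is the caller's job, in the
   generic VEX translation pipeline, to run register allocation over it
   and then emit the final amd64 machine code; that is why this
   function only records env->vreg_ctr and does no allocation itself. */
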
/*---------------------------------------------------------------*/
/*--- end                                   host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/