2 /*---------------------------------------------------------------*/
3 /*--- begin host_x86_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_ir.h"
36 #include "libvex.h"
38 #include "ir_match.h"
39 #include "main_util.h"
40 #include "main_globals.h"
41 #include "host_generic_regs.h"
42 #include "host_generic_simd64.h"
43 #include "host_generic_simd128.h"
44 #include "host_x86_defs.h"
46 /* TODO 21 Apr 2005:
48 -- (Really an assembler issue) don't emit CMov32 as a cmov
49 insn, since that's expensive on P4 and conditional branch
50 is cheaper if (as we expect) the condition is highly predictable
52 -- preserve xmm registers across function calls (by declaring them
53 as trashed by call insns)
55 -- preserve x87 ST stack discipline across function calls. Sigh.
57 -- Check doHelperCall: if a call is conditional, we cannot safely
58 compute any regparm args directly to registers. Hence, the
59 fast-regparm marshalling should be restricted to unconditional
60 calls only.
63 /*---------------------------------------------------------*/
64 /*--- x87 control word stuff ---*/
65 /*---------------------------------------------------------*/
67 /* Vex-generated code expects to run with the FPU set as follows: all
68 exceptions masked, round-to-nearest, precision = 53 bits. This
69 corresponds to a FPU control word value of 0x027F.
71 Similarly the SSE control word (%mxcsr) should be 0x1F80.
73 %fpucw and %mxcsr should have these values on entry to
74 Vex-generated code, and those values should be
75 unchanged at exit.
78 #define DEFAULT_FPUCW 0x027F
80 /* debugging only, do not use */
81 /* define DEFAULT_FPUCW 0x037F */
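/* (Aside, not from the original source: a breakdown of these values per
   the standard x87/SSE control-register layouts.)
   0x027F as an x87 control word:
       bits 0..5   all set  -> all six FP exceptions masked
       bits 8..9   = 10b    -> precision control = 53-bit mantissa
       bits 10..11 = 00b    -> rounding control = round to nearest
   0x1F80 as an %mxcsr value:
       bits 7..12  all set  -> all six SSE FP exceptions masked
       bits 13..14 = 00b    -> rounding control = round to nearest
       DAZ (bit 6) and FZ (bit 15) clear -> no denormals-are-zero or
                                            flush-to-zero behaviour. */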
84 /*---------------------------------------------------------*/
85 /*--- misc helpers ---*/
86 /*---------------------------------------------------------*/
88 /* These are duplicated in guest-x86/toIR.c */
89 static IRExpr* unop ( IROp op, IRExpr* a )
91 return IRExpr_Unop(op, a);
94 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
96 return IRExpr_Binop(op, a1, a2);
99 static IRExpr* bind ( Int binder )
101 return IRExpr_Binder(binder);
104 static Bool isZeroU8 ( IRExpr* e )
106 return e->tag == Iex_Const
107 && e->Iex.Const.con->tag == Ico_U8
108 && e->Iex.Const.con->Ico.U8 == 0;
111 static Bool isZeroU32 ( IRExpr* e )
113 return e->tag == Iex_Const
114 && e->Iex.Const.con->tag == Ico_U32
115 && e->Iex.Const.con->Ico.U32 == 0;
118 //static Bool isZeroU64 ( IRExpr* e )
120 // return e->tag == Iex_Const
121 // && e->Iex.Const.con->tag == Ico_U64
122 // && e->Iex.Const.con->Ico.U64 == 0ULL;
126 /*---------------------------------------------------------*/
127 /*--- ISelEnv ---*/
128 /*---------------------------------------------------------*/
130 /* This carries around:
132 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
133 might encounter. This is computed before insn selection starts,
134 and does not change.
136 - A mapping from IRTemp to HReg. This tells the insn selector
137 which virtual register(s) are associated with each IRTemp
138 temporary. This is computed before insn selection starts, and
139 does not change. We expect this mapping to map precisely the
140 same set of IRTemps as the type mapping does.
142 - vregmap holds the primary register for the IRTemp.
143 - vregmapHI is only used for 64-bit integer-typed
144 IRTemps. It holds the identity of a second
145 32-bit virtual HReg, which holds the high half
146 of the value.
148 - The code array, that is, the insns selected so far.
150 - A counter, for generating new virtual registers.
152 - The host subarchitecture we are selecting insns for.
153 This is set at the start and does not change.
155 - A Bool for indicating whether we may generate chain-me
156 instructions for control flow transfers, or whether we must use
157 XAssisted.
159 - The maximum guest address of any guest insn in this block.
160 Actually, the address of the highest-addressed byte from any insn
161 in this block. Is set at the start and does not change. This is
162 used for detecting jumps which are definitely forward-edges from
163 this block, and therefore can be made (chained) to the fast entry
164 point of the destination, thereby avoiding the destination's
165 event check.
167 Note, this is all (well, mostly) host-independent.
170 typedef
171 struct {
172 /* Constant -- are set at the start and do not change. */
173 IRTypeEnv* type_env;
175 HReg* vregmap;
176 HReg* vregmapHI;
177 Int n_vregmap;
179 UInt hwcaps;
181 Bool chainingAllowed;
182 Addr32 max_ga;
184 /* These are modified as we go along. */
185 HInstrArray* code;
186 Int vreg_ctr;
188 ISelEnv;
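/* A concrete illustration (not part of the original source): a 64-bit
   IRTemp t is kept split across two 32-bit vregs,
       low  half of t : vregmap[t]
       high half of t : vregmapHI[t]
   which is exactly the pair lookupIRTemp64() below hands back.  For
   32-, 16- and 8-bit temps only vregmap[t] is meaningful and
   vregmapHI[t] is left invalid, hence the vassert in lookupIRTemp64. */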
191 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
193 vassert(tmp < env->n_vregmap);
194 return env->vregmap[tmp];
197 static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
199 vassert(tmp < env->n_vregmap);
200 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
201 *vrLO = env->vregmap[tmp];
202 *vrHI = env->vregmapHI[tmp];
205 static void addInstr ( ISelEnv* env, X86Instr* instr )
207 addHInstr(env->code, instr);
208 if (vex_traceflags & VEX_TRACE_VCODE) {
209 ppX86Instr(instr, False);
210 vex_printf("\n");
214 static HReg newVRegI ( ISelEnv* env )
216 HReg reg = mkHReg(True/*virtual reg*/, HRcInt32, 0/*enc*/, env->vreg_ctr);
217 env->vreg_ctr++;
218 return reg;
221 static HReg newVRegF ( ISelEnv* env )
223 HReg reg = mkHReg(True/*virtual reg*/, HRcFlt64, 0/*enc*/, env->vreg_ctr);
224 env->vreg_ctr++;
225 return reg;
228 static HReg newVRegV ( ISelEnv* env )
230 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
231 env->vreg_ctr++;
232 return reg;
236 /*---------------------------------------------------------*/
237 /*--- ISEL: Forward declarations ---*/
238 /*---------------------------------------------------------*/
240 /* These are organised as iselXXX and iselXXX_wrk pairs. The
241 iselXXX_wrk do the real work, but are not to be called directly.
242 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
243 checks that all returned registers are virtual. You should not
244 call the _wrk version directly.
246 static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
247 static X86RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
249 static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
250 static X86RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
252 static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
253 static X86RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
255 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
256 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
258 static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
259 static X86AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
261 static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
262 ISelEnv* env, const IRExpr* e );
263 static void iselInt64Expr ( HReg* rHi, HReg* rLo,
264 ISelEnv* env, const IRExpr* e );
266 static X86CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
267 static X86CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
269 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
270 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
272 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
273 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
275 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
276 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
279 /*---------------------------------------------------------*/
280 /*--- ISEL: Misc helpers ---*/
281 /*---------------------------------------------------------*/
283 /* Make an int reg-reg move. */
285 static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
287 vassert(hregClass(src) == HRcInt32);
288 vassert(hregClass(dst) == HRcInt32);
289 return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
293 /* Make a vector reg-reg move. */
295 static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
297 vassert(hregClass(src) == HRcVec128);
298 vassert(hregClass(dst) == HRcVec128);
299 return X86Instr_SseReRg(Xsse_MOV, src, dst);
302 /* Advance/retreat %esp by n. */
304 static void add_to_esp ( ISelEnv* env, Int n )
306 vassert(n > 0 && n < 256 && (n%4) == 0);
307 addInstr(env,
308 X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
311 static void sub_from_esp ( ISelEnv* env, Int n )
313 vassert(n > 0 && n < 256 && (n%4) == 0);
314 addInstr(env,
315 X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
319 /* Given an amode, return one which references 4 bytes further
320 along. */
322 static X86AMode* advance4 ( X86AMode* am )
324 X86AMode* am4 = dopyX86AMode(am);
325 switch (am4->tag) {
326 case Xam_IRRS:
327 am4->Xam.IRRS.imm += 4; break;
328 case Xam_IR:
329 am4->Xam.IR.imm += 4; break;
330 default:
331 vpanic("advance4(x86,host)");
333 return am4;
337 /* Push an arg onto the host stack, in preparation for a call to a
338 helper function of some kind. Returns the number of 32-bit words
339 pushed. If we encounter an IRExpr_VECRET() then we expect that
340 r_vecRetAddr will be a valid register, that holds the relevant
341 address.
343 static Int pushArg ( ISelEnv* env, IRExpr* arg, HReg r_vecRetAddr )
345 if (UNLIKELY(arg->tag == Iex_VECRET)) {
346 vassert(0); //ATC
347 vassert(!hregIsInvalid(r_vecRetAddr));
348 addInstr(env, X86Instr_Push(X86RMI_Reg(r_vecRetAddr)));
349 return 1;
351 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
352 addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
353 return 1;
355 /* Else it's a "normal" expression. */
356 IRType arg_ty = typeOfIRExpr(env->type_env, arg);
357 if (arg_ty == Ity_I32) {
358 addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
359 return 1;
360 } else
361 if (arg_ty == Ity_I64) {
362 HReg rHi, rLo;
363 iselInt64Expr(&rHi, &rLo, env, arg);
364 addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
365 addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
366 return 2;
368 ppIRExpr(arg);
369 vpanic("pushArg(x86): can't handle arg of this type");
373 /* Complete the call to a helper function, by calling the
374 helper and clearing the args off the stack. */
376 static
377 void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
378 IRCallee* cee, Int n_arg_ws,
379 RetLoc rloc )
381 /* Complication. Need to decide which reg to use as the fn address
382 pointer, in a way that doesn't trash regparm-passed
383 parameters. */
384 vassert(sizeof(void*) == 4);
386 addInstr(env, X86Instr_Call( cc, (Addr)cee->addr,
387 cee->regparms, rloc));
388 if (n_arg_ws > 0)
389 add_to_esp(env, 4*n_arg_ws);
393 /* Used only in doHelperCall. See big comment in doHelperCall re
394 handling of regparm args. This function figures out whether
395 evaluation of an expression might require use of a fixed register.
396 If in doubt return True (safe but suboptimal).
398 static
399 Bool mightRequireFixedRegs ( IRExpr* e )
401 if (UNLIKELY(is_IRExpr_VECRET_or_GSPTR(e))) {
402 // These are always "safe" -- either a copy of %esp in some
403 // arbitrary vreg, or a copy of %ebp, respectively.
404 return False;
406 /* Else it's a "normal" expression. */
407 switch (e->tag) {
408 case Iex_RdTmp: case Iex_Const: case Iex_Get:
409 return False;
410 default:
411 return True;
416 /* Do a complete function call. |guard| is a Ity_Bit expression
417 indicating whether or not the call happens. If guard==NULL, the
418 call is unconditional. |retloc| is set to indicate where the
419 return value is after the call. The caller (of this fn) must
420 generate code to add |stackAdjustAfterCall| to the stack pointer
421 after the call is done. */
423 static
424 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
425 /*OUT*/RetLoc* retloc,
426 ISelEnv* env,
427 IRExpr* guard,
428 IRCallee* cee, IRType retTy, IRExpr** args )
430 X86CondCode cc;
431 HReg argregs[3];
432 HReg tmpregs[3];
433 Bool danger;
434 Int not_done_yet, n_args, n_arg_ws, stack_limit,
435 i, argreg, argregX;
437 /* Set default returns. We'll update them later if needed. */
438 *stackAdjustAfterCall = 0;
439 *retloc = mk_RetLoc_INVALID();
441 /* These are used for cross-checking that IR-level constraints on
442 the use of Iex_VECRET and Iex_GSPTR are observed. */
443 UInt nVECRETs = 0;
444 UInt nGSPTRs = 0;
446 /* Marshal args for a call, do the call, and clear the stack.
447 Complexities to consider:
449 * The return type can be I{64,32,16,8} or V128. In the V128
450 case, it is expected that |args| will contain the special
451 node IRExpr_VECRET(), in which case this routine generates
452 code to allocate space on the stack for the vector return
453 value. Since we are not passing any scalars on the stack, it
454 is enough to preallocate the return space before marshalling
455 any arguments, in this case.
457 |args| may also contain IRExpr_GSPTR(), in which case the
458 value in %ebp is passed as the corresponding argument.
460 * If the callee claims regparmness of 1, 2 or 3, we must pass the
461 first 1, 2 or 3 args in registers (EAX, EDX, and ECX
462 respectively). To keep things relatively simple, only args of
463 type I32 may be passed as regparms -- just bomb out if anything
464 else turns up. Clearly this depends on the front ends not
465 trying to pass any other types as regparms.
468 /* 16 Nov 2004: the regparm handling is complicated by the
469 following problem.
471 Consider a call to a function with two regparm parameters:
472 f(e1,e2). We need to compute e1 into %eax and e2 into %edx.
473 Suppose code is first generated to compute e1 into %eax. Then,
474 code is generated to compute e2 into %edx. Unfortunately, if
475 the latter code sequence uses %eax, it will trash the value of
476 e1 computed by the former sequence. This could happen if (for
477 example) e2 itself involved a function call. In the code below,
478 args are evaluated right-to-left, not left-to-right, but the
479 principle and the problem are the same.
481 One solution is to compute all regparm-bound args into vregs
482 first, and once they are all done, move them to the relevant
483 real regs. This always gives correct code, but it also gives
484 a bunch of vreg-to-rreg moves which are usually redundant but
485 are hard for the register allocator to get rid of.
487 A compromise is to first examine all regparm'd argument
488 expressions. If they are all so simple that it is clear
489 they will be evaluated without use of any fixed registers,
490 use the old compute-directly-to-fixed-target scheme. If not,
491 be safe and use the via-vregs scheme.
493 Note this requires being able to examine an expression and
494 determine whether or not evaluation of it might use a fixed
495 register. That requires knowledge of how the rest of this
496 insn selector works. Currently just the following 3 are
497 regarded as safe -- hopefully they cover the majority of
498 arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
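/* To make the danger concrete (a hypothetical example, not taken from
   the original comment): for a regparm-2 call f(t1, h(t2)) where
   evaluating h(t2) itself involves a helper call, that inner call
   returns its result in %eax -- so if %eax had already been loaded
   with t1 for the regparm convention, t1 would be destroyed.
   mightRequireFixedRegs() above is the conservative filter: anything
   other than RdTmp/Const/Get is assumed capable of touching a fixed
   register and forces the via-vregs scheme. */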
500 vassert(cee->regparms >= 0 && cee->regparms <= 3);
502 /* Count the number of args and also the VECRETs */
503 n_args = n_arg_ws = 0;
504 while (args[n_args]) {
505 IRExpr* arg = args[n_args];
506 n_args++;
507 if (UNLIKELY(arg->tag == Iex_VECRET)) {
508 nVECRETs++;
509 } else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
510 nGSPTRs++;
514 /* If this fails, the IR is ill-formed */
515 vassert(nGSPTRs == 0 || nGSPTRs == 1);
517 /* If we have a VECRET, allocate space on the stack for the return
518 value, and record the stack pointer after that. */
519 HReg r_vecRetAddr = INVALID_HREG;
520 if (nVECRETs == 1) {
521 vassert(retTy == Ity_V128 || retTy == Ity_V256);
522 vassert(retTy != Ity_V256); // we don't handle that yet (if ever)
523 r_vecRetAddr = newVRegI(env);
524 sub_from_esp(env, 16);
525 addInstr(env, mk_iMOVsd_RR( hregX86_ESP(), r_vecRetAddr ));
526 } else {
527 // If either of these fail, the IR is ill-formed
528 vassert(retTy != Ity_V128 && retTy != Ity_V256);
529 vassert(nVECRETs == 0);
532 not_done_yet = n_args;
534 stack_limit = cee->regparms;
536 /* ------ BEGIN marshall all arguments ------ */
538 /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
539 for (i = n_args-1; i >= stack_limit; i--) {
540 n_arg_ws += pushArg(env, args[i], r_vecRetAddr);
541 not_done_yet--;
544 /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
545 registers. */
547 if (cee->regparms > 0) {
549 /* ------ BEGIN deal with regparms ------ */
551 /* deal with regparms, not forgetting %ebp if needed. */
552 argregs[0] = hregX86_EAX();
553 argregs[1] = hregX86_EDX();
554 argregs[2] = hregX86_ECX();
555 tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;
557 argreg = cee->regparms;
559 /* In keeping with big comment above, detect potential danger
560 and use the via-vregs scheme if needed. */
561 danger = False;
562 for (i = stack_limit-1; i >= 0; i--) {
563 if (mightRequireFixedRegs(args[i])) {
564 danger = True;
565 break;
569 if (danger) {
571 /* Move via temporaries */
572 argregX = argreg;
573 for (i = stack_limit-1; i >= 0; i--) {
575 if (0) {
576 vex_printf("x86 host: register param is complex: ");
577 ppIRExpr(args[i]);
578 vex_printf("\n");
581 IRExpr* arg = args[i];
582 argreg--;
583 vassert(argreg >= 0);
584 if (UNLIKELY(arg->tag == Iex_VECRET)) {
585 vassert(0); //ATC
587 else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
588 vassert(0); //ATC
589 } else {
590 vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
591 tmpregs[argreg] = iselIntExpr_R(env, arg);
593 not_done_yet--;
595 for (i = stack_limit-1; i >= 0; i--) {
596 argregX--;
597 vassert(argregX >= 0);
598 addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
601 } else {
602 /* It's safe to compute all regparm args directly into their
603 target registers. */
604 for (i = stack_limit-1; i >= 0; i--) {
605 IRExpr* arg = args[i];
606 argreg--;
607 vassert(argreg >= 0);
608 if (UNLIKELY(arg->tag == Iex_VECRET)) {
609 vassert(!hregIsInvalid(r_vecRetAddr));
610 addInstr(env, X86Instr_Alu32R(Xalu_MOV,
611 X86RMI_Reg(r_vecRetAddr),
612 argregs[argreg]));
614 else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
615 vassert(0); //ATC
616 } else {
617 vassert(typeOfIRExpr(env->type_env, arg) == Ity_I32);
618 addInstr(env, X86Instr_Alu32R(Xalu_MOV,
619 iselIntExpr_RMI(env, arg),
620 argregs[argreg]));
622 not_done_yet--;
627 /* ------ END deal with regparms ------ */
631 vassert(not_done_yet == 0);
633 /* ------ END marshall all arguments ------ */
635 /* Now we can compute the condition. We can't do it earlier
636 because the argument computations could trash the condition
637 codes. Be a bit clever to handle the common case where the
638 guard is 1:Bit. */
639 cc = Xcc_ALWAYS;
640 if (guard) {
641 if (guard->tag == Iex_Const
642 && guard->Iex.Const.con->tag == Ico_U1
643 && guard->Iex.Const.con->Ico.U1 == True) {
644 /* unconditional -- do nothing */
645 } else {
646 cc = iselCondCode( env, guard );
650 /* Do final checks, set the return values, and generate the call
651 instruction proper. */
652 vassert(*stackAdjustAfterCall == 0);
653 vassert(is_RetLoc_INVALID(*retloc));
654 switch (retTy) {
655 case Ity_INVALID:
656 /* Function doesn't return a value. */
657 *retloc = mk_RetLoc_simple(RLPri_None);
658 break;
659 case Ity_I64:
660 *retloc = mk_RetLoc_simple(RLPri_2Int);
661 break;
662 case Ity_I32: case Ity_I16: case Ity_I8:
663 *retloc = mk_RetLoc_simple(RLPri_Int);
664 break;
665 case Ity_V128:
666 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
667 *stackAdjustAfterCall = 16;
668 break;
669 case Ity_V256:
670 vassert(0); // ATC
671 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
672 *stackAdjustAfterCall = 32;
673 break;
674 default:
675 /* IR can denote other possible return types, but we don't
676 handle those here. */
677 vassert(0);
680 /* Finally, generate the call itself. This needs the *retloc value
681 set in the switch above, which is why it's at the end. */
682 callHelperAndClearArgs( env, cc, cee, n_arg_ws, *retloc );
686 /* Given a guest-state array descriptor, an index expression and a
687 bias, generate an X86AMode holding the relevant guest state
688 offset. */
690 static
691 X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
692 IRExpr* off, Int bias )
694 HReg tmp, roff;
695 Int elemSz = sizeofIRType(descr->elemTy);
696 Int nElems = descr->nElems;
697 Int shift = 0;
699 /* throw out any cases not generated by an x86 front end. In
700 theory there might be a day where we need to handle them -- if
701 we ever run non-x86-guest on x86 host. */
703 if (nElems != 8)
704 vpanic("genGuestArrayOffset(x86 host)(1)");
706 switch (elemSz) {
707 case 1: shift = 0; break;
708 case 4: shift = 2; break;
709 case 8: shift = 3; break;
710 default: vpanic("genGuestArrayOffset(x86 host)(2)");
713 /* Compute off into a reg, %off. Then return:
715 movl %off, %tmp
716 addl $bias, %tmp (if bias != 0)
717 andl $7, %tmp
718 ... base(%ebp, %tmp, 1<<shift) ...
720 tmp = newVRegI(env);
721 roff = iselIntExpr_R(env, off);
722 addInstr(env, mk_iMOVsd_RR(roff, tmp));
723 if (bias != 0) {
724 addInstr(env,
725 X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
727 addInstr(env,
728 X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
729 return
730 X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
734 /* Mess with the FPU's rounding mode: set to the default rounding mode
735 (DEFAULT_FPUCW). */
736 static
737 void set_FPU_rounding_default ( ISelEnv* env )
739 /* pushl $DEFAULT_FPUCW
740 fldcw 0(%esp)
741 addl $4, %esp
743 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
744 addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
745 addInstr(env, X86Instr_FpLdCW(zero_esp));
746 add_to_esp(env, 4);
750 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
751 expression denoting a value in the range 0 .. 3, indicating a round
752 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
753 the same rounding.
755 static
756 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
758 HReg rrm = iselIntExpr_R(env, mode);
759 HReg rrm2 = newVRegI(env);
760 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
762 /* movl %rrm, %rrm2
763 andl $3, %rrm2 -- shouldn't be needed; paranoia
764 shll $10, %rrm2
765 orl $DEFAULT_FPUCW, %rrm2
766 pushl %rrm2
767 fldcw 0(%esp)
768 addl $4, %esp
770 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
771 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
772 addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
773 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
774 addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
775 addInstr(env, X86Instr_FpLdCW(zero_esp));
776 add_to_esp(env, 4);
780 /* Generate !src into a new vector register, and be sure that the code
781 is SSE1 compatible. Amazing that Intel doesn't offer a less crappy
782 way to do this.
784 static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
786 HReg dst = newVRegV(env);
787 /* Set dst to zero. If dst contains a NaN then all hell might
788 break loose after the comparison. So, first zero it. */
789 addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
790 /* And now make it all 1s ... */
791 addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
792 /* Finally, xor 'src' into it. */
793 addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
794 /* Doesn't that just totally suck? */
795 return dst;
799 /* Round an x87 FPU value to 53-bit-mantissa precision, to be used
800 after most non-simple FPU operations (simple = +, -, *, / and
801 sqrt).
803 This could be done a lot more efficiently if needed, by loading
804 zero and adding it to the value to be rounded (fldz ; faddp?).
806 static void roundToF64 ( ISelEnv* env, HReg reg )
808 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
809 sub_from_esp(env, 8);
810 addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
811 addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
812 add_to_esp(env, 8);
816 /*---------------------------------------------------------*/
817 /*--- ISEL: Integer expressions (32/16/8 bit) ---*/
818 /*---------------------------------------------------------*/
820 /* Select insns for an integer-typed expression, and add them to the
821 code list. Return a reg holding the result. This reg will be a
822 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
823 want to modify it, ask for a new vreg, copy it in there, and modify
824 the copy. The register allocator will do its best to map both
825 vregs to the same real register, so the copies will often disappear
826 later in the game.
828 This should handle expressions of 32, 16 and 8-bit type. All
829 results are returned in a 32-bit register. For 16- and 8-bit
830 expressions, the upper 16/24 bits are arbitrary, so you should mask
831 or sign extend partial values if necessary.
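/* For example (an illustrative note, not part of the original comment):
   a caller that selects an Ity_I16 expression and then needs a clean
   32-bit value must normalise the result itself, e.g.
       andl $0xFFFF, %r              -- zero-extend, or
       shll $16, %r ; sarl $16, %r   -- sign-extend
   which is precisely what the Iop_16Uto32 / Iop_16Sto32 cases further
   down generate. */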
834 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
836 HReg r = iselIntExpr_R_wrk(env, e);
837 /* sanity checks ... */
838 # if 0
839 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
840 # endif
841 vassert(hregClass(r) == HRcInt32);
842 vassert(hregIsVirtual(r));
843 return r;
846 /* DO NOT CALL THIS DIRECTLY ! */
847 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
849 MatchInfo mi;
851 IRType ty = typeOfIRExpr(env->type_env,e);
852 vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
854 switch (e->tag) {
856 /* --------- TEMP --------- */
857 case Iex_RdTmp: {
858 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
861 /* --------- LOAD --------- */
862 case Iex_Load: {
863 HReg dst = newVRegI(env);
864 X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
866 /* We can't handle big-endian loads, nor load-linked. */
867 if (e->Iex.Load.end != Iend_LE)
868 goto irreducible;
870 if (ty == Ity_I32) {
871 addInstr(env, X86Instr_Alu32R(Xalu_MOV,
872 X86RMI_Mem(amode), dst) );
873 return dst;
875 if (ty == Ity_I16) {
876 addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
877 return dst;
879 if (ty == Ity_I8) {
880 addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
881 return dst;
883 break;
886 /* --------- TERNARY OP --------- */
887 case Iex_Triop: {
888 IRTriop *triop = e->Iex.Triop.details;
889 /* C3210 flags following FPU partial remainder (fprem), both
890 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
891 if (triop->op == Iop_PRemC3210F64
892 || triop->op == Iop_PRem1C3210F64) {
893 HReg junk = newVRegF(env);
894 HReg dst = newVRegI(env);
895 HReg srcL = iselDblExpr(env, triop->arg2);
896 HReg srcR = iselDblExpr(env, triop->arg3);
897 /* XXXROUNDINGFIXME */
898 /* set roundingmode here */
899 addInstr(env, X86Instr_FpBinary(
900 triop->op==Iop_PRemC3210F64
901 ? Xfp_PREM : Xfp_PREM1,
902 srcL,srcR,junk
904 /* The previous pseudo-insn will have left the FPU's C3210
905 flags set correctly. So bag them. */
906 addInstr(env, X86Instr_FpStSW_AX());
907 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
908 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
909 return dst;
912 break;
915 /* --------- BINARY OP --------- */
916 case Iex_Binop: {
917 X86AluOp aluOp;
918 X86ShiftOp shOp;
920 /* Pattern: Sub32(0,x) */
921 if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
922 HReg dst = newVRegI(env);
923 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
924 addInstr(env, mk_iMOVsd_RR(reg,dst));
925 addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
926 return dst;
929 /* Is it an addition or logical style op? */
930 switch (e->Iex.Binop.op) {
931 case Iop_Add8: case Iop_Add16: case Iop_Add32:
932 aluOp = Xalu_ADD; break;
933 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
934 aluOp = Xalu_SUB; break;
935 case Iop_And8: case Iop_And16: case Iop_And32:
936 aluOp = Xalu_AND; break;
937 case Iop_Or8: case Iop_Or16: case Iop_Or32:
938 aluOp = Xalu_OR; break;
939 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
940 aluOp = Xalu_XOR; break;
941 case Iop_Mul16: case Iop_Mul32:
942 aluOp = Xalu_MUL; break;
943 default:
944 aluOp = Xalu_INVALID; break;
946 /* For commutative ops we assume any literal
947 values are on the second operand. */
948 if (aluOp != Xalu_INVALID) {
949 HReg dst = newVRegI(env);
950 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
951 X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
952 addInstr(env, mk_iMOVsd_RR(reg,dst));
953 addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
954 return dst;
956 /* Could do better here; forcing the first arg into a reg
957 isn't always clever.
958 -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
959 LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
960 t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
961 movl 0xFFFFFFA0(%vr41),%vr107
962 movl 0xFFFFFFA4(%vr41),%vr108
963 movl %vr107,%vr106
964 xorl %vr108,%vr106
965 movl 0xFFFFFFA8(%vr41),%vr109
966 movl %vr106,%vr105
967 andl %vr109,%vr105
968 movl 0xFFFFFFA0(%vr41),%vr110
969 movl %vr105,%vr104
970 xorl %vr110,%vr104
971 movl %vr104,%vr70
974 /* Perhaps a shift op? */
975 switch (e->Iex.Binop.op) {
976 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
977 shOp = Xsh_SHL; break;
978 case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
979 shOp = Xsh_SHR; break;
980 case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
981 shOp = Xsh_SAR; break;
982 default:
983 shOp = Xsh_INVALID; break;
985 if (shOp != Xsh_INVALID) {
986 HReg dst = newVRegI(env);
988 /* regL = the value to be shifted */
989 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
990 addInstr(env, mk_iMOVsd_RR(regL,dst));
992 /* Do any necessary widening for 16/8 bit operands */
993 switch (e->Iex.Binop.op) {
994 case Iop_Shr8:
995 addInstr(env, X86Instr_Alu32R(
996 Xalu_AND, X86RMI_Imm(0xFF), dst));
997 break;
998 case Iop_Shr16:
999 addInstr(env, X86Instr_Alu32R(
1000 Xalu_AND, X86RMI_Imm(0xFFFF), dst));
1001 break;
1002 case Iop_Sar8:
1003 addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
1004 addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
1005 break;
1006 case Iop_Sar16:
1007 addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
1008 addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
1009 break;
1010 default: break;
1013 /* Now consider the shift amount. If it's a literal, we
1014 can do a much better job than the general case. */
1015 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1016 /* assert that the IR is well-typed */
1017 Int nshift;
1018 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1019 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1020 vassert(nshift >= 0);
1021 if (nshift > 0)
1022 /* Can't allow nshift==0 since that means %cl */
1023 addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
1024 } else {
1025 /* General case; we have to force the amount into %cl. */
1026 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1027 addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
1028 addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
1030 return dst;
1033 /* Handle misc other ops. */
1035 if (e->Iex.Binop.op == Iop_Max32U) {
1036 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1037 HReg dst = newVRegI(env);
1038 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1039 addInstr(env, mk_iMOVsd_RR(src1,dst));
1040 addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
1041 addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
1042 return dst;
1045 if (e->Iex.Binop.op == Iop_8HLto16) {
1046 HReg hi8 = newVRegI(env);
1047 HReg lo8 = newVRegI(env);
1048 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1049 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1050 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1051 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1052 addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
1053 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
1054 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
1055 return hi8;
1058 if (e->Iex.Binop.op == Iop_16HLto32) {
1059 HReg hi16 = newVRegI(env);
1060 HReg lo16 = newVRegI(env);
1061 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1062 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1063 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1064 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1065 addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
1066 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
1067 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
1068 return hi16;
1071 if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
1072 || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
1073 HReg a16 = newVRegI(env);
1074 HReg b16 = newVRegI(env);
1075 HReg a16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1076 HReg b16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1077 Int shift = (e->Iex.Binop.op == Iop_MullS8
1078 || e->Iex.Binop.op == Iop_MullU8)
1079 ? 24 : 16;
1080 X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
1081 || e->Iex.Binop.op == Iop_MullS16)
1082 ? Xsh_SAR : Xsh_SHR;
1084 addInstr(env, mk_iMOVsd_RR(a16s, a16));
1085 addInstr(env, mk_iMOVsd_RR(b16s, b16));
1086 addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
1087 addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
1088 addInstr(env, X86Instr_Sh32(shr_op, shift, a16));
1089 addInstr(env, X86Instr_Sh32(shr_op, shift, b16));
1090 addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
1091 return b16;
1094 if (e->Iex.Binop.op == Iop_CmpF64) {
1095 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1096 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1097 HReg dst = newVRegI(env);
1098 addInstr(env, X86Instr_FpCmp(fL,fR,dst));
1099 /* shift this right 8 bits so as to conform to CmpF64
1100 definition. */
1101 addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
1102 return dst;
1105 if (e->Iex.Binop.op == Iop_F64toI32S
1106 || e->Iex.Binop.op == Iop_F64toI16S) {
1107 Int sz = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
1108 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1109 HReg dst = newVRegI(env);
1111 /* Used several times ... */
1112 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1114 /* rf now holds the value to be converted, and rrm holds the
1115 rounding mode value, encoded as per the IRRoundingMode
1116 enum. The first thing to do is set the FPU's rounding
1117 mode accordingly. */
1119 /* Create a space for the format conversion. */
1120 /* subl $4, %esp */
1121 sub_from_esp(env, 4);
1123 /* Set host rounding mode */
1124 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
1126 /* gistw/l %rf, 0(%esp) */
1127 addInstr(env, X86Instr_FpLdStI(False/*store*/,
1128 toUChar(sz), rf, zero_esp));
1130 if (sz == 2) {
1131 /* movzwl 0(%esp), %dst */
1132 addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
1133 } else {
1134 /* movl 0(%esp), %dst */
1135 vassert(sz == 4);
1136 addInstr(env, X86Instr_Alu32R(
1137 Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1140 /* Restore default FPU rounding. */
1141 set_FPU_rounding_default( env );
1143 /* addl $4, %esp */
1144 add_to_esp(env, 4);
1145 return dst;
1148 break;
1151 /* --------- UNARY OP --------- */
1152 case Iex_Unop: {
1154 /* 1Uto8(32to1(expr32)) */
1155 if (e->Iex.Unop.op == Iop_1Uto8) {
1156 DECLARE_PATTERN(p_32to1_then_1Uto8);
1157 DEFINE_PATTERN(p_32to1_then_1Uto8,
1158 unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
1159 if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
1160 const IRExpr* expr32 = mi.bindee[0];
1161 HReg dst = newVRegI(env);
1162 HReg src = iselIntExpr_R(env, expr32);
1163 addInstr(env, mk_iMOVsd_RR(src,dst) );
1164 addInstr(env, X86Instr_Alu32R(Xalu_AND,
1165 X86RMI_Imm(1), dst));
1166 return dst;
1170 /* 8Uto32(LDle(expr32)) */
1171 if (e->Iex.Unop.op == Iop_8Uto32) {
1172 DECLARE_PATTERN(p_LDle8_then_8Uto32);
1173 DEFINE_PATTERN(p_LDle8_then_8Uto32,
1174 unop(Iop_8Uto32,
1175 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1176 if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
1177 HReg dst = newVRegI(env);
1178 X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1179 addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1180 return dst;
1184 /* 8Sto32(LDle(expr32)) */
1185 if (e->Iex.Unop.op == Iop_8Sto32) {
1186 DECLARE_PATTERN(p_LDle8_then_8Sto32);
1187 DEFINE_PATTERN(p_LDle8_then_8Sto32,
1188 unop(Iop_8Sto32,
1189 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1190 if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
1191 HReg dst = newVRegI(env);
1192 X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1193 addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
1194 return dst;
1198 /* 16Uto32(LDle(expr32)) */
1199 if (e->Iex.Unop.op == Iop_16Uto32) {
1200 DECLARE_PATTERN(p_LDle16_then_16Uto32);
1201 DEFINE_PATTERN(p_LDle16_then_16Uto32,
1202 unop(Iop_16Uto32,
1203 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1204 if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
1205 HReg dst = newVRegI(env);
1206 X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1207 addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1208 return dst;
1212 /* 8Uto32(GET:I8) */
1213 if (e->Iex.Unop.op == Iop_8Uto32) {
1214 if (e->Iex.Unop.arg->tag == Iex_Get) {
1215 HReg dst;
1216 X86AMode* amode;
1217 vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
1218 dst = newVRegI(env);
1219 amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1220 hregX86_EBP());
1221 addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
1222 return dst;
1226 /* 16Uto32(GET:I16) */
1227 if (e->Iex.Unop.op == Iop_16Uto32) {
1228 if (e->Iex.Unop.arg->tag == Iex_Get) {
1229 HReg dst;
1230 X86AMode* amode;
1231 vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
1232 dst = newVRegI(env);
1233 amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1234 hregX86_EBP());
1235 addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
1236 return dst;
1240 switch (e->Iex.Unop.op) {
1241 case Iop_8Uto16:
1242 case Iop_8Uto32:
1243 case Iop_16Uto32: {
1244 HReg dst = newVRegI(env);
1245 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1246 UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
1247 addInstr(env, mk_iMOVsd_RR(src,dst) );
1248 addInstr(env, X86Instr_Alu32R(Xalu_AND,
1249 X86RMI_Imm(mask), dst));
1250 return dst;
1252 case Iop_8Sto16:
1253 case Iop_8Sto32:
1254 case Iop_16Sto32: {
1255 HReg dst = newVRegI(env);
1256 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1257 UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
1258 addInstr(env, mk_iMOVsd_RR(src,dst) );
1259 addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
1260 addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
1261 return dst;
1263 case Iop_Not8:
1264 case Iop_Not16:
1265 case Iop_Not32: {
1266 HReg dst = newVRegI(env);
1267 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1268 addInstr(env, mk_iMOVsd_RR(src,dst) );
1269 addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
1270 return dst;
1272 case Iop_64HIto32: {
1273 HReg rHi, rLo;
1274 iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1275 return rHi; /* and abandon rLo .. poor wee thing :-) */
1277 case Iop_64to32: {
1278 HReg rHi, rLo;
1279 iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1280 return rLo; /* similar stupid comment to the above ... */
1282 case Iop_16HIto8:
1283 case Iop_32HIto16: {
1284 HReg dst = newVRegI(env);
1285 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1286 Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
1287 addInstr(env, mk_iMOVsd_RR(src,dst) );
1288 addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
1289 return dst;
1291 case Iop_1Uto32:
1292 case Iop_1Uto8: {
1293 HReg dst = newVRegI(env);
1294 X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1295 addInstr(env, X86Instr_Set32(cond,dst));
1296 return dst;
1298 case Iop_1Sto8:
1299 case Iop_1Sto16:
1300 case Iop_1Sto32: {
1301 /* could do better than this, but for now ... */
1302 HReg dst = newVRegI(env);
1303 X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1304 addInstr(env, X86Instr_Set32(cond,dst));
1305 addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
1306 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1307 return dst;
1309 case Iop_Ctz32: {
1310 /* Count trailing zeroes, implemented by x86 'bsfl' */
1311 HReg dst = newVRegI(env);
1312 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1313 addInstr(env, X86Instr_Bsfr32(True,src,dst));
1314 return dst;
1316 case Iop_Clz32: {
1317 /* Count leading zeroes. Do 'bsrl' to establish the index
1318 of the highest set bit, and subtract that value from
1319 31. */
1320 HReg tmp = newVRegI(env);
1321 HReg dst = newVRegI(env);
1322 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1323 addInstr(env, X86Instr_Bsfr32(False,src,tmp));
1324 addInstr(env, X86Instr_Alu32R(Xalu_MOV,
1325 X86RMI_Imm(31), dst));
1326 addInstr(env, X86Instr_Alu32R(Xalu_SUB,
1327 X86RMI_Reg(tmp), dst));
1328 return dst;
1331 case Iop_CmpwNEZ32: {
1332 HReg dst = newVRegI(env);
1333 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1334 addInstr(env, mk_iMOVsd_RR(src,dst));
1335 addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
1336 addInstr(env, X86Instr_Alu32R(Xalu_OR,
1337 X86RMI_Reg(src), dst));
1338 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
1339 return dst;
1341 case Iop_Left8:
1342 case Iop_Left16:
1343 case Iop_Left32: {
1344 HReg dst = newVRegI(env);
1345 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1346 addInstr(env, mk_iMOVsd_RR(src, dst));
1347 addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
1348 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
1349 return dst;
1352 case Iop_V128to32: {
1353 HReg dst = newVRegI(env);
1354 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1355 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
1356 sub_from_esp(env, 16);
1357 addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
1358 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
1359 add_to_esp(env, 16);
1360 return dst;
1363 /* ReinterpF32asI32(e) */
1364 /* Given an IEEE754 single, produce an I32 with the same bit
1365 pattern. Keep stack 8-aligned even though only using 4
1366 bytes. */
1367 case Iop_ReinterpF32asI32: {
1368 HReg rf = iselFltExpr(env, e->Iex.Unop.arg);
1369 HReg dst = newVRegI(env);
1370 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
1371 /* paranoia */
1372 set_FPU_rounding_default(env);
1373 /* subl $8, %esp */
1374 sub_from_esp(env, 8);
1375 /* gstF %rf, 0(%esp) */
1376 addInstr(env,
1377 X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
1378 /* movl 0(%esp), %dst */
1379 addInstr(env,
1380 X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
1381 /* addl $8, %esp */
1382 add_to_esp(env, 8);
1383 return dst;
1386 case Iop_16to8:
1387 case Iop_32to8:
1388 case Iop_32to16:
1389 /* These are no-ops. */
1390 return iselIntExpr_R(env, e->Iex.Unop.arg);
1392 case Iop_GetMSBs8x8: {
1393 /* Note: the following assumes the helper is of
1394 signature
1395 UInt fn ( ULong ), and is not a regparm fn.
1397 HReg xLo, xHi;
1398 HReg dst = newVRegI(env);
1399 Addr fn = (Addr)h_generic_calc_GetMSBs8x8;
1400 iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
1401 addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
1402 addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
1403 addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
1404 0, mk_RetLoc_simple(RLPri_Int) ));
1405 add_to_esp(env, 2*4);
1406 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1407 return dst;
1410 default:
1411 break;
1413 break;
1416 /* --------- GET --------- */
1417 case Iex_Get: {
1418 if (ty == Ity_I32) {
1419 HReg dst = newVRegI(env);
1420 addInstr(env, X86Instr_Alu32R(
1421 Xalu_MOV,
1422 X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1423 hregX86_EBP())),
1424 dst));
1425 return dst;
1427 if (ty == Ity_I8 || ty == Ity_I16) {
1428 HReg dst = newVRegI(env);
1429 addInstr(env, X86Instr_LoadEX(
1430 toUChar(ty==Ity_I8 ? 1 : 2),
1431 False,
1432 X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
1433 dst));
1434 return dst;
1436 break;
1439 case Iex_GetI: {
1440 X86AMode* am
1441 = genGuestArrayOffset(
1442 env, e->Iex.GetI.descr,
1443 e->Iex.GetI.ix, e->Iex.GetI.bias );
1444 HReg dst = newVRegI(env);
1445 if (ty == Ity_I8) {
1446 addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
1447 return dst;
1449 if (ty == Ity_I32) {
1450 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
1451 return dst;
1453 break;
1456 /* --------- CCALL --------- */
1457 case Iex_CCall: {
1458 HReg dst = newVRegI(env);
1459 vassert(ty == e->Iex.CCall.retty);
1461 /* be very restrictive for now. Only 32/64-bit ints allowed for
1462 args, and 32 bits for return type. Don't forget to change
1463 the RetLoc if more return types are allowed in future. */
1464 if (e->Iex.CCall.retty != Ity_I32)
1465 goto irreducible;
1467 /* Marshal args, do the call, clear stack. */
1468 UInt addToSp = 0;
1469 RetLoc rloc = mk_RetLoc_INVALID();
1470 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1471 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1472 vassert(is_sane_RetLoc(rloc));
1473 vassert(rloc.pri == RLPri_Int);
1474 vassert(addToSp == 0);
1476 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
1477 return dst;
1480 /* --------- LITERAL --------- */
1481 /* 32/16/8-bit literals */
1482 case Iex_Const: {
1483 X86RMI* rmi = iselIntExpr_RMI ( env, e );
1484 HReg r = newVRegI(env);
1485 addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
1486 return r;
1489 /* --------- MULTIPLEX --------- */
1490 case Iex_ITE: { // VFD
1491 if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1492 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1493 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1494 X86RM* r0 = iselIntExpr_RM(env, e->Iex.ITE.iffalse);
1495 HReg dst = newVRegI(env);
1496 addInstr(env, mk_iMOVsd_RR(r1,dst));
1497 X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1498 addInstr(env, X86Instr_CMov32(cc ^ 1, r0, dst));
1499 return dst;
1501 break;
1504 default:
1505 break;
1506 } /* switch (e->tag) */
1508 /* We get here if no pattern matched. */
1509 irreducible:
1510 ppIRExpr(e);
1511 vpanic("iselIntExpr_R: cannot reduce tree");
1515 /*---------------------------------------------------------*/
1516 /*--- ISEL: Integer expression auxiliaries ---*/
1517 /*---------------------------------------------------------*/
1519 /* --------------------- AMODEs --------------------- */
1521 /* Return an AMode which computes the value of the specified
1522 expression, possibly also adding insns to the code list as a
1523 result. The expression may only be a 32-bit one.
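/* A worked example (assumed IR, not from this file): the address tree
       Add32( Add32(t7, Shl32(t8, 0x2:I8)), 0x40:I32 )
   matches the first pattern below and collapses to the single amode
       X86AMode_IRRS(0x40, r7, r8, 2)   i.e.  0x40(%r7,%r8,4)
   with no instructions emitted beyond computing t7 and t8 into r7/r8. */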
1526 static Bool sane_AMode ( X86AMode* am )
1528 switch (am->tag) {
1529 case Xam_IR:
1530 return
1531 toBool( hregClass(am->Xam.IR.reg) == HRcInt32
1532 && (hregIsVirtual(am->Xam.IR.reg)
1533 || sameHReg(am->Xam.IR.reg, hregX86_EBP())) );
1534 case Xam_IRRS:
1535 return
1536 toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
1537 && hregIsVirtual(am->Xam.IRRS.base)
1538 && hregClass(am->Xam.IRRS.index) == HRcInt32
1539 && hregIsVirtual(am->Xam.IRRS.index) );
1540 default:
1541 vpanic("sane_AMode: unknown x86 amode tag");
1545 static X86AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
1547 X86AMode* am = iselIntExpr_AMode_wrk(env, e);
1548 vassert(sane_AMode(am));
1549 return am;
1552 /* DO NOT CALL THIS DIRECTLY ! */
1553 static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
1555 IRType ty = typeOfIRExpr(env->type_env,e);
1556 vassert(ty == Ity_I32);
1558 /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
1559 if (e->tag == Iex_Binop
1560 && e->Iex.Binop.op == Iop_Add32
1561 && e->Iex.Binop.arg2->tag == Iex_Const
1562 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
1563 && e->Iex.Binop.arg1->tag == Iex_Binop
1564 && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
1565 && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
1566 && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1567 && e->Iex.Binop.arg1
1568 ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1569 && e->Iex.Binop.arg1
1570 ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1571 UInt shift = e->Iex.Binop.arg1
1572 ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1573 UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1574 if (shift == 1 || shift == 2 || shift == 3) {
1575 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
1576 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
1577 ->Iex.Binop.arg2->Iex.Binop.arg1 );
1578 return X86AMode_IRRS(imm32, r1, r2, shift);
1582 /* Add32(expr1, Shl32(expr2, imm)) */
1583 if (e->tag == Iex_Binop
1584 && e->Iex.Binop.op == Iop_Add32
1585 && e->Iex.Binop.arg2->tag == Iex_Binop
1586 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
1587 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1588 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1589 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1590 if (shift == 1 || shift == 2 || shift == 3) {
1591 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1592 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1593 return X86AMode_IRRS(0, r1, r2, shift);
1597 /* Add32(expr,i) */
1598 if (e->tag == Iex_Binop
1599 && e->Iex.Binop.op == Iop_Add32
1600 && e->Iex.Binop.arg2->tag == Iex_Const
1601 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
1602 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1603 return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
1606 /* Doesn't match anything in particular. Generate it into
1607 a register and use that. */
1609 HReg r1 = iselIntExpr_R(env, e);
1610 return X86AMode_IR(0, r1);
1615 /* --------------------- RMIs --------------------- */
1617 /* Similarly, calculate an expression into an X86RMI operand. As with
1618 iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1620 static X86RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
1622 X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
1623 /* sanity checks ... */
1624 switch (rmi->tag) {
1625 case Xrmi_Imm:
1626 return rmi;
1627 case Xrmi_Reg:
1628 vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
1629 vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
1630 return rmi;
1631 case Xrmi_Mem:
1632 vassert(sane_AMode(rmi->Xrmi.Mem.am));
1633 return rmi;
1634 default:
1635 vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
1639 /* DO NOT CALL THIS DIRECTLY ! */
1640 static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
1642 IRType ty = typeOfIRExpr(env->type_env,e);
1643 vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1645 /* special case: immediate */
1646 if (e->tag == Iex_Const) {
1647 UInt u;
1648 switch (e->Iex.Const.con->tag) {
1649 case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1650 case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1651 case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
1652 default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
1654 return X86RMI_Imm(u);
1657 /* special case: 32-bit GET */
1658 if (e->tag == Iex_Get && ty == Ity_I32) {
1659 return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
1660 hregX86_EBP()));
1663 /* special case: 32-bit load from memory */
1664 if (e->tag == Iex_Load && ty == Ity_I32
1665 && e->Iex.Load.end == Iend_LE) {
1666 X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
1667 return X86RMI_Mem(am);
1670 /* default case: calculate into a register and return that */
1672 HReg r = iselIntExpr_R ( env, e );
1673 return X86RMI_Reg(r);
1678 /* --------------------- RIs --------------------- */
1680 /* Calculate an expression into an X86RI operand. As with
1681 iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1683 static X86RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
1685 X86RI* ri = iselIntExpr_RI_wrk(env, e);
1686 /* sanity checks ... */
1687 switch (ri->tag) {
1688 case Xri_Imm:
1689 return ri;
1690 case Xri_Reg:
1691 vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
1692 vassert(hregIsVirtual(ri->Xri.Reg.reg));
1693 return ri;
1694 default:
1695 vpanic("iselIntExpr_RI: unknown x86 RI tag");
1699 /* DO NOT CALL THIS DIRECTLY ! */
1700 static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
1702 IRType ty = typeOfIRExpr(env->type_env,e);
1703 vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1705 /* special case: immediate */
1706 if (e->tag == Iex_Const) {
1707 UInt u;
1708 switch (e->Iex.Const.con->tag) {
1709 case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
1710 case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
1711 case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
1712 default: vpanic("iselIntExpr_RI.Iex_Const(x86h)");
1714 return X86RI_Imm(u);
1717 /* default case: calculate into a register and return that */
1719 HReg r = iselIntExpr_R ( env, e );
1720 return X86RI_Reg(r);
1725 /* --------------------- RMs --------------------- */
1727 /* Similarly, calculate an expression into an X86RM operand. As with
1728 iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
1730 static X86RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
1732 X86RM* rm = iselIntExpr_RM_wrk(env, e);
1733 /* sanity checks ... */
1734 switch (rm->tag) {
1735 case Xrm_Reg:
1736 vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
1737 vassert(hregIsVirtual(rm->Xrm.Reg.reg));
1738 return rm;
1739 case Xrm_Mem:
1740 vassert(sane_AMode(rm->Xrm.Mem.am));
1741 return rm;
1742 default:
1743 vpanic("iselIntExpr_RM: unknown x86 RM tag");
1747 /* DO NOT CALL THIS DIRECTLY ! */
1748 static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
1750 IRType ty = typeOfIRExpr(env->type_env,e);
1751 vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1753 /* special case: 32-bit GET */
1754 if (e->tag == Iex_Get && ty == Ity_I32) {
1755 return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
1756 hregX86_EBP()));
1759 /* special case: load from memory */
1761 /* default case: calculate into a register and return that */
1763 HReg r = iselIntExpr_R ( env, e );
1764 return X86RM_Reg(r);
1769 /* --------------------- CONDCODE --------------------- */
1771 /* Generate code to evaluate a bit-typed expression, returning the
1772 condition code which would correspond to the expression notionally
1773 having returned 1. */
1775 static X86CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
1777 /* Uh, there's nothing we can sanity check here, unfortunately. */
1778 return iselCondCode_wrk(env,e);
1781 /* DO NOT CALL THIS DIRECTLY ! */
1782 static X86CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
1784 MatchInfo mi;
1786 vassert(e);
1787 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
1789 /* var */
1790 if (e->tag == Iex_RdTmp) {
1791 HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
1792 /* Test32 doesn't modify r32; so this is OK. */
1793 addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
1794 return Xcc_NZ;
1797 /* Constant 1:Bit */
1798 if (e->tag == Iex_Const) {
1799 HReg r;
1800 vassert(e->Iex.Const.con->tag == Ico_U1);
1801 vassert(e->Iex.Const.con->Ico.U1 == True
1802 || e->Iex.Const.con->Ico.U1 == False);
1803 r = newVRegI(env);
1804 addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
1805 addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
1806 return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
1809 /* Not1(e) */
1810 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
1811 /* Generate code for the arg, and negate the test condition */
1812 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
1815 /* --- patterns rooted at: 32to1 --- */
1817 if (e->tag == Iex_Unop
1818 && e->Iex.Unop.op == Iop_32to1) {
1819 X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1820 addInstr(env, X86Instr_Test32(1,rm));
1821 return Xcc_NZ;
1824 /* --- patterns rooted at: CmpNEZ8 --- */
1826 /* CmpNEZ8(x) */
1827 if (e->tag == Iex_Unop
1828 && e->Iex.Unop.op == Iop_CmpNEZ8) {
1829 X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1830 addInstr(env, X86Instr_Test32(0xFF,rm));
1831 return Xcc_NZ;
1834 /* --- patterns rooted at: CmpNEZ16 --- */
1836 /* CmpNEZ16(x) */
1837 if (e->tag == Iex_Unop
1838 && e->Iex.Unop.op == Iop_CmpNEZ16) {
1839 X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
1840 addInstr(env, X86Instr_Test32(0xFFFF,rm));
1841 return Xcc_NZ;
1844 /* --- patterns rooted at: CmpNEZ32 --- */
1846 /* CmpNEZ32(And32(x,y)) */
1848 DECLARE_PATTERN(p_CmpNEZ32_And32);
1849 DEFINE_PATTERN(p_CmpNEZ32_And32,
1850 unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
1851 if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
1852 HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
1853 X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1854 HReg tmp = newVRegI(env);
1855 addInstr(env, mk_iMOVsd_RR(r0, tmp));
1856 addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
1857 return Xcc_NZ;
1861 /* CmpNEZ32(Or32(x,y)) */
1863 DECLARE_PATTERN(p_CmpNEZ32_Or32);
1864 DEFINE_PATTERN(p_CmpNEZ32_Or32,
1865 unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
1866 if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
1867 HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
1868 X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
1869 HReg tmp = newVRegI(env);
1870 addInstr(env, mk_iMOVsd_RR(r0, tmp));
1871 addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
1872 return Xcc_NZ;
1876 /* CmpNEZ32(GET(..):I32) */
1877 if (e->tag == Iex_Unop
1878 && e->Iex.Unop.op == Iop_CmpNEZ32
1879 && e->Iex.Unop.arg->tag == Iex_Get) {
1880 X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
1881 hregX86_EBP());
1882 addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
1883 return Xcc_NZ;
1886 /* CmpNEZ32(x) */
1887 if (e->tag == Iex_Unop
1888 && e->Iex.Unop.op == Iop_CmpNEZ32) {
1889 HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
1890 X86RMI* rmi2 = X86RMI_Imm(0);
1891 addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
1892 return Xcc_NZ;
1895 /* --- patterns rooted at: CmpNEZ64 --- */
1897 /* CmpNEZ64(Or64(x,y)) */
1899 DECLARE_PATTERN(p_CmpNEZ64_Or64);
1900 DEFINE_PATTERN(p_CmpNEZ64_Or64,
1901 unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
1902 if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
1903 HReg hi1, lo1, hi2, lo2;
1904 HReg tmp = newVRegI(env);
1905 iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
1906 addInstr(env, mk_iMOVsd_RR(hi1, tmp));
1907 addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
1908 iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
1909 addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
1910 addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
1911 return Xcc_NZ;
1915 /* CmpNEZ64(x) */
1916 if (e->tag == Iex_Unop
1917 && e->Iex.Unop.op == Iop_CmpNEZ64) {
1918 HReg hi, lo;
1919 HReg tmp = newVRegI(env);
1920 iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
1921 addInstr(env, mk_iMOVsd_RR(hi, tmp));
1922 addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
1923 return Xcc_NZ;
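/* (Editor's note, illustrative only: on this 32-bit host a 64-bit
   nonzero test reduces to OR-ing the two halves together, roughly
      nonzero = ((hi | lo) != 0);
   hence the single OR above followed by the Xcc_NZ condition.) */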
1926 /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
1928 /* CmpEQ8 / CmpNE8 */
1929 if (e->tag == Iex_Binop
1930 && (e->Iex.Binop.op == Iop_CmpEQ8
1931 || e->Iex.Binop.op == Iop_CmpNE8
1932 || e->Iex.Binop.op == Iop_CasCmpEQ8
1933 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
1934 if (isZeroU8(e->Iex.Binop.arg2)) {
1935 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1936 addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
1937 switch (e->Iex.Binop.op) {
1938 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1939 case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1940 default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
1942 } else {
1943 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1944 X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1945 HReg r = newVRegI(env);
1946 addInstr(env, mk_iMOVsd_RR(r1,r));
1947 addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1948 addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
1949 switch (e->Iex.Binop.op) {
1950 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
1951 case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
1952 default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
1957 /* CmpEQ16 / CmpNE16 */
1958 if (e->tag == Iex_Binop
1959 && (e->Iex.Binop.op == Iop_CmpEQ16
1960 || e->Iex.Binop.op == Iop_CmpNE16
1961 || e->Iex.Binop.op == Iop_CasCmpEQ16
1962 || e->Iex.Binop.op == Iop_CasCmpNE16
1963 || e->Iex.Binop.op == Iop_ExpCmpNE16)) {
1964 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1965 X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1966 HReg r = newVRegI(env);
1967 addInstr(env, mk_iMOVsd_RR(r1,r));
1968 addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
1969 addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
1970 switch (e->Iex.Binop.op) {
1971 case Iop_CmpEQ16: case Iop_CasCmpEQ16:
1972 return Xcc_Z;
1973 case Iop_CmpNE16: case Iop_CasCmpNE16: case Iop_ExpCmpNE16:
1974 return Xcc_NZ;
1975 default:
1976 vpanic("iselCondCode(x86): CmpXX16");
1980 /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
1981 Saves a "movl %eax, %tmp" compared to the default route. */
1982 if (e->tag == Iex_Binop
1983 && e->Iex.Binop.op == Iop_CmpNE32
1984 && e->Iex.Binop.arg1->tag == Iex_CCall
1985 && e->Iex.Binop.arg2->tag == Iex_Const) {
1986 IRExpr* cal = e->Iex.Binop.arg1;
1987 IRExpr* con = e->Iex.Binop.arg2;
1988 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
1989 vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
1990 vassert(con->Iex.Const.con->tag == Ico_U32);
1991 /* Marshal args, do the call. */
1992 UInt addToSp = 0;
1993 RetLoc rloc = mk_RetLoc_INVALID();
1994 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1995 cal->Iex.CCall.cee,
1996 cal->Iex.CCall.retty, cal->Iex.CCall.args );
1997 vassert(is_sane_RetLoc(rloc));
1998 vassert(rloc.pri == RLPri_Int);
1999 vassert(addToSp == 0);
2000 /* */
2001 addInstr(env, X86Instr_Alu32R(Xalu_CMP,
2002 X86RMI_Imm(con->Iex.Const.con->Ico.U32),
2003 hregX86_EAX()));
2004 return Xcc_NZ;
2007 /* Cmp*32*(x,y) */
2008 if (e->tag == Iex_Binop
2009 && (e->Iex.Binop.op == Iop_CmpEQ32
2010 || e->Iex.Binop.op == Iop_CmpNE32
2011 || e->Iex.Binop.op == Iop_CmpLT32S
2012 || e->Iex.Binop.op == Iop_CmpLT32U
2013 || e->Iex.Binop.op == Iop_CmpLE32S
2014 || e->Iex.Binop.op == Iop_CmpLE32U
2015 || e->Iex.Binop.op == Iop_CasCmpEQ32
2016 || e->Iex.Binop.op == Iop_CasCmpNE32
2017 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2018 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2019 X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2020 addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
2021 switch (e->Iex.Binop.op) {
2022 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
2023 case Iop_CmpNE32:
2024 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Xcc_NZ;
2025 case Iop_CmpLT32S: return Xcc_L;
2026 case Iop_CmpLT32U: return Xcc_B;
2027 case Iop_CmpLE32S: return Xcc_LE;
2028 case Iop_CmpLE32U: return Xcc_BE;
2029 default: vpanic("iselCondCode(x86): CmpXX32");
2033 /* CmpNE64 */
2034 if (e->tag == Iex_Binop
2035 && (e->Iex.Binop.op == Iop_CmpNE64
2036 || e->Iex.Binop.op == Iop_CmpEQ64)) {
2037 HReg hi1, hi2, lo1, lo2;
2038 HReg tHi = newVRegI(env);
2039 HReg tLo = newVRegI(env);
2040 iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
2041 iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
2042 addInstr(env, mk_iMOVsd_RR(hi1, tHi));
2043 addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
2044 addInstr(env, mk_iMOVsd_RR(lo1, tLo));
2045 addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
2046 addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
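/* (Editor's note, illustrative only: 64-bit equality on a 32-bit host
   is computed as
      ((aHi ^ bHi) | (aLo ^ bLo)) == 0
   which is what the two XORs and the final OR above implement; the
   switch below then picks Xcc_Z for CmpEQ64 and Xcc_NZ for CmpNE64.) */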
2047 switch (e->Iex.Binop.op) {
2048 case Iop_CmpNE64: return Xcc_NZ;
2049 case Iop_CmpEQ64: return Xcc_Z;
2050 default: vpanic("iselCondCode(x86): CmpXX64");
2054 /* And1(x,y), Or1(x,y) */
2055 /* FIXME: We could (and probably should) do a lot better here. If both args
2056 are in temps already then we can just emit a reg-reg And/Or directly,
2057 followed by the final Test. */
2058 if (e->tag == Iex_Binop
2059 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2060 // We could probably be cleverer about this. In the meantime ..
2061 HReg x_as_32 = newVRegI(env);
2062 X86CondCode cc_x = iselCondCode(env, e->Iex.Binop.arg1);
2063 addInstr(env, X86Instr_Set32(cc_x, x_as_32));
2064 HReg y_as_32 = newVRegI(env);
2065 X86CondCode cc_y = iselCondCode(env, e->Iex.Binop.arg2);
2066 addInstr(env, X86Instr_Set32(cc_y, y_as_32));
2067 X86AluOp aop = e->Iex.Binop.op == Iop_And1 ? Xalu_AND : Xalu_OR;
2068 addInstr(env, X86Instr_Alu32R(aop, X86RMI_Reg(x_as_32), y_as_32));
2069 addInstr(env, X86Instr_Test32(1, X86RM_Reg(y_as_32)));
2070 return Xcc_NZ;
2073 ppIRExpr(e);
2074 vpanic("iselCondCode");
2078 /*---------------------------------------------------------*/
2079 /*--- ISEL: Integer expressions (64 bit) ---*/
2080 /*---------------------------------------------------------*/
2082 /* Compute a 64-bit value into a register pair, which is returned as
2083 the first two parameters. As with iselIntExpr_R, these may be
2084 either real or virtual regs; in any case they must not be changed
2085 by subsequent code emitted by the caller. */
2087 static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env,
2088 const IRExpr* e )
2090 iselInt64Expr_wrk(rHi, rLo, env, e);
2091 # if 0
2092 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2093 # endif
2094 vassert(hregClass(*rHi) == HRcInt32);
2095 vassert(hregIsVirtual(*rHi));
2096 vassert(hregClass(*rLo) == HRcInt32);
2097 vassert(hregIsVirtual(*rLo));
2100 /* DO NOT CALL THIS DIRECTLY ! */
2101 static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env,
2102 const IRExpr* e )
2104 MatchInfo mi;
2105 HWord fn = 0; /* helper fn for most SIMD64 stuff */
2106 vassert(e);
2107 vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
2109 /* 64-bit literal */
2110 if (e->tag == Iex_Const) {
2111 ULong w64 = e->Iex.Const.con->Ico.U64;
2112 UInt wHi = toUInt(w64 >> 32);
2113 UInt wLo = toUInt(w64);
2114 HReg tLo = newVRegI(env);
2115 HReg tHi = newVRegI(env);
2116 vassert(e->Iex.Const.con->tag == Ico_U64);
2117 if (wLo == wHi) {
2118 /* Save a precious Int register in this special case. */
2119 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2120 *rHi = tLo;
2121 *rLo = tLo;
2122 } else {
2123 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
2124 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
2125 *rHi = tHi;
2126 *rLo = tLo;
2128 return;
2131 /* read 64-bit IRTemp */
2132 if (e->tag == Iex_RdTmp) {
2133 lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
2134 return;
2137 /* 64-bit load */
2138 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2139 HReg tLo, tHi;
2140 X86AMode *am0, *am4;
2141 vassert(e->Iex.Load.ty == Ity_I64);
2142 tLo = newVRegI(env);
2143 tHi = newVRegI(env);
2144 am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
2145 am4 = advance4(am0);
2146 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
2147 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2148 *rHi = tHi;
2149 *rLo = tLo;
2150 return;
2153 /* 64-bit GET */
2154 if (e->tag == Iex_Get) {
2155 X86AMode* am = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
2156 X86AMode* am4 = advance4(am);
2157 HReg tLo = newVRegI(env);
2158 HReg tHi = newVRegI(env);
2159 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2160 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2161 *rHi = tHi;
2162 *rLo = tLo;
2163 return;
2166 /* 64-bit GETI */
2167 if (e->tag == Iex_GetI) {
2168 X86AMode* am
2169 = genGuestArrayOffset( env, e->Iex.GetI.descr,
2170 e->Iex.GetI.ix, e->Iex.GetI.bias );
2171 X86AMode* am4 = advance4(am);
2172 HReg tLo = newVRegI(env);
2173 HReg tHi = newVRegI(env);
2174 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
2175 addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
2176 *rHi = tHi;
2177 *rLo = tLo;
2178 return;
2181 /* 64-bit ITE: ITE(g, expr, expr) */ // VFD
2182 if (e->tag == Iex_ITE) {
2183 HReg e0Lo, e0Hi, e1Lo, e1Hi;
2184 HReg tLo = newVRegI(env);
2185 HReg tHi = newVRegI(env);
2186 iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.ITE.iffalse);
2187 iselInt64Expr(&e1Hi, &e1Lo, env, e->Iex.ITE.iftrue);
2188 addInstr(env, mk_iMOVsd_RR(e1Hi, tHi));
2189 addInstr(env, mk_iMOVsd_RR(e1Lo, tLo));
2190 X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
2191 /* This assumes the first cmov32 doesn't trash the condition
2192 codes, so they are still available for the second cmov32 */
2193 addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Hi), tHi));
2194 addInstr(env, X86Instr_CMov32(cc ^ 1, X86RM_Reg(e0Lo), tLo));
2195 *rHi = tHi;
2196 *rLo = tLo;
2197 return;
2200 /* --------- BINARY ops --------- */
2201 if (e->tag == Iex_Binop) {
2202 switch (e->Iex.Binop.op) {
2203 /* 32 x 32 -> 64 multiply */
2204 case Iop_MullU32:
2205 case Iop_MullS32: {
2206 /* Get one operand into %eax, and the other into an R/M.  We need
2207 to make an educated guess about which placement is better for
2208 which operand. */
2209 HReg tLo = newVRegI(env);
2210 HReg tHi = newVRegI(env);
2211 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS32);
2212 X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2213 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2214 addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
2215 addInstr(env, X86Instr_MulL(syned, rmLeft));
2216 /* Result is now in EDX:EAX. Tell the caller. */
2217 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2218 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2219 *rHi = tHi;
2220 *rLo = tLo;
2221 return;
2224 /* 64 x 32 -> (32(rem),32(div)) division */
2225 case Iop_DivModU64to32:
2226 case Iop_DivModS64to32: {
2227 /* Get the 64-bit operand into edx:eax, and the other into
2228 any old R/M. */
2229 HReg sHi, sLo;
2230 HReg tLo = newVRegI(env);
2231 HReg tHi = newVRegI(env);
2232 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
2233 X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2234 iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2235 addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
2236 addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
2237 addInstr(env, X86Instr_Div(syned, rmRight));
2238 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2239 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2240 *rHi = tHi;
2241 *rLo = tLo;
2242 return;
2245 /* Or64/And64/Xor64 */
2246 case Iop_Or64:
2247 case Iop_And64:
2248 case Iop_Xor64: {
2249 HReg xLo, xHi, yLo, yHi;
2250 HReg tLo = newVRegI(env);
2251 HReg tHi = newVRegI(env);
2252 X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
2253 : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
2254 : Xalu_XOR;
2255 iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2256 iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2257 addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2258 addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
2259 addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2260 addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
2261 *rHi = tHi;
2262 *rLo = tLo;
2263 return;
2266 /* Add64/Sub64 */
2267 case Iop_Add64:
2268 if (e->Iex.Binop.arg2->tag == Iex_Const) {
2269 /* special case Add64(e, const) */
2270 ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2271 UInt wHi = toUInt(w64 >> 32);
2272 UInt wLo = toUInt(w64);
2273 HReg tLo = newVRegI(env);
2274 HReg tHi = newVRegI(env);
2275 HReg xLo, xHi;
2276 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
2277 iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2278 addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2279 addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2280 addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
2281 addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
2282 *rHi = tHi;
2283 *rLo = tLo;
2284 return;
2286 /* else fall through to the generic case */
2287 case Iop_Sub64: {
2288 HReg xLo, xHi, yLo, yHi;
2289 HReg tLo = newVRegI(env);
2290 HReg tHi = newVRegI(env);
2291 iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2292 addInstr(env, mk_iMOVsd_RR(xHi, tHi));
2293 addInstr(env, mk_iMOVsd_RR(xLo, tLo));
2294 iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2295 if (e->Iex.Binop.op==Iop_Add64) {
2296 addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
2297 addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
2298 } else {
2299 addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2300 addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2302 *rHi = tHi;
2303 *rLo = tLo;
2304 return;
2307 /* 32HLto64(e1,e2) */
2308 case Iop_32HLto64:
2309 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2310 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2311 return;
2313 /* 64-bit shifts */
2314 case Iop_Shl64: {
2315 /* We use the same ingenious scheme as gcc. Put the value
2316 to be shifted into %hi:%lo, and the shift amount into
2317 %cl. Then (dsts on right, a la ATT syntax):
2319 shldl %cl, %lo, %hi -- make %hi be right for the
2320 -- shift amt %cl % 32
2321 shll %cl, %lo -- make %lo be right for the
2322 -- shift amt %cl % 32
2324 Now, if (shift amount % 64) is in the range 32 .. 63,
2325 we have to do a fixup, which puts the result low half
2326 into the result high half, and zeroes the low half:
2328 testl $32, %ecx
2330 cmovnz %lo, %hi
2331 movl $0, %tmp -- sigh; need yet another reg
2332 cmovnz %tmp, %lo
2334 HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2335 tLo = newVRegI(env);
2336 tHi = newVRegI(env);
2337 tTemp = newVRegI(env);
2338 rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2339 iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2340 addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2341 addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2342 addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2343 /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
2344 and those regs are legitimately modifiable. */
2345 addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
2346 addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
2347 addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2348 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
2349 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2350 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
2351 *rHi = tHi;
2352 *rLo = tLo;
2353 return;
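/* (Editor's worked example, illustrative only: for a shift amount of
   40, %cl % 32 == 8, so after shldl/shll we have hi = (hi<<8)|(lo>>24)
   and lo = lo<<8, which is wrong for a 64-bit shift by 40.  The
   testl $32,%ecx then finds that bit set, and the two CMOVs patch the
   result up to hi = lo<<8, lo = 0, which is the correct value of
   (hi:lo << 40).) */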
2356 case Iop_Shr64: {
2357 /* We use the same ingenious scheme as gcc. Put the value
2358 to be shifted into %hi:%lo, and the shift amount into
2359 %cl. Then:
2361 shrdl %cl, %hi, %lo -- make %lo be right for the
2362 -- shift amt %cl % 32
2363 shrl %cl, %hi -- make %hi be right for the
2364 -- shift amt %cl % 32
2366 Now, if (shift amount % 64) is in the range 32 .. 63,
2367 we have to do a fixup, which puts the result high half
2368 into the result low half, and zeroes the high half:
2370 testl $32, %ecx
2372 cmovnz %hi, %lo
2373 movl $0, %tmp -- sigh; need yet another reg
2374 cmovnz %tmp, %hi
2376 HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
2377 tLo = newVRegI(env);
2378 tHi = newVRegI(env);
2379 tTemp = newVRegI(env);
2380 rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
2381 iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2382 addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
2383 addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2384 addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2385 /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
2386 and those regs are legitimately modifiable. */
2387 addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
2388 addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
2389 addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
2390 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
2391 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
2392 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
2393 *rHi = tHi;
2394 *rLo = tLo;
2395 return;
2398 case Iop_Sar64: {
2399 /* gcc -O2 does the following. I don't know how it works, but it
2400 does work. Don't mess with it. This is hard to test because the
2401 x86 front end doesn't create Iop_Sar64 for any x86 instruction,
2402 so it's impossible to write a test program that feeds values
2403 through Iop_Sar64 and prints their results. The implementation
2404 here was tested by using psrlq on mmx registers -- that generates
2405 Iop_Shr64 -- and temporarily hacking the front end to generate
2406 Iop_Sar64 for that instruction instead.
2408 movl %amount, %ecx
2409 movl %srcHi, %r1
2410 movl %srcLo, %r2
2412 movl %r1, %r3
2413 sarl %cl, %r3
2414 movl %r2, %r4
2415 shrdl %cl, %r1, %r4
2416 movl %r3, %r2
2417 sarl $31, %r2
2418 andl $32, %ecx
2419 cmovne %r3, %r4 // = resLo
2420 cmovne %r2, %r3 // = resHi
2422 HReg amount = iselIntExpr_R(env, e->Iex.Binop.arg2);
2423 HReg srcHi = INVALID_HREG, srcLo = INVALID_HREG;
2424 iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Binop.arg1);
2425 HReg r1 = newVRegI(env);
2426 HReg r2 = newVRegI(env);
2427 HReg r3 = newVRegI(env);
2428 HReg r4 = newVRegI(env);
2429 addInstr(env, mk_iMOVsd_RR(amount, hregX86_ECX()));
2430 addInstr(env, mk_iMOVsd_RR(srcHi, r1));
2431 addInstr(env, mk_iMOVsd_RR(srcLo, r2));
2433 addInstr(env, mk_iMOVsd_RR(r1, r3));
2434 addInstr(env, X86Instr_Sh32(Xsh_SAR, 0/*%cl*/, r3));
2435 addInstr(env, mk_iMOVsd_RR(r2, r4));
2436 addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, r1, r4));
2437 addInstr(env, mk_iMOVsd_RR(r3, r2));
2438 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, r2));
2439 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(32),
2440 hregX86_ECX()));
2441 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(r3), r4));
2442 addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(r2), r3));
2443 *rHi = r3;
2444 *rLo = r4;
2445 return;
2448 /* F64 -> I64 */
2449 /* Sigh, this is an almost exact copy of the F64 -> I32/I16
2450 case. Unfortunately I see no easy way to avoid the
2451 duplication. */
2452 case Iop_F64toI64S: {
2453 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
2454 HReg tLo = newVRegI(env);
2455 HReg tHi = newVRegI(env);
2457 /* Used several times ... */
2458 /* Careful ... this sharing is only safe because
2459 zero_esp/four_esp do not hold any registers which the
2460 register allocator could attempt to swizzle later. */
2461 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2462 X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2464 /* rf now holds the value to be converted; the rounding mode is
2465 supplied by e->Iex.Binop.arg1, encoded as per the
2466 IRRoundingMode enum.  The first thing to do is set the
2467 FPU's rounding mode accordingly. */
2469 /* Create a space for the format conversion. */
2470 /* subl $8, %esp */
2471 sub_from_esp(env, 8);
2473 /* Set host rounding mode */
2474 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2476 /* gistll %rf, 0(%esp) */
2477 addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
2479 /* movl 0(%esp), %dstLo */
2480 /* movl 4(%esp), %dstHi */
2481 addInstr(env, X86Instr_Alu32R(
2482 Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2483 addInstr(env, X86Instr_Alu32R(
2484 Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2486 /* Restore default FPU rounding. */
2487 set_FPU_rounding_default( env );
2489 /* addl $8, %esp */
2490 add_to_esp(env, 8);
2492 *rHi = tHi;
2493 *rLo = tLo;
2494 return;
2497 case Iop_Add8x8:
2498 fn = (HWord)h_generic_calc_Add8x8; goto binnish;
2499 case Iop_Add16x4:
2500 fn = (HWord)h_generic_calc_Add16x4; goto binnish;
2501 case Iop_Add32x2:
2502 fn = (HWord)h_generic_calc_Add32x2; goto binnish;
2504 case Iop_Avg8Ux8:
2505 fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
2506 case Iop_Avg16Ux4:
2507 fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
2509 case Iop_CmpEQ8x8:
2510 fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
2511 case Iop_CmpEQ16x4:
2512 fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
2513 case Iop_CmpEQ32x2:
2514 fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
2516 case Iop_CmpGT8Sx8:
2517 fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
2518 case Iop_CmpGT16Sx4:
2519 fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
2520 case Iop_CmpGT32Sx2:
2521 fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
2523 case Iop_InterleaveHI8x8:
2524 fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
2525 case Iop_InterleaveLO8x8:
2526 fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
2527 case Iop_InterleaveHI16x4:
2528 fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
2529 case Iop_InterleaveLO16x4:
2530 fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
2531 case Iop_InterleaveHI32x2:
2532 fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
2533 case Iop_InterleaveLO32x2:
2534 fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
2535 case Iop_CatOddLanes16x4:
2536 fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
2537 case Iop_CatEvenLanes16x4:
2538 fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
2539 case Iop_Perm8x8:
2540 fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
2542 case Iop_Max8Ux8:
2543 fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
2544 case Iop_Max16Sx4:
2545 fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
2546 case Iop_Min8Ux8:
2547 fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
2548 case Iop_Min16Sx4:
2549 fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
2551 case Iop_Mul16x4:
2552 fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
2553 case Iop_Mul32x2:
2554 fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
2555 case Iop_MulHi16Sx4:
2556 fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
2557 case Iop_MulHi16Ux4:
2558 fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
2560 case Iop_QAdd8Sx8:
2561 fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
2562 case Iop_QAdd16Sx4:
2563 fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
2564 case Iop_QAdd8Ux8:
2565 fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
2566 case Iop_QAdd16Ux4:
2567 fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
2569 case Iop_QNarrowBin32Sto16Sx4:
2570 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
2571 case Iop_QNarrowBin16Sto8Sx8:
2572 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
2573 case Iop_QNarrowBin16Sto8Ux8:
2574 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
2575 case Iop_NarrowBin16to8x8:
2576 fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
2577 case Iop_NarrowBin32to16x4:
2578 fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
2580 case Iop_QSub8Sx8:
2581 fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
2582 case Iop_QSub16Sx4:
2583 fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
2584 case Iop_QSub8Ux8:
2585 fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
2586 case Iop_QSub16Ux4:
2587 fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
2589 case Iop_Sub8x8:
2590 fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
2591 case Iop_Sub16x4:
2592 fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
2593 case Iop_Sub32x2:
2594 fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
2596 binnish: {
2597 /* Note: the following assumes all helpers are of
2598 signature
2599 ULong fn ( ULong, ULong ), and they are
2600 not marked as regparm functions.
2602 HReg xLo, xHi, yLo, yHi;
2603 HReg tLo = newVRegI(env);
2604 HReg tHi = newVRegI(env);
2605 iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
2606 addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
2607 addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
2608 iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2609 addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2610 addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2611 addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2612 0, mk_RetLoc_simple(RLPri_2Int) ));
2613 add_to_esp(env, 4*4);
2614 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2615 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2616 *rHi = tHi;
2617 *rLo = tLo;
2618 return;
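/* (Editor's note, illustrative only: with helpers of the assumed shape
      ULong fn ( ULong xx, ULong yy );
   a 32-bit cdecl call passes xx and yy as four stack words -- hence the
   four Pushes and the add_to_esp(env, 4*4) -- and the ULong result
   comes back in %edx:%eax, which is copied into tHi:tLo above.) */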
2621 case Iop_ShlN32x2:
2622 fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
2623 case Iop_ShlN16x4:
2624 fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
2625 case Iop_ShlN8x8:
2626 fn = (HWord)h_generic_calc_ShlN8x8; goto shifty;
2627 case Iop_ShrN32x2:
2628 fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
2629 case Iop_ShrN16x4:
2630 fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
2631 case Iop_SarN32x2:
2632 fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
2633 case Iop_SarN16x4:
2634 fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
2635 case Iop_SarN8x8:
2636 fn = (HWord)h_generic_calc_SarN8x8; goto shifty;
2637 shifty: {
2638 /* Note: the following assumes all helpers are of
2639 signature
2640 ULong fn ( ULong, UInt ), and they are
2641 not marked as regparm functions.
2643 HReg xLo, xHi;
2644 HReg tLo = newVRegI(env);
2645 HReg tHi = newVRegI(env);
2646 X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2647 addInstr(env, X86Instr_Push(y));
2648 iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
2649 addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2650 addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2651 addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2652 0, mk_RetLoc_simple(RLPri_2Int) ));
2653 add_to_esp(env, 3*4);
2654 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2655 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2656 *rHi = tHi;
2657 *rLo = tLo;
2658 return;
2661 default:
2662 break;
2664 } /* if (e->tag == Iex_Binop) */
2667 /* --------- UNARY ops --------- */
2668 if (e->tag == Iex_Unop) {
2669 switch (e->Iex.Unop.op) {
2671 /* 32Sto64(e) */
2672 case Iop_32Sto64: {
2673 HReg tLo = newVRegI(env);
2674 HReg tHi = newVRegI(env);
2675 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2676 addInstr(env, mk_iMOVsd_RR(src,tHi));
2677 addInstr(env, mk_iMOVsd_RR(src,tLo));
2678 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
2679 *rHi = tHi;
2680 *rLo = tLo;
2681 return;
2684 /* 32Uto64(e) */
2685 case Iop_32Uto64: {
2686 HReg tLo = newVRegI(env);
2687 HReg tHi = newVRegI(env);
2688 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2689 addInstr(env, mk_iMOVsd_RR(src,tLo));
2690 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2691 *rHi = tHi;
2692 *rLo = tLo;
2693 return;
2696 /* 16Uto64(e) */
2697 case Iop_16Uto64: {
2698 HReg tLo = newVRegI(env);
2699 HReg tHi = newVRegI(env);
2700 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2701 addInstr(env, mk_iMOVsd_RR(src,tLo));
2702 addInstr(env, X86Instr_Alu32R(Xalu_AND,
2703 X86RMI_Imm(0xFFFF), tLo));
2704 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2705 *rHi = tHi;
2706 *rLo = tLo;
2707 return;
2710 /* V128{HI}to64 */
2711 case Iop_V128HIto64:
2712 case Iop_V128to64: {
2713 Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
2714 HReg tLo = newVRegI(env);
2715 HReg tHi = newVRegI(env);
2716 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
2717 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
2718 X86AMode* espLO = X86AMode_IR(off, hregX86_ESP());
2719 X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
2720 sub_from_esp(env, 16);
2721 addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
2722 addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2723 X86RMI_Mem(espLO), tLo ));
2724 addInstr(env, X86Instr_Alu32R( Xalu_MOV,
2725 X86RMI_Mem(espHI), tHi ));
2726 add_to_esp(env, 16);
2727 *rHi = tHi;
2728 *rLo = tLo;
2729 return;
2732 /* could do better than this, but for now ... */
2733 case Iop_1Sto64: {
2734 HReg tLo = newVRegI(env);
2735 HReg tHi = newVRegI(env);
2736 X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
2737 addInstr(env, X86Instr_Set32(cond,tLo));
2738 addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
2739 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
2740 addInstr(env, mk_iMOVsd_RR(tLo, tHi));
2741 *rHi = tHi;
2742 *rLo = tLo;
2743 return;
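/* (Editor's note, illustrative only: Set32 leaves 0 or 1 in tLo; the
   SHL by 31 followed by SAR by 31 sign-extends that single bit, so tLo
   becomes 0x00000000 or 0xFFFFFFFF, and copying it into tHi yields the
   64-bit value 0 or -1 required for 1Sto64.) */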
2746 /* Not64(e) */
2747 case Iop_Not64: {
2748 HReg tLo = newVRegI(env);
2749 HReg tHi = newVRegI(env);
2750 HReg sHi, sLo;
2751 iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
2752 addInstr(env, mk_iMOVsd_RR(sHi, tHi));
2753 addInstr(env, mk_iMOVsd_RR(sLo, tLo));
2754 addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
2755 addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
2756 *rHi = tHi;
2757 *rLo = tLo;
2758 return;
2761 /* Left64(e) */
2762 case Iop_Left64: {
2763 HReg yLo, yHi;
2764 HReg tLo = newVRegI(env);
2765 HReg tHi = newVRegI(env);
2766 /* yHi:yLo = arg */
2767 iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
2768 /* tLo = 0 - yLo, and set carry */
2769 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
2770 addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
2771 /* tHi = 0 - yHi - carry */
2772 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
2773 addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
2774 /* So now we have tHi:tLo = -arg. To finish off, or 'arg'
2775 back in, so as to give the final result
2776 tHi:tLo = arg | -arg. */
2777 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
2778 addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
2779 *rHi = tHi;
2780 *rLo = tLo;
2781 return;
2784 /* --- patterns rooted at: CmpwNEZ64 --- */
2786 /* CmpwNEZ64(e) */
2787 case Iop_CmpwNEZ64: {
2789 DECLARE_PATTERN(p_CmpwNEZ64_Or64);
2790 DEFINE_PATTERN(p_CmpwNEZ64_Or64,
2791 unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
2792 if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
2793 /* CmpwNEZ64(Or64(x,y)) */
2794 HReg xHi,xLo,yHi,yLo;
2795 HReg xBoth = newVRegI(env);
2796 HReg merged = newVRegI(env);
2797 HReg tmp2 = newVRegI(env);
2799 iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
2800 addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
2801 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2802 X86RMI_Reg(xLo),xBoth));
2804 iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
2805 addInstr(env, mk_iMOVsd_RR(yHi,merged));
2806 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2807 X86RMI_Reg(yLo),merged));
2808 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2809 X86RMI_Reg(xBoth),merged));
2811 /* tmp2 = (merged | -merged) >>s 31 */
2812 addInstr(env, mk_iMOVsd_RR(merged,tmp2));
2813 addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2814 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2815 X86RMI_Reg(merged), tmp2));
2816 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2817 *rHi = tmp2;
2818 *rLo = tmp2;
2819 return;
2820 } else {
2821 /* CmpwNEZ64(e) */
2822 HReg srcLo, srcHi;
2823 HReg tmp1 = newVRegI(env);
2824 HReg tmp2 = newVRegI(env);
2825 /* srcHi:srcLo = arg */
2826 iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
2827 /* tmp1 = srcHi | srcLo */
2828 addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
2829 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2830 X86RMI_Reg(srcLo), tmp1));
2831 /* tmp2 = (tmp1 | -tmp1) >>s 31 */
2832 addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
2833 addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
2834 addInstr(env, X86Instr_Alu32R(Xalu_OR,
2835 X86RMI_Reg(tmp1), tmp2));
2836 addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
2837 *rHi = tmp2;
2838 *rLo = tmp2;
2839 return;
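/* (Editor's note, illustrative only: for any 32-bit w, (w | -w) has
   its top bit set iff w != 0, so ((w | -w) >>s 31) is 0xFFFFFFFF when
   w is nonzero and 0 otherwise -- exactly the all-zeros/all-ones value
   that CmpwNEZ64 must deliver in both result halves.) */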
2843 /* ReinterpF64asI64(e) */
2844 /* Given an IEEE754 double, produce an I64 with the same bit
2845 pattern. */
2846 case Iop_ReinterpF64asI64: {
2847 HReg rf = iselDblExpr(env, e->Iex.Unop.arg);
2848 HReg tLo = newVRegI(env);
2849 HReg tHi = newVRegI(env);
2850 X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
2851 X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
2852 /* paranoia */
2853 set_FPU_rounding_default(env);
2854 /* subl $8, %esp */
2855 sub_from_esp(env, 8);
2856 /* gstD %rf, 0(%esp) */
2857 addInstr(env,
2858 X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
2859 /* movl 0(%esp), %tLo */
2860 addInstr(env,
2861 X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
2862 /* movl 4(%esp), %tHi */
2863 addInstr(env,
2864 X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
2865 /* addl $8, %esp */
2866 add_to_esp(env, 8);
2867 *rHi = tHi;
2868 *rLo = tLo;
2869 return;
2872 case Iop_CmpNEZ32x2:
2873 fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
2874 case Iop_CmpNEZ16x4:
2875 fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
2876 case Iop_CmpNEZ8x8:
2877 fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
2878 unish: {
2879 /* Note: the following assumes all helpers are of
2880 signature
2881 ULong fn ( ULong ), and they are
2882 not marked as regparm functions.
2884 HReg xLo, xHi;
2885 HReg tLo = newVRegI(env);
2886 HReg tHi = newVRegI(env);
2887 iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
2888 addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
2889 addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
2890 addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
2891 0, mk_RetLoc_simple(RLPri_2Int) ));
2892 add_to_esp(env, 2*4);
2893 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2894 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2895 *rHi = tHi;
2896 *rLo = tLo;
2897 return;
2900 default:
2901 break;
2903 } /* if (e->tag == Iex_Unop) */
2906 /* --------- CCALL --------- */
2907 if (e->tag == Iex_CCall) {
2908 HReg tLo = newVRegI(env);
2909 HReg tHi = newVRegI(env);
2911 /* Marshal args, do the call, clear stack. */
2912 UInt addToSp = 0;
2913 RetLoc rloc = mk_RetLoc_INVALID();
2914 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2915 e->Iex.CCall.cee,
2916 e->Iex.CCall.retty, e->Iex.CCall.args );
2917 vassert(is_sane_RetLoc(rloc));
2918 vassert(rloc.pri == RLPri_2Int);
2919 vassert(addToSp == 0);
2920 /* */
2922 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
2923 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
2924 *rHi = tHi;
2925 *rLo = tLo;
2926 return;
2929 ppIRExpr(e);
2930 vpanic("iselInt64Expr");
2934 /*---------------------------------------------------------*/
2935 /*--- ISEL: Floating point expressions (32 bit) ---*/
2936 /*---------------------------------------------------------*/
2938 /* Nothing interesting here; really just wrappers for
2939 64-bit stuff. */
2941 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2943 HReg r = iselFltExpr_wrk( env, e );
2944 # if 0
2945 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2946 # endif
2947 vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
2948 vassert(hregIsVirtual(r));
2949 return r;
2952 /* DO NOT CALL THIS DIRECTLY */
2953 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2955 IRType ty = typeOfIRExpr(env->type_env,e);
2956 vassert(ty == Ity_F32);
2958 if (e->tag == Iex_RdTmp) {
2959 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2962 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2963 X86AMode* am;
2964 HReg res = newVRegF(env);
2965 vassert(e->Iex.Load.ty == Ity_F32);
2966 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2967 addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
2968 return res;
2971 if (e->tag == Iex_Binop
2972 && e->Iex.Binop.op == Iop_F64toF32) {
2973 /* Although the result is still held in a standard FPU register,
2974 we need to round it to reflect the loss of accuracy/range
2975 entailed in casting it to a 32-bit float. */
2976 HReg dst = newVRegF(env);
2977 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2978 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2979 addInstr(env, X86Instr_Fp64to32(src,dst));
2980 set_FPU_rounding_default( env );
2981 return dst;
2984 if (e->tag == Iex_Get) {
2985 X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
2986 hregX86_EBP() );
2987 HReg res = newVRegF(env);
2988 addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
2989 return res;
2992 if (e->tag == Iex_Unop
2993 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2994 /* Given an I32, produce an IEEE754 float with the same bit
2995 pattern. */
2996 HReg dst = newVRegF(env);
2997 X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
2998 /* paranoia */
2999 addInstr(env, X86Instr_Push(rmi));
3000 addInstr(env, X86Instr_FpLdSt(
3001 True/*load*/, 4, dst,
3002 X86AMode_IR(0, hregX86_ESP())));
3003 add_to_esp(env, 4);
3004 return dst;
3007 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
3008 HReg rf = iselFltExpr(env, e->Iex.Binop.arg2);
3009 HReg dst = newVRegF(env);
3011 /* rf now holds the value to be rounded. The first thing to do
3012 is set the FPU's rounding mode accordingly. */
3014 /* Set host rounding mode */
3015 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3017 /* grndint %rf, %dst */
3018 addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
3020 /* Restore default FPU rounding. */
3021 set_FPU_rounding_default( env );
3023 return dst;
3026 ppIRExpr(e);
3027 vpanic("iselFltExpr_wrk");
3031 /*---------------------------------------------------------*/
3032 /*--- ISEL: Floating point expressions (64 bit) ---*/
3033 /*---------------------------------------------------------*/
3035 /* Compute a 64-bit floating point value into a register, the identity
3036 of which is returned. As with iselIntExpr_R, the reg may be either
3037 real or virtual; in any case it must not be changed by subsequent
3038 code emitted by the caller. */
3040 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
3042    Type                  S (1 bit)   E (11 bits)   F (52 bits)
3043    ----                  ---------   -----------   -----------
3044    signalling NaN        u           2047 (max)    .0uuuuu---u
3045                                                    (with at least
3046                                                     one 1 bit)
3047    quiet NaN             u           2047 (max)    .1uuuuu---u
3049    negative infinity     1           2047 (max)    .000000---0
3051    positive infinity     0           2047 (max)    .000000---0
3053    negative zero         1           0             .000000---0
3055    positive zero         0           0             .000000---0
3058 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
3060 HReg r = iselDblExpr_wrk( env, e );
3061 # if 0
3062 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3063 # endif
3064 vassert(hregClass(r) == HRcFlt64);
3065 vassert(hregIsVirtual(r));
3066 return r;
3069 /* DO NOT CALL THIS DIRECTLY */
3070 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
3072 IRType ty = typeOfIRExpr(env->type_env,e);
3073 vassert(e);
3074 vassert(ty == Ity_F64);
3076 if (e->tag == Iex_RdTmp) {
3077 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3080 if (e->tag == Iex_Const) {
3081 union { UInt u32x2[2]; ULong u64; Double f64; } u;
3082 HReg freg = newVRegF(env);
3083 vassert(sizeof(u) == 8);
3084 vassert(sizeof(u.u64) == 8);
3085 vassert(sizeof(u.f64) == 8);
3086 vassert(sizeof(u.u32x2) == 8);
3088 if (e->Iex.Const.con->tag == Ico_F64) {
3089 u.f64 = e->Iex.Const.con->Ico.F64;
3091 else if (e->Iex.Const.con->tag == Ico_F64i) {
3092 u.u64 = e->Iex.Const.con->Ico.F64i;
3094 else
3095 vpanic("iselDblExpr(x86): const");
3097 addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
3098 addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
3099 addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
3100 X86AMode_IR(0, hregX86_ESP())));
3101 add_to_esp(env, 8);
3102 return freg;
3105 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3106 X86AMode* am;
3107 HReg res = newVRegF(env);
3108 vassert(e->Iex.Load.ty == Ity_F64);
3109 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3110 addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
3111 return res;
3114 if (e->tag == Iex_Get) {
3115 X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
3116 hregX86_EBP() );
3117 HReg res = newVRegF(env);
3118 addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3119 return res;
3122 if (e->tag == Iex_GetI) {
3123 X86AMode* am
3124 = genGuestArrayOffset(
3125 env, e->Iex.GetI.descr,
3126 e->Iex.GetI.ix, e->Iex.GetI.bias );
3127 HReg res = newVRegF(env);
3128 addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
3129 return res;
3132 if (e->tag == Iex_Triop) {
3133 X86FpOp fpop = Xfp_INVALID;
3134 IRTriop *triop = e->Iex.Triop.details;
3135 switch (triop->op) {
3136 case Iop_AddF64: fpop = Xfp_ADD; break;
3137 case Iop_SubF64: fpop = Xfp_SUB; break;
3138 case Iop_MulF64: fpop = Xfp_MUL; break;
3139 case Iop_DivF64: fpop = Xfp_DIV; break;
3140 case Iop_ScaleF64: fpop = Xfp_SCALE; break;
3141 case Iop_Yl2xF64: fpop = Xfp_YL2X; break;
3142 case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
3143 case Iop_AtanF64: fpop = Xfp_ATAN; break;
3144 case Iop_PRemF64: fpop = Xfp_PREM; break;
3145 case Iop_PRem1F64: fpop = Xfp_PREM1; break;
3146 default: break;
3148 if (fpop != Xfp_INVALID) {
3149 HReg res = newVRegF(env);
3150 HReg srcL = iselDblExpr(env, triop->arg2);
3151 HReg srcR = iselDblExpr(env, triop->arg3);
3152 /* XXXROUNDINGFIXME */
3153 /* set roundingmode here */
3154 addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
3155 if (fpop != Xfp_ADD && fpop != Xfp_SUB
3156 && fpop != Xfp_MUL && fpop != Xfp_DIV)
3157 roundToF64(env, res);
3158 return res;
3162 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3163 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
3164 HReg dst = newVRegF(env);
3166 /* rf now holds the value to be rounded. The first thing to do
3167 is set the FPU's rounding mode accordingly. */
3169 /* Set host rounding mode */
3170 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3172 /* grndint %rf, %dst */
3173 addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
3175 /* Restore default FPU rounding. */
3176 set_FPU_rounding_default( env );
3178 return dst;
3181 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3182 HReg dst = newVRegF(env);
3183 HReg rHi,rLo;
3184 iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
3185 addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3186 addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3188 /* Set host rounding mode */
3189 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3191 addInstr(env, X86Instr_FpLdStI(
3192 True/*load*/, 8, dst,
3193 X86AMode_IR(0, hregX86_ESP())));
3195 /* Restore default FPU rounding. */
3196 set_FPU_rounding_default( env );
3198 add_to_esp(env, 8);
3199 return dst;
3202 if (e->tag == Iex_Binop) {
3203 X86FpOp fpop = Xfp_INVALID;
3204 switch (e->Iex.Binop.op) {
3205 case Iop_SinF64: fpop = Xfp_SIN; break;
3206 case Iop_CosF64: fpop = Xfp_COS; break;
3207 case Iop_TanF64: fpop = Xfp_TAN; break;
3208 case Iop_2xm1F64: fpop = Xfp_2XM1; break;
3209 case Iop_SqrtF64: fpop = Xfp_SQRT; break;
3210 default: break;
3212 if (fpop != Xfp_INVALID) {
3213 HReg res = newVRegF(env);
3214 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
3215 /* XXXROUNDINGFIXME */
3216 /* set roundingmode here */
3217 /* Note that X86Instr_FpUnary(Xfp_TAN,..) sets the condition
3218 codes. I don't think that matters, since this insn
3219 selector never generates such an instruction intervening
3220 between a flag-setting instruction and a flag-using
3221 instruction. */
3222 addInstr(env, X86Instr_FpUnary(fpop,src,res));
3223 if (fpop != Xfp_SQRT
3224 && fpop != Xfp_NEG && fpop != Xfp_ABS)
3225 roundToF64(env, res);
3226 return res;
3230 if (e->tag == Iex_Unop) {
3231 X86FpOp fpop = Xfp_INVALID;
3232 switch (e->Iex.Unop.op) {
3233 case Iop_NegF64: fpop = Xfp_NEG; break;
3234 case Iop_AbsF64: fpop = Xfp_ABS; break;
3235 default: break;
3237 if (fpop != Xfp_INVALID) {
3238 HReg res = newVRegF(env);
3239 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3240 addInstr(env, X86Instr_FpUnary(fpop,src,res));
3241 /* No need to do roundToF64(env,res) for Xfp_NEG or Xfp_ABS,
3242 but might need to do that for other unary ops. */
3243 return res;
3247 if (e->tag == Iex_Unop) {
3248 switch (e->Iex.Unop.op) {
3249 case Iop_I32StoF64: {
3250 HReg dst = newVRegF(env);
3251 HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3252 addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3253 set_FPU_rounding_default(env);
3254 addInstr(env, X86Instr_FpLdStI(
3255 True/*load*/, 4, dst,
3256 X86AMode_IR(0, hregX86_ESP())));
3257 add_to_esp(env, 4);
3258 return dst;
3260 case Iop_ReinterpI64asF64: {
3261 /* Given an I64, produce an IEEE754 double with the same
3262 bit pattern. */
3263 HReg dst = newVRegF(env);
3264 HReg rHi, rLo;
3265 iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
3266 /* paranoia */
3267 set_FPU_rounding_default(env);
3268 addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3269 addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3270 addInstr(env, X86Instr_FpLdSt(
3271 True/*load*/, 8, dst,
3272 X86AMode_IR(0, hregX86_ESP())));
3273 add_to_esp(env, 8);
3274 return dst;
3276 case Iop_F32toF64: {
3277 /* this is a no-op */
3278 HReg res = iselFltExpr(env, e->Iex.Unop.arg);
3279 return res;
3281 default:
3282 break;
3286 /* --------- MULTIPLEX --------- */
3287 if (e->tag == Iex_ITE) { // VFD
3288 if (ty == Ity_F64
3289 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
3290 HReg r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3291 HReg r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3292 HReg dst = newVRegF(env);
3293 addInstr(env, X86Instr_FpUnary(Xfp_MOV,r1,dst));
3294 X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3295 addInstr(env, X86Instr_FpCMov(cc ^ 1, r0, dst));
3296 return dst;
3300 ppIRExpr(e);
3301 vpanic("iselDblExpr_wrk");
3305 /*---------------------------------------------------------*/
3306 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3307 /*---------------------------------------------------------*/
3309 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3311 HReg r = iselVecExpr_wrk( env, e );
3312 # if 0
3313 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3314 # endif
3315 vassert(hregClass(r) == HRcVec128);
3316 vassert(hregIsVirtual(r));
3317 return r;
3321 /* DO NOT CALL THIS DIRECTLY */
3322 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3325 # define REQUIRE_SSE1 \
3326 do { if (env->hwcaps == 0/*baseline, no sse*/ \
3327 || env->hwcaps == VEX_HWCAPS_X86_MMXEXT /*Integer SSE*/) \
3328 goto vec_fail; \
3329 } while (0)
3331 # define REQUIRE_SSE2 \
3332 do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2)) \
3333 goto vec_fail; \
3334 } while (0)
3336 # define SSE2_OR_ABOVE \
3337 (env->hwcaps & VEX_HWCAPS_X86_SSE2)
3339 HWord fn = 0; /* address of helper fn, if required */
3340 MatchInfo mi;
3341 Bool arg1isEReg = False;
3342 X86SseOp op = Xsse_INVALID;
3343 IRType ty = typeOfIRExpr(env->type_env,e);
3344 vassert(e);
3345 vassert(ty == Ity_V128);
3347 REQUIRE_SSE1;
3349 if (e->tag == Iex_RdTmp) {
3350 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3353 if (e->tag == Iex_Get) {
3354 HReg dst = newVRegV(env);
3355 addInstr(env, X86Instr_SseLdSt(
3356 True/*load*/,
3357 dst,
3358 X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
3361 return dst;
3364 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3365 HReg dst = newVRegV(env);
3366 X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3367 addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
3368 return dst;
3371 if (e->tag == Iex_Const) {
3372 HReg dst = newVRegV(env);
3373 vassert(e->Iex.Const.con->tag == Ico_V128);
3374 addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
3375 return dst;
3378 if (e->tag == Iex_Unop) {
3380 if (SSE2_OR_ABOVE) {
3381 /* 64UtoV128(LDle:I64(addr)) */
3382 DECLARE_PATTERN(p_zwiden_load64);
3383 DEFINE_PATTERN(p_zwiden_load64,
3384 unop(Iop_64UtoV128,
3385 IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
3386 if (matchIRExpr(&mi, p_zwiden_load64, e)) {
3387 X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
3388 HReg dst = newVRegV(env);
3389 addInstr(env, X86Instr_SseLdzLO(8, dst, am));
3390 return dst;
3394 switch (e->Iex.Unop.op) {
3396 case Iop_NotV128: {
3397 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3398 return do_sse_Not128(env, arg);
3401 case Iop_CmpNEZ64x2: {
3402 /* We can use SSE2 instructions for this. */
3403 /* Ideally, we want to do a 64Ix2 comparison against zero of
3404 the operand. Problem is no such insn exists. Solution
3405 therefore is to do a 32Ix4 comparison instead, and bitwise-
3406 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3407 let the not'd result of this initial comparison be a:b:c:d.
3408 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3409 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3410 giving the required result.
3412 The required selection sequence is 2,3,0,1, which
3413 according to Intel's documentation means the pshufd
3414 literal value is 0xB1, that is,
3415 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3417 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3418 HReg tmp = newVRegV(env);
3419 HReg dst = newVRegV(env);
3420 REQUIRE_SSE2;
3421 addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
3422 addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
3423 tmp = do_sse_Not128(env, tmp);
3424 addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
3425 addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
3426 return dst;
3429 case Iop_CmpNEZ32x4: {
3430 /* Sigh, we have to generate lousy code since this has to
3431 work on SSE1 hosts */
3432 /* basically, the idea is: for each lane:
3433 movl lane, %r ; negl %r (now CF = lane==0 ? 0 : 1)
3434 sbbl %r, %r (now %r = 1Sto32(CF))
3435 movl %r, lane
3437 Int i;
3438 X86AMode* am;
3439 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3440 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3441 HReg dst = newVRegV(env);
3442 HReg r32 = newVRegI(env);
3443 sub_from_esp(env, 16);
3444 addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
3445 for (i = 0; i < 4; i++) {
3446 am = X86AMode_IR(i*4, hregX86_ESP());
3447 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
3448 addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
3449 addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
3450 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
3452 addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3453 add_to_esp(env, 16);
3454 return dst;
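/* (Editor's worked example, illustrative only: for a lane value of 5,
   negl sets CF=1 and sbbl %r,%r computes r - r - CF = -1 = 0xFFFFFFFF;
   for a lane value of 0, CF=0 and the sbbl yields 0.  Each lane thus
   ends up as 1Sto32(lane != 0), which is the CmpNEZ32x4 result.) */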
3457 case Iop_CmpNEZ8x16:
3458 case Iop_CmpNEZ16x8: {
3459 /* We can use SSE2 instructions for this. */
3460 HReg arg;
3461 HReg vec0 = newVRegV(env);
3462 HReg vec1 = newVRegV(env);
3463 HReg dst = newVRegV(env);
3464 X86SseOp cmpOp
3465 = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
3466 : Xsse_CMPEQ8;
3467 REQUIRE_SSE2;
3468 addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
3469 addInstr(env, mk_vMOVsd_RR(vec0, vec1));
3470 addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
3471 /* defer arg computation to here so as to give CMPEQF as long
3472 as possible to complete */
3473 arg = iselVecExpr(env, e->Iex.Unop.arg);
3474 /* vec0 is all 0s; vec1 is all 1s */
3475 addInstr(env, mk_vMOVsd_RR(arg, dst));
3476 /* 16x8 or 8x16 comparison == */
3477 addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
3478 /* invert result */
3479 addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
3480 return dst;
3483 case Iop_RecipEst32Fx4: op = Xsse_RCPF; goto do_32Fx4_unary;
3484 case Iop_RSqrtEst32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
3485 do_32Fx4_unary:
3487 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3488 HReg dst = newVRegV(env);
3489 addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
3490 return dst;
3493 case Iop_RecipEst32F0x4: op = Xsse_RCPF; goto do_32F0x4_unary;
3494 case Iop_RSqrtEst32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
3495 case Iop_Sqrt32F0x4: op = Xsse_SQRTF; goto do_32F0x4_unary;
3496 do_32F0x4_unary:
3498 /* A bit subtle. We have to copy the arg to the result
3499 register first, because actually doing the SSE scalar insn
3500 leaves the upper 3/4 of the destination register
3501 unchanged. Whereas the required semantics of these
3502 primops is that the upper 3/4 is simply copied in from the
3503 argument. */
3504 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3505 HReg dst = newVRegV(env);
3506 addInstr(env, mk_vMOVsd_RR(arg, dst));
3507 addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
3508 return dst;
3511 case Iop_Sqrt64F0x2: op = Xsse_SQRTF; goto do_64F0x2_unary;
3512 do_64F0x2_unary:
3514 /* A bit subtle. We have to copy the arg to the result
3515 register first, because actually doing the SSE scalar insn
3516 leaves the upper half of the destination register
3517 unchanged. Whereas the required semantics of these
3518 primops is that the upper half is simply copied in from the
3519 argument. */
3520 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3521 HReg dst = newVRegV(env);
3522 REQUIRE_SSE2;
3523 addInstr(env, mk_vMOVsd_RR(arg, dst));
3524 addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
3525 return dst;
3528 case Iop_32UtoV128: {
3529 HReg dst = newVRegV(env);
3530 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3531 X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3532 addInstr(env, X86Instr_Push(rmi));
3533 addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
3534 add_to_esp(env, 4);
3535 return dst;
3538 case Iop_64UtoV128: {
3539 HReg rHi, rLo;
3540 HReg dst = newVRegV(env);
3541 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3542 iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
3543 addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
3544 addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
3545 addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
3546 add_to_esp(env, 8);
3547 return dst;
3550 default:
3551 break;
3552 } /* switch (e->Iex.Unop.op) */
3553 } /* if (e->tag == Iex_Unop) */
3555 if (e->tag == Iex_Binop) {
3556 switch (e->Iex.Binop.op) {
3558 case Iop_Sqrt64Fx2:
3559 REQUIRE_SSE2;
3560 /* fallthrough */
3561 case Iop_Sqrt32Fx4: {
3562 /* :: (rmode, vec) -> vec */
3563 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3564 HReg dst = newVRegV(env);
3565 /* XXXROUNDINGFIXME */
3566 /* set roundingmode here */
3567 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3568 ? X86Instr_Sse64Fx2 : X86Instr_Sse32Fx4)
3569 (Xsse_SQRTF, arg, dst));
3570 return dst;
3573 case Iop_SetV128lo32: {
3574 HReg dst = newVRegV(env);
3575 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3576 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3577 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3578 sub_from_esp(env, 16);
3579 addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3580 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
3581 addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3582 add_to_esp(env, 16);
3583 return dst;
3586 case Iop_SetV128lo64: {
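/* As above, but overwrite the low 64 bits using the two halves of a
   64-bit integer expression. */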
3587 HReg dst = newVRegV(env);
3588 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3589 HReg srcIhi, srcIlo;
3590 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3591 X86AMode* esp4 = advance4(esp0);
3592 iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
3593 sub_from_esp(env, 16);
3594 addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
3595 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
3596 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
3597 addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3598 add_to_esp(env, 16);
3599 return dst;
3602 case Iop_64HLtoV128: {
3603 HReg r3, r2, r1, r0;
3604 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3605 X86AMode* esp4 = advance4(esp0);
3606 X86AMode* esp8 = advance4(esp4);
3607 X86AMode* esp12 = advance4(esp8);
3608 HReg dst = newVRegV(env);
3609 /* do this via the stack (easy, convenient, etc) */
3610 sub_from_esp(env, 16);
3611 /* Do the less significant 64 bits */
3612 iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
3613 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
3614 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
3615 /* Do the more significant 64 bits */
3616 iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
3617 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
3618 addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
3619 /* Fetch result back from stack. */
3620 addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
3621 add_to_esp(env, 16);
3622 return dst;
3625 case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
3626 case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
3627 case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
3628 case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
3629 case Iop_Max32Fx4: op = Xsse_MAXF; goto do_32Fx4;
3630 case Iop_Min32Fx4: op = Xsse_MINF; goto do_32Fx4;
3631 do_32Fx4:
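/* Two-operand SSE idiom: the instruction overwrites its destination,
   so copy argL into dst first and then apply the op with argR as the
   source.  The 64Fx2 and the F0x4/F0x2 cases below use the same
   pattern. */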
3633 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3634 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3635 HReg dst = newVRegV(env);
3636 addInstr(env, mk_vMOVsd_RR(argL, dst));
3637 addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3638 return dst;
3641 case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
3642 case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
3643 case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
3644 case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
3645 case Iop_Max64Fx2: op = Xsse_MAXF; goto do_64Fx2;
3646 case Iop_Min64Fx2: op = Xsse_MINF; goto do_64Fx2;
3647 do_64Fx2:
3649 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3650 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3651 HReg dst = newVRegV(env);
3652 REQUIRE_SSE2;
3653 addInstr(env, mk_vMOVsd_RR(argL, dst));
3654 addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3655 return dst;
3658 case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
3659 case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
3660 case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
3661 case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
3662 case Iop_Add32F0x4: op = Xsse_ADDF; goto do_32F0x4;
3663 case Iop_Div32F0x4: op = Xsse_DIVF; goto do_32F0x4;
3664 case Iop_Max32F0x4: op = Xsse_MAXF; goto do_32F0x4;
3665 case Iop_Min32F0x4: op = Xsse_MINF; goto do_32F0x4;
3666 case Iop_Mul32F0x4: op = Xsse_MULF; goto do_32F0x4;
3667 case Iop_Sub32F0x4: op = Xsse_SUBF; goto do_32F0x4;
3668 do_32F0x4: {
3669 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3670 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3671 HReg dst = newVRegV(env);
3672 addInstr(env, mk_vMOVsd_RR(argL, dst));
3673 addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
3674 return dst;
3677 case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
3678 case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
3679 case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
3680 case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
3681 case Iop_Add64F0x2: op = Xsse_ADDF; goto do_64F0x2;
3682 case Iop_Div64F0x2: op = Xsse_DIVF; goto do_64F0x2;
3683 case Iop_Max64F0x2: op = Xsse_MAXF; goto do_64F0x2;
3684 case Iop_Min64F0x2: op = Xsse_MINF; goto do_64F0x2;
3685 case Iop_Mul64F0x2: op = Xsse_MULF; goto do_64F0x2;
3686 case Iop_Sub64F0x2: op = Xsse_SUBF; goto do_64F0x2;
3687 do_64F0x2: {
3688 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3689 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3690 HReg dst = newVRegV(env);
3691 REQUIRE_SSE2;
3692 addInstr(env, mk_vMOVsd_RR(argL, dst));
3693 addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
3694 return dst;
3697 case Iop_QNarrowBin32Sto16Sx8:
3698 op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3699 case Iop_QNarrowBin16Sto8Sx16:
3700 op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3701 case Iop_QNarrowBin16Sto8Ux16:
3702 op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3704 case Iop_InterleaveHI8x16:
3705 op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3706 case Iop_InterleaveHI16x8:
3707 op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3708 case Iop_InterleaveHI32x4:
3709 op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3710 case Iop_InterleaveHI64x2:
3711 op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3713 case Iop_InterleaveLO8x16:
3714 op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3715 case Iop_InterleaveLO16x8:
3716 op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3717 case Iop_InterleaveLO32x4:
3718 op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3719 case Iop_InterleaveLO64x2:
3720 op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3722 case Iop_AndV128: op = Xsse_AND; goto do_SseReRg;
3723 case Iop_OrV128: op = Xsse_OR; goto do_SseReRg;
3724 case Iop_XorV128: op = Xsse_XOR; goto do_SseReRg;
3725 case Iop_Add8x16: op = Xsse_ADD8; goto do_SseReRg;
3726 case Iop_Add16x8: op = Xsse_ADD16; goto do_SseReRg;
3727 case Iop_Add32x4: op = Xsse_ADD32; goto do_SseReRg;
3728 case Iop_Add64x2: op = Xsse_ADD64; goto do_SseReRg;
3729 case Iop_QAdd8Sx16: op = Xsse_QADD8S; goto do_SseReRg;
3730 case Iop_QAdd16Sx8: op = Xsse_QADD16S; goto do_SseReRg;
3731 case Iop_QAdd8Ux16: op = Xsse_QADD8U; goto do_SseReRg;
3732 case Iop_QAdd16Ux8: op = Xsse_QADD16U; goto do_SseReRg;
3733 case Iop_Avg8Ux16: op = Xsse_AVG8U; goto do_SseReRg;
3734 case Iop_Avg16Ux8: op = Xsse_AVG16U; goto do_SseReRg;
3735 case Iop_CmpEQ8x16: op = Xsse_CMPEQ8; goto do_SseReRg;
3736 case Iop_CmpEQ16x8: op = Xsse_CMPEQ16; goto do_SseReRg;
3737 case Iop_CmpEQ32x4: op = Xsse_CMPEQ32; goto do_SseReRg;
3738 case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S; goto do_SseReRg;
3739 case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
3740 case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
3741 case Iop_Max16Sx8: op = Xsse_MAX16S; goto do_SseReRg;
3742 case Iop_Max8Ux16: op = Xsse_MAX8U; goto do_SseReRg;
3743 case Iop_Min16Sx8: op = Xsse_MIN16S; goto do_SseReRg;
3744 case Iop_Min8Ux16: op = Xsse_MIN8U; goto do_SseReRg;
3745 case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
3746 case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
3747 case Iop_Mul16x8: op = Xsse_MUL16; goto do_SseReRg;
3748 case Iop_Sub8x16: op = Xsse_SUB8; goto do_SseReRg;
3749 case Iop_Sub16x8: op = Xsse_SUB16; goto do_SseReRg;
3750 case Iop_Sub32x4: op = Xsse_SUB32; goto do_SseReRg;
3751 case Iop_Sub64x2: op = Xsse_SUB64; goto do_SseReRg;
3752 case Iop_QSub8Sx16: op = Xsse_QSUB8S; goto do_SseReRg;
3753 case Iop_QSub16Sx8: op = Xsse_QSUB16S; goto do_SseReRg;
3754 case Iop_QSub8Ux16: op = Xsse_QSUB8U; goto do_SseReRg;
3755 case Iop_QSub16Ux8: op = Xsse_QSUB16U; goto do_SseReRg;
3756 do_SseReRg: {
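/* arg1isEReg is set for the non-commutative PACK/UNPCK ops, where the
   IR's arg1 must end up as the E (source) operand of the x86
   instruction; in that case dst is initialised from arg2 instead. */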
3757 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3758 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3759 HReg dst = newVRegV(env);
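/* The plain bitwise ops are already available in SSE1 (presumably
   emitted as andps/orps/xorps); everything else in this group is an
   integer op that needs SSE2. */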
3760 if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
3761 REQUIRE_SSE2;
3762 if (arg1isEReg) {
3763 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3764 addInstr(env, X86Instr_SseReRg(op, arg1, dst));
3765 } else {
3766 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3767 addInstr(env, X86Instr_SseReRg(op, arg2, dst));
3768 }
3769 return dst;
3772 case Iop_ShlN8x16: {
3773 /* This instruction doesn't exist so we need to fake it using
3774 Xsse_SHL16 and Xsse_SHR16.
3776 We'd like to shift every byte in the 16-byte register to the left by
3777 some amount.
3779 Instead, we will make a copy and shift all the 16-bit words to the
3780 *right* by 8 and then to the left by 8 plus the shift amount. That
3781 will get us the correct answer for the upper 8 bits of each 16-bit
3782 word and zero elsewhere.
3784 Then we will shift all the 16-bit words in the original to the left
3785 by 8 plus the shift amount and then to the right by 8. This will
3786 get the correct answer for the lower 8 bits of each 16-bit word and
3787 zero elsewhere.
3789 Finally, we will OR those two results together.
3791 Because the SSE shifts used here take their shift amount from a
3792 register rather than an immediate, we keep the constant 8 in a
3793 register and shift by that as needed. */
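/* A small worked example: for one 16-bit lane holding 0xABCD with a
   shift amount of 3, the "hi" path gives 0x5800 and the "lo" path
   gives 0x0068; ORing them yields 0x5868, which is each byte of
   0xABCD shifted left by 3 (0xAB<<3 = 0x58 and 0xCD<<3 = 0x68, both
   taken modulo 256). */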
3794 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3795 X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3796 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3797 HReg ereg = newVRegV(env);
3798 HReg eight = newVRegV(env); // To store the constant value 8.
3799 HReg dst = newVRegV(env);
3800 HReg hi = newVRegV(env);
3801 REQUIRE_SSE2;
3802 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3803 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3804 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3805 addInstr(env, X86Instr_Push(rmi));
3806 addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
3807 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3808 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3809 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3810 addInstr(env, X86Instr_Push(X86RMI_Imm(8)));
3811 addInstr(env, X86Instr_SseLdSt(True/*load*/, eight, esp0));
3813 op = Xsse_SHL16;
3814 X86SseOp reverse_op = Xsse_SHR16;
3815 addInstr(env, mk_vMOVsd_RR(greg, hi));
3816 addInstr(env, X86Instr_SseReRg(reverse_op, eight, hi));
3817 addInstr(env, X86Instr_SseReRg(op, eight, hi));
3818 addInstr(env, X86Instr_SseReRg(op, ereg, hi));
3819 addInstr(env, mk_vMOVsd_RR(greg, dst));
3820 addInstr(env, X86Instr_SseReRg(op, eight, dst));
3821 addInstr(env, X86Instr_SseReRg(op, ereg, dst));
3822 addInstr(env, X86Instr_SseReRg(reverse_op, eight, dst));
3823 addInstr(env, X86Instr_SseReRg(Xsse_OR, hi, dst));
3825 add_to_esp(env, 32);
3826 return dst;
3828 case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
3829 case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
3830 case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
3831 case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
3832 case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
3833 case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
3834 case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
3835 case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
3836 do_SseShift: {
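/* The SSE shift-by-amount instructions take the count from the low
   64 bits of an xmm register, so build a 16-byte stack slot holding
   the 32-bit amount zero-extended, load it into ereg, and then shift
   a copy of the vector operand by it. */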
3837 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3838 X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3839 X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
3840 HReg ereg = newVRegV(env);
3841 HReg dst = newVRegV(env);
3842 REQUIRE_SSE2;
3843 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3844 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3845 addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
3846 addInstr(env, X86Instr_Push(rmi));
3847 addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
3848 addInstr(env, mk_vMOVsd_RR(greg, dst));
3849 addInstr(env, X86Instr_SseReRg(op, ereg, dst));
3850 add_to_esp(env, 16);
3851 return dst;
3854 case Iop_NarrowBin32to16x8:
3855 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3856 goto do_SseAssistedBinary;
3857 case Iop_NarrowBin16to8x16:
3858 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3859 goto do_SseAssistedBinary;
3860 do_SseAssistedBinary: {
3861 /* As with the amd64 case (where this is copied from) we
3862 generate pretty bad code. */
3863 vassert(fn != 0);
3864 HReg dst = newVRegV(env);
3865 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3866 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3867 HReg argp = newVRegI(env);
3868 /* subl $112, %esp -- make a space */
3869 sub_from_esp(env, 112);
3870 /* leal 48(%esp), %r_argp -- point into it */
3871 addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
3872 argp));
3873 /* andl $-16, %r_argp -- 16-align the pointer */
3874 addInstr(env, X86Instr_Alu32R(Xalu_AND,
3875 X86RMI_Imm( ~(UInt)15 ),
3876 argp));
3877 /* Prepare 3 arg regs:
3878 leal 0(%r_argp), %eax
3879 leal 16(%r_argp), %edx
3880 leal 32(%r_argp), %ecx
3881 */
3882 addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
3883 hregX86_EAX()));
3884 addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
3885 hregX86_EDX()));
3886 addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
3887 hregX86_ECX()));
3888 /* Store the two args, at (%edx) and (%ecx):
3889 movupd %argL, 0(%edx)
3890 movupd %argR, 0(%ecx)
3891 */
3892 addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
3893 X86AMode_IR(0, hregX86_EDX())));
3894 addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
3895 X86AMode_IR(0, hregX86_ECX())));
3896 /* call the helper */
3897 addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn,
3898 3, mk_RetLoc_simple(RLPri_None) ));
3899 /* fetch the result from memory, using %r_argp, which the
3900 register allocator will keep alive across the call. */
3901 addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
3902 X86AMode_IR(0, argp)));
3903 /* and finally, clear the space */
3904 add_to_esp(env, 112);
3905 return dst;
3908 default:
3909 break;
3910 } /* switch (e->Iex.Binop.op) */
3911 } /* if (e->tag == Iex_Binop) */
3914 if (e->tag == Iex_Triop) {
3915 IRTriop *triop = e->Iex.Triop.details;
3916 switch (triop->op) {
3918 case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4_w_rm;
3919 case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4_w_rm;
3920 case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4_w_rm;
3921 case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4_w_rm;
3922 do_32Fx4_w_rm:
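/* triop->arg1 is the IR rounding mode, which is currently ignored
   (see XXXROUNDINGFIXME below); arg2 and arg3 are the operands. */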
3924 HReg argL = iselVecExpr(env, triop->arg2);
3925 HReg argR = iselVecExpr(env, triop->arg3);
3926 HReg dst = newVRegV(env);
3927 addInstr(env, mk_vMOVsd_RR(argL, dst));
3928 /* XXXROUNDINGFIXME */
3929 /* set roundingmode here */
3930 addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
3931 return dst;
3934 case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2_w_rm;
3935 case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2_w_rm;
3936 case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2_w_rm;
3937 case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2_w_rm;
3938 do_64Fx2_w_rm:
3940 HReg argL = iselVecExpr(env, triop->arg2);
3941 HReg argR = iselVecExpr(env, triop->arg3);
3942 HReg dst = newVRegV(env);
3943 REQUIRE_SSE2;
3944 addInstr(env, mk_vMOVsd_RR(argL, dst));
3945 /* XXXROUNDINGFIXME */
3946 /* set roundingmode here */
3947 addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
3948 return dst;
3951 default:
3952 break;
3953 } /* switch (triop->op) */
3954 } /* if (e->tag == Iex_Triop) */
3957 if (e->tag == Iex_ITE) { // VFD
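/* Vector ITE: start with the iftrue value in dst, then conditionally
   overwrite it with the iffalse value.  The condition code is XORed
   with 1 to invert it, so the SseCMov fires exactly when the guard
   is false. */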
3958 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3959 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3960 HReg dst = newVRegV(env);
3961 addInstr(env, mk_vMOVsd_RR(r1,dst));
3962 X86CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3963 addInstr(env, X86Instr_SseCMov(cc ^ 1, r0, dst));
3964 return dst;
3967 vec_fail:
3968 vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
3969 LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
3970 ppIRExpr(e);
3971 vpanic("iselVecExpr_wrk");
3973 # undef REQUIRE_SSE1
3974 # undef REQUIRE_SSE2
3975 # undef SSE2_OR_ABOVE
3979 /*---------------------------------------------------------*/
3980 /*--- ISEL: Statements ---*/
3981 /*---------------------------------------------------------*/
3983 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3985 if (vex_traceflags & VEX_TRACE_VCODE) {
3986 vex_printf("\n-- ");
3987 ppIRStmt(stmt);
3988 vex_printf("\n");
3991 switch (stmt->tag) {
3993 /* --------- STORE --------- */
3994 case Ist_Store: {
3995 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3996 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3997 IREndness end = stmt->Ist.Store.end;
3999 if (tya != Ity_I32 || end != Iend_LE)
4000 goto stmt_fail;
4002 if (tyd == Ity_I32) {
4003 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4004 X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4005 addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
4006 return;
4008 if (tyd == Ity_I8 || tyd == Ity_I16) {
4009 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4010 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4011 addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
4012 r,am ));
4013 return;
4015 if (tyd == Ity_F64) {
4016 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4017 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4018 addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
4019 return;
4021 if (tyd == Ity_F32) {
4022 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4023 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4024 addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
4025 return;
4027 if (tyd == Ity_I64) {
4028 HReg vHi, vLo, rA;
4029 iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
4030 rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4031 addInstr(env, X86Instr_Alu32M(
4032 Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
4033 addInstr(env, X86Instr_Alu32M(
4034 Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
4035 return;
4037 if (tyd == Ity_V128) {
4038 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4039 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4040 addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
4041 return;
4043 break;
4046 /* --------- PUT --------- */
4047 case Ist_Put: {
4048 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4049 if (ty == Ity_I32) {
4050 /* We're going to write to memory, so compute the RHS into an
4051 X86RI. */
4052 X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4053 addInstr(env,
4054 X86Instr_Alu32M(
4055 Xalu_MOV,
4056 ri,
4057 X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
4058 ));
4059 return;
4061 if (ty == Ity_I8 || ty == Ity_I16) {
4062 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4063 addInstr(env, X86Instr_Store(
4064 toUChar(ty==Ity_I8 ? 1 : 2),
4065 r,
4066 X86AMode_IR(stmt->Ist.Put.offset,
4067 hregX86_EBP())));
4068 return;
4070 if (ty == Ity_I64) {
4071 HReg vHi, vLo;
4072 X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
4073 X86AMode* am4 = advance4(am);
4074 iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
4075 addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
4076 addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
4077 return;
4079 if (ty == Ity_V128) {
4080 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4081 X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
4082 addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
4083 return;
4085 if (ty == Ity_F32) {
4086 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4087 X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
4088 set_FPU_rounding_default(env); /* paranoia */
4089 addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
4090 return;
4092 if (ty == Ity_F64) {
4093 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4094 X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
4095 set_FPU_rounding_default(env); /* paranoia */
4096 addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
4097 return;
4099 break;
4102 /* --------- Indexed PUT --------- */
4103 case Ist_PutI: {
4104 IRPutI *puti = stmt->Ist.PutI.details;
4106 X86AMode* am
4107 = genGuestArrayOffset(
4108 env, puti->descr,
4109 puti->ix, puti->bias );
4111 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4112 if (ty == Ity_F64) {
4113 HReg val = iselDblExpr(env, puti->data);
4114 addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
4115 return;
4117 if (ty == Ity_I8) {
4118 HReg r = iselIntExpr_R(env, puti->data);
4119 addInstr(env, X86Instr_Store( 1, r, am ));
4120 return;
4122 if (ty == Ity_I32) {
4123 HReg r = iselIntExpr_R(env, puti->data);
4124 addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
4125 return;
4127 if (ty == Ity_I64) {
4128 HReg rHi, rLo;
4129 X86AMode* am4 = advance4(am);
4130 iselInt64Expr(&rHi, &rLo, env, puti->data);
4131 addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
4132 addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
4133 return;
4135 break;
4138 /* --------- TMP --------- */
4139 case Ist_WrTmp: {
4140 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4141 IRType ty = typeOfIRTemp(env->type_env, tmp);
4143 /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
4144 compute it into an AMode and then use LEA. This usually
4145 produces fewer instructions, often because (for memcheck
4146 created IR) we get t = address-expression, (t is later used
4147 twice) and so doing this naturally turns address-expression
4148 back into an X86 amode. */
4149 if (ty == Ity_I32
4150 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4151 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
4152 X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4153 HReg dst = lookupIRTemp(env, tmp);
4154 if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
4155 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4156 value into a register. Just emit a normal reg-reg move
4157 so reg-alloc can coalesce it away in the usual way. */
4158 HReg src = am->Xam.IR.reg;
4159 addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
4160 } else {
4161 addInstr(env, X86Instr_Lea32(am,dst));
4162 }
4163 return;
4166 if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
4167 X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4168 HReg dst = lookupIRTemp(env, tmp);
4169 addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
4170 return;
4172 if (ty == Ity_I64) {
4173 HReg rHi, rLo, dstHi, dstLo;
4174 iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4175 lookupIRTemp64( &dstHi, &dstLo, env, tmp);
4176 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4177 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4178 return;
4180 if (ty == Ity_I1) {
4181 X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4182 HReg dst = lookupIRTemp(env, tmp);
4183 addInstr(env, X86Instr_Set32(cond, dst));
4184 return;
4186 if (ty == Ity_F64) {
4187 HReg dst = lookupIRTemp(env, tmp);
4188 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4189 addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4190 return;
4192 if (ty == Ity_F32) {
4193 HReg dst = lookupIRTemp(env, tmp);
4194 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4195 addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
4196 return;
4198 if (ty == Ity_V128) {
4199 HReg dst = lookupIRTemp(env, tmp);
4200 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4201 addInstr(env, mk_vMOVsd_RR(src,dst));
4202 return;
4204 break;
4207 /* --------- Call to DIRTY helper --------- */
4208 case Ist_Dirty: {
4209 IRDirty* d = stmt->Ist.Dirty.details;
4211 /* Figure out the return type, if any. */
4212 IRType retty = Ity_INVALID;
4213 if (d->tmp != IRTemp_INVALID)
4214 retty = typeOfIRTemp(env->type_env, d->tmp);
4216 Bool retty_ok = False;
4217 switch (retty) {
4218 case Ity_INVALID: /* function doesn't return anything */
4219 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4220 case Ity_V128:
4221 retty_ok = True; break;
4222 default:
4223 break;
4225 if (!retty_ok)
4226 break; /* will go to stmt_fail: */
4228 /* Marshal args, do the call, and set the return value to
4229 0x555..555 if this is a conditional call that returns a value
4230 and the call is skipped. */
4231 UInt addToSp = 0;
4232 RetLoc rloc = mk_RetLoc_INVALID();
4233 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4234 vassert(is_sane_RetLoc(rloc));
4236 /* Now figure out what to do with the returned value, if any. */
4237 switch (retty) {
4238 case Ity_INVALID: {
4239 /* No return value. Nothing to do. */
4240 vassert(d->tmp == IRTemp_INVALID);
4241 vassert(rloc.pri == RLPri_None);
4242 vassert(addToSp == 0);
4243 return;
4245 case Ity_I32: case Ity_I16: case Ity_I8: {
4246 /* The returned value is in %eax. Park it in the register
4247 associated with tmp. */
4248 vassert(rloc.pri == RLPri_Int);
4249 vassert(addToSp == 0);
4250 HReg dst = lookupIRTemp(env, d->tmp);
4251 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
4252 return;
4254 case Ity_I64: {
4255 /* The returned value is in %edx:%eax. Park it in the
4256 register-pair associated with tmp. */
4257 vassert(rloc.pri == RLPri_2Int);
4258 vassert(addToSp == 0);
4259 HReg dstHi, dstLo;
4260 lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
4261 addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
4262 addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
4263 return;
4265 case Ity_V128: {
4266 /* The returned value is on the stack, and *retloc tells
4267 us where. Fish it off the stack and then move the
4268 stack pointer upwards to clear it, as directed by
4269 doHelperCall. */
4270 vassert(rloc.pri == RLPri_V128SpRel);
4271 vassert(addToSp >= 16);
4272 HReg dst = lookupIRTemp(env, d->tmp);
4273 X86AMode* am = X86AMode_IR(rloc.spOff, hregX86_ESP());
4274 addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
4275 add_to_esp(env, addToSp);
4276 return;
4278 default:
4279 /*NOTREACHED*/
4280 vassert(0);
4282 break;
4285 /* --------- MEM FENCE --------- */
4286 case Ist_MBE:
4287 switch (stmt->Ist.MBE.event) {
4288 case Imbe_Fence:
4289 addInstr(env, X86Instr_MFence(env->hwcaps));
4290 return;
4291 default:
4292 break;
4294 break;
4296 /* --------- ACAS --------- */
4297 case Ist_CAS:
4298 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4299 /* "normal" singleton CAS */
4300 UChar sz;
4301 IRCAS* cas = stmt->Ist.CAS.details;
4302 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4303 /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4304 X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4305 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4306 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4307 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4308 vassert(cas->expdHi == NULL);
4309 vassert(cas->dataHi == NULL);
4310 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4311 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4312 addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4313 switch (ty) {
4314 case Ity_I32: sz = 4; break;
4315 case Ity_I16: sz = 2; break;
4316 case Ity_I8: sz = 1; break;
4317 default: goto unhandled_cas;
4319 addInstr(env, X86Instr_ACAS(am, sz));
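/* If the CAS failed, ZF is clear and %eax holds the value actually
   seen in memory; copy it into rOldLo.  If it succeeded, rOldLo
   already holds the expected (== old) value from the move above. */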
4320 addInstr(env,
4321 X86Instr_CMov32(Xcc_NZ,
4322 X86RM_Reg(hregX86_EAX()), rOldLo));
4323 return;
4324 } else {
4325 /* double CAS */
4326 IRCAS* cas = stmt->Ist.CAS.details;
4327 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4328 /* only 32-bit allowed in this case */
4329 /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
4330 /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
4331 X86AMode* am = iselIntExpr_AMode(env, cas->addr);
4332 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4333 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4334 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4335 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4336 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4337 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4338 if (ty != Ity_I32)
4339 goto unhandled_cas;
4340 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4341 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4342 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
4343 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
4344 addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
4345 addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
4346 addInstr(env, X86Instr_DACAS(am));
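/* Same fix-up as the single-word case, but for both halves: on
   failure %edx:%eax hold the value found in memory. */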
4347 addInstr(env,
4348 X86Instr_CMov32(Xcc_NZ,
4349 X86RM_Reg(hregX86_EDX()), rOldHi));
4350 addInstr(env,
4351 X86Instr_CMov32(Xcc_NZ,
4352 X86RM_Reg(hregX86_EAX()), rOldLo));
4353 return;
4355 unhandled_cas:
4356 break;
4358 /* --------- INSTR MARK --------- */
4359 /* Doesn't generate any executable code ... */
4360 case Ist_IMark:
4361 return;
4363 /* --------- NO-OP --------- */
4364 /* Fairly self-explanatory, wouldn't you say? */
4365 case Ist_NoOp:
4366 return;
4368 /* --------- EXIT --------- */
4369 case Ist_Exit: {
4370 if (stmt->Ist.Exit.dst->tag != Ico_U32)
4371 vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
4373 X86CondCode cc = iselCondCode(env, stmt->Ist.Exit.guard);
4374 X86AMode* amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
4375 hregX86_EBP());
4377 /* Case: boring transfer to known address */
4378 if (stmt->Ist.Exit.jk == Ijk_Boring) {
4379 if (env->chainingAllowed) {
4380 /* .. almost always true .. */
4381 /* Skip the event check at the dst if this is a forwards
4382 edge. */
4383 Bool toFastEP
4384 = ((Addr32)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
4385 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4386 addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
4387 amEIP, cc, toFastEP));
4388 } else {
4389 /* .. very occasionally .. */
4390 /* We can't use chaining, so ask for an assisted transfer,
4391 as that's the only alternative that is allowable. */
4392 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4393 addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
4394 }
4395 return;
4398 /* Case: assisted transfer to arbitrary address */
4399 switch (stmt->Ist.Exit.jk) {
4400 /* Keep this list in sync with that in iselNext below */
4401 case Ijk_ClientReq:
4402 case Ijk_EmWarn:
4403 case Ijk_MapFail:
4404 case Ijk_NoDecode:
4405 case Ijk_NoRedir:
4406 case Ijk_SigSEGV:
4407 case Ijk_SigTRAP:
4408 case Ijk_Sys_int128:
4409 case Ijk_Sys_int129:
4410 case Ijk_Sys_int130:
4411 case Ijk_Sys_int145:
4412 case Ijk_Sys_int210:
4413 case Ijk_Sys_syscall:
4414 case Ijk_Sys_sysenter:
4415 case Ijk_InvalICache:
4416 case Ijk_Yield:
4418 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4419 addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
4420 return;
4422 default:
4423 break;
4426 /* Do we ever expect to see any other kind? */
4427 goto stmt_fail;
4430 default: break;
4432 stmt_fail:
4433 ppIRStmt(stmt);
4434 vpanic("iselStmt");
4438 /*---------------------------------------------------------*/
4439 /*--- ISEL: Basic block terminators (Nexts) ---*/
4440 /*---------------------------------------------------------*/
4442 static void iselNext ( ISelEnv* env,
4443 IRExpr* next, IRJumpKind jk, Int offsIP )
4445 if (vex_traceflags & VEX_TRACE_VCODE) {
4446 vex_printf( "\n-- PUT(%d) = ", offsIP);
4447 ppIRExpr( next );
4448 vex_printf( "; exit-");
4449 ppIRJumpKind(jk);
4450 vex_printf( "\n");
4453 /* Case: boring transfer to known address */
4454 if (next->tag == Iex_Const) {
4455 IRConst* cdst = next->Iex.Const.con;
4456 vassert(cdst->tag == Ico_U32);
4457 if (jk == Ijk_Boring || jk == Ijk_Call) {
4458 /* Boring transfer to known address */
4459 X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4460 if (env->chainingAllowed) {
4461 /* .. almost always true .. */
4462 /* Skip the event check at the dst if this is a forwards
4463 edge. */
4464 Bool toFastEP
4465 = ((Addr32)cdst->Ico.U32) > env->max_ga;
4466 if (0) vex_printf("%s", toFastEP ? "X" : ".");
4467 addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
4468 amEIP, Xcc_ALWAYS,
4469 toFastEP));
4470 } else {
4471 /* .. very occasionally .. */
4472 /* We can't use chaining, so ask for an assisted transfer,
4473 as that's the only alternative that is allowable. */
4474 HReg r = iselIntExpr_R(env, next);
4475 addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4476 Ijk_Boring));
4477 }
4478 return;
4482 /* Case: call/return (==boring) transfer to any address */
4483 switch (jk) {
4484 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4485 HReg r = iselIntExpr_R(env, next);
4486 X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4487 if (env->chainingAllowed) {
4488 addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
4489 } else {
4490 addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
4491 Ijk_Boring));
4492 }
4493 return;
4495 default:
4496 break;
4499 /* Case: assisted transfer to arbitrary address */
4500 switch (jk) {
4501 /* Keep this list in sync with that for Ist_Exit above */
4502 case Ijk_ClientReq:
4503 case Ijk_EmWarn:
4504 case Ijk_MapFail:
4505 case Ijk_NoDecode:
4506 case Ijk_NoRedir:
4507 case Ijk_SigSEGV:
4508 case Ijk_SigTRAP:
4509 case Ijk_Sys_int128:
4510 case Ijk_Sys_int129:
4511 case Ijk_Sys_int130:
4512 case Ijk_Sys_int145:
4513 case Ijk_Sys_int210:
4514 case Ijk_Sys_syscall:
4515 case Ijk_Sys_sysenter:
4516 case Ijk_InvalICache:
4517 case Ijk_Yield:
4519 HReg r = iselIntExpr_R(env, next);
4520 X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
4521 addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
4522 return;
4524 default:
4525 break;
4528 vex_printf( "\n-- PUT(%d) = ", offsIP);
4529 ppIRExpr( next );
4530 vex_printf( "; exit-");
4531 ppIRJumpKind(jk);
4532 vex_printf( "\n");
4533 vassert(0); // are we expecting any other kind?
4537 /*---------------------------------------------------------*/
4538 /*--- Insn selector top-level ---*/
4539 /*---------------------------------------------------------*/
4541 /* Translate an entire SB to x86 code. */
4543 HInstrArray* iselSB_X86 ( const IRSB* bb,
4544 VexArch arch_host,
4545 const VexArchInfo* archinfo_host,
4546 const VexAbiInfo* vbi/*UNUSED*/,
4547 Int offs_Host_EvC_Counter,
4548 Int offs_Host_EvC_FailAddr,
4549 Bool chainingAllowed,
4550 Bool addProfInc,
4551 Addr max_ga )
4553 Int i, j;
4554 HReg hreg, hregHI;
4555 ISelEnv* env;
4556 UInt hwcaps_host = archinfo_host->hwcaps;
4557 X86AMode *amCounter, *amFailAddr;
4559 /* sanity ... */
4560 vassert(arch_host == VexArchX86);
4561 vassert(0 == (hwcaps_host
4562 & ~(VEX_HWCAPS_X86_MMXEXT
4563 | VEX_HWCAPS_X86_SSE1
4564 | VEX_HWCAPS_X86_SSE2
4565 | VEX_HWCAPS_X86_SSE3
4566 | VEX_HWCAPS_X86_LZCNT)));
4568 /* Check that the host's endianness is as expected. */
4569 vassert(archinfo_host->endness == VexEndnessLE);
4571 /* Make up an initial environment to use. */
4572 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4573 env->vreg_ctr = 0;
4575 /* Set up output code array. */
4576 env->code = newHInstrArray();
4578 /* Copy BB's type env. */
4579 env->type_env = bb->tyenv;
4581 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
4582 change as we go along. */
4583 env->n_vregmap = bb->tyenv->types_used;
4584 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4585 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4587 /* and finally ... */
4588 env->chainingAllowed = chainingAllowed;
4589 env->hwcaps = hwcaps_host;
4590 env->max_ga = max_ga;
4592 /* For each IR temporary, allocate a suitably-kinded virtual
4593 register. */
4594 j = 0;
4595 for (i = 0; i < env->n_vregmap; i++) {
4596 hregHI = hreg = INVALID_HREG;
4597 switch (bb->tyenv->types[i]) {
4598 case Ity_I1:
4599 case Ity_I8:
4600 case Ity_I16:
4601 case Ity_I32: hreg = mkHReg(True, HRcInt32, 0, j++); break;
4602 case Ity_I64: hreg = mkHReg(True, HRcInt32, 0, j++);
4603 hregHI = mkHReg(True, HRcInt32, 0, j++); break;
4604 case Ity_F32:
4605 case Ity_F64: hreg = mkHReg(True, HRcFlt64, 0, j++); break;
4606 case Ity_V128: hreg = mkHReg(True, HRcVec128, 0, j++); break;
4607 default: ppIRType(bb->tyenv->types[i]);
4608 vpanic("iselBB: IRTemp type");
4610 env->vregmap[i] = hreg;
4611 env->vregmapHI[i] = hregHI;
4613 env->vreg_ctr = j;
4615 /* The very first instruction must be an event check. */
4616 amCounter = X86AMode_IR(offs_Host_EvC_Counter, hregX86_EBP());
4617 amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
4618 addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
4620 /* Possibly a block counter increment (for profiling). At this
4621 point we don't know the address of the counter, so just pretend
4622 it is zero. It will have to be patched later, but before this
4623 translation is used, by a call to LibVEX_PatchProfInc. */
4624 if (addProfInc) {
4625 addInstr(env, X86Instr_ProfInc());
4628 /* Ok, finally we can iterate over the statements. */
4629 for (i = 0; i < bb->stmts_used; i++)
4630 iselStmt(env, bb->stmts[i]);
4632 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4634 /* record the number of vregs we used. */
4635 env->code->n_vregs = env->vreg_ctr;
4636 return env->code;
4640 /*---------------------------------------------------------------*/
4641 /*--- end host_x86_isel.c ---*/
4642 /*---------------------------------------------------------------*/