/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_generic_simd256.h"
#include "host_amd64_maddf.h"
#include "host_generic_maddf.h"
#include "host_amd64_defs.h"
/*---------------------------------------------------------*/
/*--- x87/SSE control word stuff                         ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged at exit.
*/

#define DEFAULT_FPUCW 0x027F

#define DEFAULT_MXCSR 0x1F80

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */
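
/* For reference (an explanatory note, not used by the code below):
   0x027F sets the x87 control word with all six exception mask bits
   (bits 0..5) set, precision control = 10b (53-bit mantissa) and
   rounding control = 00b (round to nearest).  Likewise 0x1F80 sets
   the six MXCSR exception mask bits (bits 7..12) and leaves its
   rounding-control field (bits 13..14) at 00b, round to nearest. */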
/*---------------------------------------------------------*/
/*--- misc helpers                                       ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-amd64/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}
/*---------------------------------------------------------*/
/*--- ISelEnv                                            ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register is associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

     - vregmap   holds the primary register for the IRTemp.
     - vregmapHI is only used for 128-bit integer-typed
       IRTemps.  It holds the identity of a second
       64-bit virtual HReg, which holds the high half
       of the value.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - A Bool for indicating whether we may generate chain-me
     instructions for control flow transfers, or whether we must use
     XAssisted.

   - The maximum guest address of any guest insn in this block.
     Actually, the address of the highest-addressed byte from any insn
     in this block.  Is set at the start and does not change.  This is
     used for detecting jumps which are definitely forward-edges from
     this block, and therefore can be made (chained) to the fast entry
     point of the destination, thereby avoiding the destination's
     event check.

   Note, this is all host-independent.  (JRS 20050201: well, kinda
   ... not completely.  Compare with ISelEnv for X86.)
*/

typedef
   struct {
      /* Constant -- are set at the start and do not change. */
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      UInt         hwcaps;

      Bool         chainingAllowed;
      Addr64       max_ga;

      /* These are modified as we go along. */
      HInstrArray* code;
      Int          vreg_ctr;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
                               ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, AMD64Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppAMD64Instr(instr, True);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}
/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                         ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.  You should not
   call the _wrk version directly.
*/
static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );

static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );

static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );

static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );

static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );

static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );
static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );

static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64CondCode iselCondCode_C     ( ISelEnv* env, const IRExpr* e );

static HReg          iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselCondCode_R     ( ISelEnv* env, const IRExpr* e );

static HReg          iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselDblExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselFltExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselVecExpr     ( ISelEnv* env, const IRExpr* e );

static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );
static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                        ISelEnv* env, const IRExpr* e );
/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                 ---*/
/*---------------------------------------------------------*/

static Bool sane_AMode ( AMD64AMode* am )
{
   switch (am->tag) {
      case Aam_IR:
         return
            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
                    && (hregIsVirtual(am->Aam.IR.reg)
                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
      case Aam_IRRS:
         return
            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.base)
                    && hregClass(am->Aam.IRRS.index) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.index) );
      default:
        vpanic("sane_AMode: unknown amd64 amode tag");
   }
}

/* Can the lower 32 bits be signedly widened to produce the whole
   64-bit value?  In other words, are the top 33 bits either all 0 or
   all 1 ? */
static Bool fitsIn32Bits ( ULong x )
{
   Long y1;
   y1 = x << 32;
   y1 >>=/*s*/ 32;
   return toBool(x == y1);
}
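
/* For example (an illustration only, not used below):
   fitsIn32Bits(0xFFFFFFFF80000000ULL) is True, since sign-extending
   the low 32 bits (0x80000000) reproduces the full value, whereas
   fitsIn32Bits(0x0000000080000000ULL) is False, because widening
   0x80000000 signedly gives 0xFFFFFFFF80000000. */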
/* Is this a 64-bit zero expression? */
static Bool isZeroU64 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}

static Bool isZeroU32 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

/* Are both args atoms and the same?  This is a copy of eqIRAtom
   that omits the assertions that the args are indeed atoms. */
static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
   return False;
}

/* Make an int reg-reg move. */
static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt64);
   vassert(hregClass(dst) == HRcInt64);
   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
}

/* Make a vector (128 bit) reg-reg move. */
static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
}
/* Advance/retreat %rsp by n. */
static void add_to_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

static void sub_from_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
                              hregAMD64_RSP()));
}

/* Push a 64-bit constant on the stack. */
static void push_uimm64( ISelEnv* env, ULong uimm64 )
{
   /* If uimm64 can be expressed as the sign extension of its
      lower 32 bits, we can do it the easy way. */
   Long simm64 = (Long)uimm64;
   if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
   } else {
      HReg tmp = newVRegI(env);
      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
   }
}
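
/* For example (illustration only): push_uimm64(env, ~0ULL) can take
   the single-instruction path, since pushing the 32-bit immediate
   0xFFFFFFFF sign-extends back to the full value, whereas
   push_uimm64(env, 0x80000000ULL) cannot, and goes via a temporary
   register instead. */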
/* Used only in doHelperCall.  If possible, produce a single
   instruction which computes 'e' into 'dst'.  If not possible, return
   NULL. */

static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
                                                    HReg     dst,
                                                    IRExpr*  e )
{
   /* Per comments in doHelperCall below, appearance of
      Iex_VECRET implies ill-formed IR. */
   vassert(e->tag != Iex_VECRET);

   /* In this case we give out a copy of the BaseBlock pointer. */
   if (UNLIKELY(e->tag == Iex_GSPTR)) {
      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
   }

   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);

   if (e->tag == Iex_Const) {
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
         return AMD64Instr_Alu64R(
                   Aalu_MOV,
                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
                   dst
                );
      } else {
         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
      }
   }

   if (e->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      return mk_iMOVsd_RR(src, dst);
   }

   if (e->tag == Iex_Get) {
      vassert(e->Iex.Get.ty == Ity_I64);
      return AMD64Instr_Alu64R(
                Aalu_MOV,
                AMD64RMI_Mem(
                   AMD64AMode_IR(e->Iex.Get.offset,
                                 hregAMD64_RBP())),
                dst);
   }

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_32Uto64
       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
      return AMD64Instr_MovxLQ(False, src, dst);
   }

   if (0) { ppIRExpr(e); vex_printf("\n"); }

   return NULL;
}
/* Do a complete function call.  |guard| is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional.  |retloc| is set to indicate where the
   return value is after the call.  The caller (of this fn) must
   generate code to add |stackAdjustAfterCall| to the stack pointer
   after the call is done. */

static
void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
                    /*OUT*/RetLoc* retloc,
                    ISelEnv* env,
                    IRExpr* guard,
                    IRCallee* cee, IRType retTy, IRExpr** args )
{
   AMD64CondCode cc;
   HReg          argregs[6];
   HReg          tmpregs[6];
   AMD64Instr*   fastinstrs[6];
   UInt          n_args, i;

   /* Set default returns.  We'll update them later if needed. */
   *stackAdjustAfterCall = 0;
   *retloc               = mk_RetLoc_INVALID();

   /* These are used for cross-checking that IR-level constraints on
      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
   UInt nVECRETs = 0;
   UInt nGSPTRs  = 0;

   /* Marshal args for a call and do the call.

      This function only deals with a tiny set of possibilities, which
      cover all helpers in practice.  The restrictions are that only
      arguments in registers are supported, hence only 6x64 integer
      bits in total can be passed.  In fact the only supported arg
      type is I64.

      The return type can be I{64,32,16,8} or V{128,256}.  In the
      latter two cases, it is expected that |args| will contain the
      special node IRExpr_VECRET(), in which case this routine
      generates code to allocate space on the stack for the vector
      return value.  Since we are not passing any scalars on the
      stack, it is enough to preallocate the return space before
      marshalling any arguments, in this case.

      |args| may also contain IRExpr_GSPTR(), in which case the
      value in %rbp is passed as the corresponding argument.

      Generating code which is both efficient and correct when
      parameters are to be passed in registers is difficult, for the
      reasons elaborated in detail in comments attached to
      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
      of the method described in those comments.

      The problem is split into two cases: the fast scheme and the
      slow scheme.  In the fast scheme, arguments are computed
      directly into the target (real) registers.  This is only safe
      when we can be sure that computation of each argument will not
      trash any real registers set by computation of any other
      argument.

      In the slow scheme, all args are first computed into vregs, and
      once they are all done, they are moved to the relevant real
      regs.  This always gives correct code, but it also gives a bunch
      of vreg-to-rreg moves which are usually redundant but are hard
      for the register allocator to get rid of.

      To decide which scheme to use, all argument expressions are
      first examined.  If they are all so simple that it is clear they
      will be evaluated without use of any fixed registers, use the
      fast scheme, else use the slow scheme.  Note also that only
      unconditional calls may use the fast scheme, since having to
      compute a condition expression could itself trash real
      registers.  Note that for simplicity, in the case where
      IRExpr_VECRET() is present, we use the slow scheme.  This is
      motivated by the desire to avoid any possible complexity
      w.r.t. nested calls.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this insn
      selector works.  Currently just the following 3 are regarded as
      safe -- hopefully they cover the majority of arguments in
      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */

   /* Note that the cee->regparms field is meaningless on AMD64 host
      (since there is only one calling convention) and so we always
      ignore it. */
   n_args = 0;
   for (i = 0; args[i]; i++)
      n_args++;

   if (n_args > 6)
      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");

   argregs[0] = hregAMD64_RDI();
   argregs[1] = hregAMD64_RSI();
   argregs[2] = hregAMD64_RDX();
   argregs[3] = hregAMD64_RCX();
   argregs[4] = hregAMD64_R8();
   argregs[5] = hregAMD64_R9();

   tmpregs[0] = tmpregs[1] = tmpregs[2] =
   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;

   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;

   /* First decide which scheme (slow or fast) is to be used.  First
      assume the fast scheme, and select slow if any contraindications
      (wow) appear. */

   /* We'll need space on the stack for the return value.  Avoid
      possible complications with nested calls by using the slow
      scheme. */
   if (retTy == Ity_V128 || retTy == Ity_V256)
      goto slowscheme;

   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional */
      } else {
         /* Not manifestly unconditional -- be conservative. */
         goto slowscheme;
      }
   }

   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
      use the slow scheme.  Because this is tentative, we can't call
      addInstr (that is, commit to) any instructions until we've
      handled all the arguments.  So park the resulting instructions
      in a buffer and emit that if we're successful. */

   /* FAST SCHEME */
   /* In this loop, we process args that can be computed into the
      destination (real) register with a single instruction, without
      using any fixed regs.  That also includes IRExpr_GSPTR(), but
      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
      never see IRExpr_VECRET() at this point, since the return-type
      check above should ensure all those cases use the slow scheme
      instead. */
   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
      }
      fastinstrs[i]
         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
      if (fastinstrs[i] == NULL)
         goto slowscheme;
   }

   /* Looks like we're in luck.  Emit the accumulated instructions and
      move on to doing the call itself. */
   for (i = 0; i < n_args; i++)
      addInstr(env, fastinstrs[i]);

   /* Fast scheme only applies for unconditional calls.  Hence: */
   cc = Acc_ALWAYS;

   goto handle_call;

   /* SLOW SCHEME; move via temporaries */
  slowscheme:
   {}
#  if 0 /* debug only */
   if (n_args > 0) {for (i = 0; args[i]; i++) {
   ppIRExpr(args[i]); vex_printf(" "); }
   vex_printf("\n");}
#  endif

   /* If we have a vector return type, allocate a place for it on the
      stack and record its address. */
   HReg r_vecRetAddr = INVALID_HREG;
   if (retTy == Ity_V128) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 16);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }
   else if (retTy == Ity_V256) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 32);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }

   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (UNLIKELY(arg->tag == Iex_GSPTR)) {
         tmpregs[i] = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
         nGSPTRs++;
      }
      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
         /* We stashed the address of the return slot earlier, so just
            retrieve it now. */
         vassert(!hregIsInvalid(r_vecRetAddr));
         tmpregs[i] = r_vecRetAddr;
         nVECRETs++;
      }
      else {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
         tmpregs[i] = iselIntExpr_R(env, args[i]);
      }
   }

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Acc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode_C( env, guard );
      }
   }

   /* Move the args to their final destinations. */
   for (i = 0; i < n_args; i++) {
      /* None of these insns, including any spill code that might
         be generated, may alter the condition codes. */
      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
   }


   /* Do final checks, set the return values, and generate the call
      instruction proper. */
  handle_call:

   if (retTy == Ity_V128 || retTy == Ity_V256) {
      vassert(nVECRETs == 1);
   } else {
      vassert(nVECRETs == 0);
   }

   vassert(nGSPTRs == 0 || nGSPTRs == 1);

   vassert(*stackAdjustAfterCall == 0);
   vassert(is_RetLoc_INVALID(*retloc));
   switch (retTy) {
      case Ity_INVALID:
         /* Function doesn't return a value. */
         *retloc = mk_RetLoc_simple(RLPri_None);
         break;
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
         *retloc = mk_RetLoc_simple(RLPri_Int);
         break;
      case Ity_V128:
         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
         *stackAdjustAfterCall = 16;
         break;
      case Ity_V256:
         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
         *stackAdjustAfterCall = 32;
         break;
      default:
         /* IR can denote other possible return types, but we don't
            handle those here. */
         vassert(0);
   }

   /* Finally, generate the call itself.  This needs the *retloc value
      set in the switch above, which is why it's at the end. */
   addInstr(env,
            AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
}
/* Given a guest-state array descriptor, an index expression and a
   bias, generate an AMD64AMode holding the relevant guest state
   offset. */

static
AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                  IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;

   /* Throw out any cases not generated by an amd64 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-amd64-guest on amd64 host. */

   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
      vpanic("genGuestArrayOffset(amd64 host)");

   /* Compute off into a reg, %off.  Then return:

        movq %off, %tmp
        addq $bias, %tmp         (if bias != 0)
        andq $7, %tmp
        ... base(%rbp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      /* Make sure the bias is sane, in the sense that there are
         no significant bits above bit 30 in it. */
      vassert(-10000 < bias && bias < 10000);
      addInstr(env,
               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
   }
   addInstr(env,
            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
   vassert(elemSz == 1 || elemSz == 8);
   return
      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
                       elemSz==8 ? 3 : 0);
}
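
/* Worked example (illustration only, not used by the code): for an
   8-element array of I64s at guest-state offset base, an index
   expression off and bias 1, the amode produced is effectively
   base(%rbp, %tmp, 8) where %tmp = (off + 1) & 7, i.e. the access is
   kept inside the 8-element window by wrapping modulo 8. */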
/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
static
void set_SSE_rounding_default ( ISelEnv* env )
{
   /* pushq $DEFAULT_MXCSR
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* movq $DEFAULT_FPUCW, -8(%rsp)
      fldcw -8(%rsp)
   */
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64M(
                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
   have the same rounding.
*/
static
void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   /* Note: this sequence only makes sense because DEFAULT_MXCSR has
      both rounding bits == 0.  If that wasn't the case, we couldn't
      create a new rounding field simply by ORing the new value into
      place. */

   /* movq $3, %reg
      andq [[mode]], %reg  -- shouldn't be needed; paranoia
      shlq $13, %reg
      orq $DEFAULT_MXCSR, %reg
      pushq %reg
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   HReg        reg      = newVRegI(env);
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                   iselIntExpr_RMI(env, mode), reg));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
   addInstr(env, AMD64Instr_Alu64R(
                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());

   /* movq  %rrm, %rrm2
      andq  $3, %rrm2   -- shouldn't be needed; paranoia
      shlq  $10, %rrm2
      orq   $DEFAULT_FPUCW, %rrm2
      movq  %rrm2, -8(%rsp)
      fldcw -8(%rsp)
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
                                   AMD64RI_Reg(rrm2), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
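
/* Illustration (not part of the generated code): for IRRoundingMode
   value 2 (round towards +infinity), set_SSE_rounding_mode ORs
   2 << 13 = 0x4000 into DEFAULT_MXCSR, giving %mxcsr = 0x5F80, while
   set_FPU_rounding_mode ORs 2 << 10 = 0x800 into DEFAULT_FPUCW,
   giving %fpucw = 0x0A7F.  Both rely on the default control words
   having their rounding-control bits equal to zero. */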
/* Generate all-zeroes into a new vector register.
*/
static HReg generate_zeroes_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
   return dst;
}

/* Generate all-ones into a new vector register.
*/
static HReg generate_ones_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
   return dst;
}


/* Generate !src into a new vector register.  Amazing that there isn't
   a less crappy way to do this.
*/
static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
{
   HReg dst = generate_ones_V128(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
   return dst;
}


/* Expand the given byte into a 64-bit word, by cloning each bit
   8 times. */
static ULong bitmask8_to_bytemask64 ( UShort w8 )
{
   vassert(w8 == (w8 & 0xFF));
   ULong w64 = 0;
   Int i;
   for (i = 0; i < 8; i++) {
      if (w8 & (1<<i))
         w64 |= (0xFFULL << (8 * i));
   }
   return w64;
}
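
/* For instance (illustration only): bitmask8_to_bytemask64(0x81)
   yields 0xFF000000000000FFULL -- bits 0 and 7 of the input select
   bytes 0 and 7 of the result. */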
/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (64/32/16/8 bit)         ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 64, 32, 16 and 8-bit type.  All
   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
   expressions, the upper 32/48/56 bits are arbitrary, so you should
   mask or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt64);
   vassert(hregIsVirtual(r));
   return r;
}
/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
{
   MatchInfo mi;
   DECLARE_PATTERN(p_1Uto8_64to1);
   DECLARE_PATTERN(p_LDle8_then_8Uto64);
   DECLARE_PATTERN(p_LDle16_then_16Uto64);

   IRType ty = typeOfIRExpr(env->type_env,e);
   switch (ty) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
      default: vassert(0);
   }

   switch (e->tag) {

      /* --------- TEMP --------- */
      case Iex_RdTmp: {
         return lookupIRTemp(env, e->Iex.RdTmp.tmp);
      }

      /* --------- LOAD --------- */
      case Iex_Load: {
         HReg dst = newVRegI(env);
         AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

         /* We can't handle big-endian loads, nor load-linked. */
         if (e->Iex.Load.end != Iend_LE)
            goto irreducible;

         if (ty == Ity_I64) {
            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
                                            AMD64RMI_Mem(amode), dst) );
            return dst;
         }
         if (ty == Ity_I32) {
            addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
            return dst;
         }
         if (ty == Ity_I16) {
            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
         if (ty == Ity_I8) {
            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
         break;
      }
      /* --------- BINARY OP --------- */
      case Iex_Binop: {
         AMD64AluOp   aluOp;
         AMD64ShiftOp shOp;

         /* Pattern: Sub64(0,x) */
         /*     and: Sub32(0,x) */
         if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
             || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
            HReg dst = newVRegI(env);
            HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(reg,dst));
            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
            return dst;
         }

         /* Is it an addition or logical style op? */
         switch (e->Iex.Binop.op) {
            case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
               aluOp = Aalu_ADD; break;
            case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
               aluOp = Aalu_SUB; break;
            case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
               aluOp = Aalu_AND; break;
            case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
               aluOp = Aalu_OR; break;
            case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
               aluOp = Aalu_XOR; break;
            case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
               aluOp = Aalu_MUL; break;
            default:
               aluOp = Aalu_INVALID; break;
         }
         /* For commutative ops we assume any literal
            values are on the second operand. */
         if (aluOp != Aalu_INVALID) {
            HReg dst      = newVRegI(env);
            HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
            AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(reg,dst));
            addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
            return dst;
         }

         /* Perhaps a shift op? */
         switch (e->Iex.Binop.op) {
            case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
               shOp = Ash_SHL; break;
            case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
               shOp = Ash_SHR; break;
            case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
               shOp = Ash_SAR; break;
            default:
               shOp = Ash_INVALID; break;
         }
         if (shOp != Ash_INVALID) {
            HReg dst = newVRegI(env);

            /* regL = the value to be shifted */
            HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(regL,dst));

            /* Do any necessary widening for 16/8 bit operands.  Also decide
               on the final width at which the shift is to be done. */
            Bool shift64 = False;
            switch (e->Iex.Binop.op) {
               case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
                  shift64 = True;
                  break;
               case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
                  break;
               case Iop_Shr8:
                  addInstr(env, AMD64Instr_Alu64R(
                                   Aalu_AND, AMD64RMI_Imm(0xFF), dst));
                  break;
               case Iop_Shr16:
                  addInstr(env, AMD64Instr_Alu64R(
                                   Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
                  break;
               case Iop_Shr32:
                  break;
               case Iop_Sar8:
                  addInstr(env, AMD64Instr_Sh32(Ash_SHL, 24, dst));
                  addInstr(env, AMD64Instr_Sh32(Ash_SAR, 24, dst));
                  break;
               case Iop_Sar16:
                  addInstr(env, AMD64Instr_Sh32(Ash_SHL, 16, dst));
                  addInstr(env, AMD64Instr_Sh32(Ash_SAR, 16, dst));
                  break;
               case Iop_Sar32:
                  break;
               default:
                  ppIROp(e->Iex.Binop.op);
                  vassert(0);
            }

            /* Now consider the shift amount.  If it's a literal, we
               can do a much better job than the general case. */
            if (e->Iex.Binop.arg2->tag == Iex_Const) {
               /* assert that the IR is well-typed */
               Int nshift;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
               vassert(nshift >= 0);
               if (nshift > 0) {
                  /* Can't allow nshift==0 since that means %cl */
                  if (shift64) {
                     addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
                  } else {
                     addInstr(env, AMD64Instr_Sh32(shOp, nshift, dst));
                  }
               }
            } else {
               /* General case; we have to force the amount into %cl. */
               HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
               addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
               if (shift64) {
                  addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
               } else {
                  addInstr(env, AMD64Instr_Sh32(shOp, 0/* %cl */, dst));
               }
            }
            return dst;
         }
         /* Handle misc other scalar ops. */
         if (e->Iex.Binop.op == Iop_Max32U) {
            HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg dst  = newVRegI(env);
            HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(src1, dst));
            addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
            addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_DivModS64to32
             || e->Iex.Binop.op == Iop_DivModU64to32) {
            /* 64 x 32 -> (32(rem),32(div)) division */
            /* Get the 64-bit operand into edx:eax, and the other into
               any old R/M. */
            HReg rax = hregAMD64_RAX();
            HReg rdx = hregAMD64_RDX();
            HReg dst = newVRegI(env);
            Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
            AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
            /* Compute the left operand into a reg, and then
               put the top half in edx and the bottom in eax. */
            HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(left64, rdx));
            addInstr(env, mk_iMOVsd_RR(left64, rax));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
            addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
            addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
            addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
            addInstr(env, mk_iMOVsd_RR(rax, dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_32HLto64) {
            HReg hi32  = newVRegI(env);
            HReg lo32  = newVRegI(env);
            HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
            addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
            addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo32), hi32));
            return hi32;
         }

         if (e->Iex.Binop.op == Iop_16HLto32) {
            HReg hi16  = newVRegI(env);
            HReg lo16  = newVRegI(env);
            HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
            addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo16), hi16));
            return hi16;
         }

         if (e->Iex.Binop.op == Iop_8HLto16) {
            HReg hi8  = newVRegI(env);
            HReg lo8  = newVRegI(env);
            HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
            addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
            addInstr(env, AMD64Instr_Alu64R(
                             Aalu_OR, AMD64RMI_Reg(lo8), hi8));
            return hi8;
         }

         if (e->Iex.Binop.op == Iop_MullS32
             || e->Iex.Binop.op == Iop_MullS16
             || e->Iex.Binop.op == Iop_MullS8
             || e->Iex.Binop.op == Iop_MullU32
             || e->Iex.Binop.op == Iop_MullU16
             || e->Iex.Binop.op == Iop_MullU8) {
            HReg a32  = newVRegI(env);
            HReg b32  = newVRegI(env);
            HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
            Int          shift  = 0;
            AMD64ShiftOp shr_op = Ash_SHR;
            switch (e->Iex.Binop.op) {
               case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
               case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
               case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
               case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
               case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
               case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
               default: vassert(0);
            }

            addInstr(env, mk_iMOVsd_RR(a32s, a32));
            addInstr(env, mk_iMOVsd_RR(b32s, b32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
            addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
            addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
            return b32;
         }

         if (e->Iex.Binop.op == Iop_CmpF64) {
            HReg fL  = iselDblExpr(env, e->Iex.Binop.arg1);
            HReg fR  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegI(env);
            addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
            /* Mask out irrelevant parts of the result so as to conform
               to the CmpF64 definition. */
            addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
            return dst;
         }

         if (e->Iex.Binop.op == Iop_F64toI32S
             || e->Iex.Binop.op == Iop_F64toI64S) {
            Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegI(env);
            set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
            addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
            set_SSE_rounding_default(env);
            return dst;
         }
         /* Deal with 64-bit SIMD binary ops.  For the most part these are
            doable by using the equivalent 128-bit operation and ignoring
            the upper half of the result. */
         AMD64SseOp op = Asse_INVALID;
         Bool arg1isEReg = False;
         Bool preShift32R = False;
         switch (e->Iex.Binop.op) {
            // The following 3 could be done with 128 bit insns too, but
            // first require the inputs to be reformatted.
            //case Iop_QNarrowBin32Sto16Sx4:
            //op = Asse_PACKSSD; arg1isEReg = True; break;
            //case Iop_QNarrowBin16Sto8Sx8:
            //op = Asse_PACKSSW; arg1isEReg = True; break;
            //case Iop_QNarrowBin16Sto8Ux8:
            //op = Asse_PACKUSW; arg1isEReg = True; break;

            case Iop_InterleaveHI8x8:
               op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveHI16x4:
               op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveHI32x2:
               op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
               break;
            case Iop_InterleaveLO8x8:
               op = Asse_UNPCKLB; arg1isEReg = True;
               break;
            case Iop_InterleaveLO16x4:
               op = Asse_UNPCKLW; arg1isEReg = True;
               break;
            case Iop_InterleaveLO32x2:
               op = Asse_UNPCKLD; arg1isEReg = True;
               break;

            case Iop_Add8x8:     op = Asse_ADD8;     break;
            case Iop_Add16x4:    op = Asse_ADD16;    break;
            case Iop_Add32x2:    op = Asse_ADD32;    break;
            case Iop_QAdd8Sx8:   op = Asse_QADD8S;   break;
            case Iop_QAdd16Sx4:  op = Asse_QADD16S;  break;
            case Iop_QAdd8Ux8:   op = Asse_QADD8U;   break;
            case Iop_QAdd16Ux4:  op = Asse_QADD16U;  break;
            case Iop_Avg8Ux8:    op = Asse_AVG8U;    break;
            case Iop_Avg16Ux4:   op = Asse_AVG16U;   break;
            case Iop_CmpEQ8x8:   op = Asse_CMPEQ8;   break;
            case Iop_CmpEQ16x4:  op = Asse_CMPEQ16;  break;
            case Iop_CmpEQ32x2:  op = Asse_CMPEQ32;  break;
            case Iop_CmpGT8Sx8:  op = Asse_CMPGT8S;  break;
            case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
            case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
            case Iop_Max16Sx4:   op = Asse_MAX16S;   break;
            case Iop_Max8Ux8:    op = Asse_MAX8U;    break;
            case Iop_Min16Sx4:   op = Asse_MIN16S;   break;
            case Iop_Min8Ux8:    op = Asse_MIN8U;    break;
            case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
            case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
            case Iop_Mul16x4:    op = Asse_MUL16;    break;
            case Iop_Sub8x8:     op = Asse_SUB8;     break;
            case Iop_Sub16x4:    op = Asse_SUB16;    break;
            case Iop_Sub32x2:    op = Asse_SUB32;    break;
            case Iop_QSub8Sx8:   op = Asse_QSUB8S;   break;
            case Iop_QSub16Sx4:  op = Asse_QSUB16S;  break;
            case Iop_QSub8Ux8:   op = Asse_QSUB8U;   break;
            case Iop_QSub16Ux4:  op = Asse_QSUB16U;  break;
            default: break;
         }
         if (op != Asse_INVALID) {
            /* This isn't pretty, but .. move each arg to the low half of an
               XMM register, do the operation on the whole register, and move
               the result back to an integer register. */
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
            HReg iarg1 = iselIntExpr_R(env, arg1);
            HReg iarg2 = iselIntExpr_R(env, arg2);
            HReg varg1 = newVRegV(env);
            HReg varg2 = newVRegV(env);
            HReg idst  = newVRegI(env);
            addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
            addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
            if (arg1isEReg) {
               if (preShift32R) {
                  addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
                  addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
               }
               addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
               addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
            } else {
               vassert(!preShift32R);
               addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
               addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
            }
            return idst;
         }
         UInt laneBits = 0;
         op = Asse_INVALID;
         switch (e->Iex.Binop.op) {
            case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
            case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
            case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
            case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
            case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
            case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
            default: break;
         }
         if (op != Asse_INVALID) {
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
            HReg igreg = iselIntExpr_R(env, arg1);
            HReg vgreg = newVRegV(env);
            HReg idst  = newVRegI(env);
            addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
            /* If it's a shift by an in-range immediate, generate a single
               instruction. */
            if (arg2->tag == Iex_Const) {
               IRConst* c = arg2->Iex.Const.con;
               vassert(c->tag == Ico_U8);
               UInt shift = c->Ico.U8;
               if (shift < laneBits) {
                  addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
                  addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
                  return idst;
               }
            }
            /* Otherwise we have to do it the longwinded way. */
            HReg ishift = iselIntExpr_R(env, arg2);
            HReg vshift = newVRegV(env);
            addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
            addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
            addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
            return idst;
         }
         if (e->Iex.Binop.op == Iop_Mul32x2) {
            const IRExpr* arg1 = e->Iex.Binop.arg1;
            const IRExpr* arg2 = e->Iex.Binop.arg2;
            vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
            vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
            HReg s1 = iselIntExpr_R(env, arg1);
            HReg s2 = iselIntExpr_R(env, arg2);
            HReg resLo = newVRegI(env);
            // resLo = (s1 *64 s2) & 0xFFFF'FFFF
            addInstr(env, mk_iMOVsd_RR(s1, resLo));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
            addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));

            // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
            HReg resHi = newVRegI(env);
            addInstr(env, mk_iMOVsd_RR(s1, resHi));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
            HReg tmp = newVRegI(env);
            addInstr(env, mk_iMOVsd_RR(s2, tmp));
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));

            // final result = resHi | resLo
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
            return resLo;
         }

         // A few remaining SIMD64 ops require helper functions, at least for
         // now.
         Bool second_is_UInt = False;
         HWord fn = 0;
         switch (e->Iex.Binop.op) {
            case Iop_CatOddLanes16x4:
               fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
            case Iop_CatEvenLanes16x4:
               fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
            case Iop_PermOrZero8x8:
               fn = (HWord)h_generic_calc_PermOrZero8x8; break;

            case Iop_QNarrowBin32Sto16Sx4:
               fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
            case Iop_QNarrowBin16Sto8Sx8:
               fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
            case Iop_QNarrowBin16Sto8Ux8:
               fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;

            case Iop_NarrowBin16to8x8:
               fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
            case Iop_NarrowBin32to16x4:
               fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;

            case Iop_SarN8x8:
               fn = (HWord)h_generic_calc_SarN8x8;
               second_is_UInt = True;
               break;

            default:
               fn = (HWord)0; break;
         }
         if (fn != (HWord)0) {
            /* Note: the following assumes all helpers are of signature
                  ULong fn ( ULong, ULong ), and they are
               not marked as regparm functions.
            */
            HReg dst  = newVRegI(env);
            HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
            HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            if (second_is_UInt)
               addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
            addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
            addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
                                           mk_RetLoc_simple(RLPri_Int) ));
            addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
            return dst;
         }

         // Half-float vector conversion
         if (e->Iex.Binop.op == Iop_F32toF16x4
             && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
            HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
            HReg dstV = newVRegV(env);
            HReg dstI = newVRegI(env);
            set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
            addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
            set_SSE_rounding_default(env);
            addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
            return dstI;
         }

         break;
      }
      /* --------- UNARY OP --------- */
      case Iex_Unop: {

         /* 1Uto8(64to1(expr64)) */
         {
            DEFINE_PATTERN( p_1Uto8_64to1,
                            unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
            if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
               const IRExpr* expr64 = mi.bindee[0];
               HReg    dst = newVRegI(env);
               HReg    src = iselIntExpr_R(env, expr64);
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                               AMD64RMI_Imm(1), dst));
               return dst;
            }
         }

         /* 8Uto64(LDle(expr64)) */
         {
            DEFINE_PATTERN(p_LDle8_then_8Uto64,
                           unop(Iop_8Uto64,
                                IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
            if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
               HReg dst = newVRegI(env);
               AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
               addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
               return dst;
            }
         }

         /* 16Uto64(LDle(expr64)) */
         {
            DEFINE_PATTERN(p_LDle16_then_16Uto64,
                           unop(Iop_16Uto64,
                                IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
            if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
               HReg dst = newVRegI(env);
               AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
               addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
               return dst;
            }
         }

         /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
            Use 32 bit arithmetic and let the default zero-extend rule
            do the 32Uto64 for free. */
         if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
            IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
            IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
            IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
            AMD64AluOp aluOp = Aalu_INVALID;
            switch (opi) {
               case Iop_Add32: aluOp = Aalu_ADD; break;
               case Iop_Sub32: aluOp = Aalu_SUB; break;
               case Iop_And32: aluOp = Aalu_AND; break;
               case Iop_Or32:  aluOp = Aalu_OR;  break;
               case Iop_Xor32: aluOp = Aalu_XOR; break;
               default: break;
            }
            if (aluOp != Aalu_INVALID) {
               /* For commutative ops we assume any literal values are on
                  the second operand. */
               HReg dst      = newVRegI(env);
               HReg reg      = iselIntExpr_R(env, argL);
               AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
               addInstr(env, mk_iMOVsd_RR(reg,dst));
               addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
               return dst;
            }
            /* just fall through to normal handling for Iop_32Uto64 */
         }
         /* Fallback cases */
         switch (e->Iex.Unop.op) {
            case Iop_32Uto64:
            case Iop_32Sto64: {
               HReg dst = newVRegI(env);
               HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
               addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
                                               src, dst) );
               return dst;
            }
            case Iop_128HIto64: {
               HReg rHi, rLo;
               iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
               return rHi; /* and abandon rLo */
            }
            case Iop_128to64: {
               HReg rHi, rLo;
               iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
               return rLo; /* and abandon rHi */
            }
            case Iop_8Uto16:
            case Iop_8Uto32:
            case Iop_8Uto64:
            case Iop_16Uto64:
            case Iop_16Uto32: {
               HReg dst     = newVRegI(env);
               HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
               Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
                                      || e->Iex.Unop.op==Iop_16Uto64 );
               UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                               AMD64RMI_Imm(mask), dst));
               return dst;
            }
            case Iop_8Sto16:
            case Iop_8Sto64:
            case Iop_8Sto32:
            case Iop_16Sto32:
            case Iop_16Sto64: {
               HReg dst     = newVRegI(env);
               HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
               Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
                                      || e->Iex.Unop.op==Iop_16Sto64 );
               UInt amt     = srcIs16 ? 48 : 56;
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
               return dst;
            }
            case Iop_Not8:
            case Iop_Not16:
            case Iop_Not32:
            case Iop_Not64: {
               HReg dst = newVRegI(env);
               HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
               return dst;
            }
            case Iop_16HIto8:
            case Iop_32HIto16:
            case Iop_64HIto32: {
               HReg dst  = newVRegI(env);
               HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
               Int shift = 0;
               switch (e->Iex.Unop.op) {
                  case Iop_16HIto8:  shift = 8;  break;
                  case Iop_32HIto16: shift = 16; break;
                  case Iop_64HIto32: shift = 32; break;
                  default: vassert(0);
               }
               addInstr(env, mk_iMOVsd_RR(src,dst) );
               addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
               return dst;
            }
            case Iop_1Uto64:
            case Iop_1Uto32:
            case Iop_1Uto8: {
               HReg dst           = newVRegI(env);
               AMD64CondCode cond = iselCondCode_C(env, e->Iex.Unop.arg);
               addInstr(env, AMD64Instr_Set64(cond,dst));
               return dst;
            }
            case Iop_1Sto8:
            case Iop_1Sto16:
            case Iop_1Sto32:
            case Iop_1Sto64: {
               HReg dst = newVRegI(env);
               HReg tmp = iselCondCode_R(env, e->Iex.Unop.arg);
               addInstr(env, mk_iMOVsd_RR(tmp, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
               addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
               return dst;
            }
1631 case Iop_Ctz64: {
1632 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1633 HReg dst = newVRegI(env);
1634 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1635 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1636 return dst;
1638 case Iop_Clz64: {
1639 /* Count leading zeroes. Do 'bsrq' to establish the index
1640 of the highest set bit, and subtract that value from
1641 63. */
1642 HReg tmp = newVRegI(env);
1643 HReg dst = newVRegI(env);
1644 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1646 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1647 AMD64RMI_Imm(63), dst));
1648 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1649 AMD64RMI_Reg(tmp), dst));
1650 return dst;
1653 case Iop_CmpwNEZ64: {
1654 HReg dst = newVRegI(env);
1655 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1656 addInstr(env, mk_iMOVsd_RR(src,dst));
1657 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1658 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1659 AMD64RMI_Reg(src), dst));
1660 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1661 return dst;
1664 case Iop_CmpwNEZ32: {
1665 HReg src = newVRegI(env);
1666 HReg dst = newVRegI(env);
1667 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1668 addInstr(env, mk_iMOVsd_RR(pre,src));
1669 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1670 addInstr(env, mk_iMOVsd_RR(src,dst));
1671 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1672 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1673 AMD64RMI_Reg(src), dst));
1674 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1675 return dst;
1678 case Iop_Left8:
1679 case Iop_Left16:
1680 case Iop_Left32:
1681 case Iop_Left64: {
1682 HReg dst = newVRegI(env);
1683 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1684 addInstr(env, mk_iMOVsd_RR(src, dst));
1685 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1686 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1687 return dst;
1690 case Iop_V128to32: {
1691 HReg dst = newVRegI(env);
1692 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1693 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1694 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1695 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1696 return dst;
1699 /* V128{HI}to64 */
1700 case Iop_V128to64: {
1701 HReg dst = newVRegI(env);
1702 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1703 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1704 return dst;
1706 case Iop_V128HIto64: {
1707 HReg dst = newVRegI(env);
1708 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1709 HReg vec2 = newVRegV(env);
1710 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1711 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1712 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1713 return dst;
1716 /* V256to64_{3,2,1,0} */
1717 case Iop_V256to64_0: case Iop_V256to64_1:
1718 case Iop_V256to64_2: case Iop_V256to64_3: {
1719 HReg vHi, vLo, vec;
1720 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1721 /* Do the first part of the selection by deciding which of
1722 the 128-bit registers to look at, and the second part using
1723 the same scheme as for V128{HI}to64 above. */
1724 Bool low64of128 = True;
1725 switch (e->Iex.Unop.op) {
1726 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1727 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1728 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1729 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1730 default: vassert(0);
1732 HReg dst = newVRegI(env);
1733 if (low64of128) {
1734 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1735 } else {
1736 HReg vec2 = newVRegV(env);
1737 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1738 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1739 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1741 return dst;
1744 /* ReinterpF64asI64(e) */
1745 /* Given an IEEE754 double, produce an I64 with the same bit
1746 pattern. */
1747 case Iop_ReinterpF64asI64: {
1748 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1749 HReg dst = newVRegI(env);
1750 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1751 /* paranoia */
1752 set_SSE_rounding_default(env);
1753 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1754 addInstr(env, AMD64Instr_Alu64R(
1755 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1756 return dst;
1759 /* ReinterpF32asI32(e) */
1760 /* Given an IEEE754 single, produce an I64 with the same bit
1761 pattern in the lower half. */
1762 case Iop_ReinterpF32asI32: {
1763 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1764 HReg dst = newVRegI(env);
1765 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1766 /* paranoia */
1767 set_SSE_rounding_default(env);
1768 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1769 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1770 return dst;
1773 case Iop_16to8:
1774 case Iop_32to8:
1775 case Iop_64to8:
1776 case Iop_32to16:
1777 case Iop_64to16:
1778 case Iop_64to32:
1779 /* These are no-ops. */
1780 return iselIntExpr_R(env, e->Iex.Unop.arg);
1782 case Iop_GetMSBs8x8: {
1783 /* Note: the following assumes the helper is of
1784 signature
1785 UInt fn ( ULong ), and is not a regparm fn.  */
1787 HReg dst = newVRegI(env);
1788 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1789 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1790 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1791 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1792 1, mk_RetLoc_simple(RLPri_Int) ));
1793 /* MovxLQ is not exactly the right thing here. We just
1794 need to get the bottom 8 bits of RAX into dst, and zero
1795 out everything else. Assuming that the helper returns
1796 a UInt with the top 24 bits zeroed out, it'll do,
1797 though. */
1798 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1799 return dst;
1802 case Iop_GetMSBs8x16: {
1803 /* Note: the following assumes the helper is of signature
1804 UInt fn ( ULong w64hi, ULong w64Lo ),
1805 and is not a regparm fn. */
1806 HReg dst = newVRegI(env);
1807 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1808 HReg rsp = hregAMD64_RSP();
1809 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1810 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1811 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1812 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1813 16, vec, m16_rsp));
1814 /* hi 64 bits into RDI -- the first arg */
1815 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1816 AMD64RMI_Mem(m8_rsp),
1817 hregAMD64_RDI() )); /* 1st arg */
1818 /* lo 64 bits into RSI -- the 2nd arg */
1819 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1820 AMD64RMI_Mem(m16_rsp),
1821 hregAMD64_RSI() )); /* 2nd arg */
1822 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1823 2, mk_RetLoc_simple(RLPri_Int) ));
1824 /* MovxLQ is not exactly the right thing here. We just
1825 need to get the bottom 16 bits of RAX into dst, and zero
1826 out everything else. Assuming that the helper returns
1827 a UInt with the top 16 bits zeroed out, it'll do,
1828 though. */
1829 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1830 return dst;
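         /* Hedged sketch of what the helper is assumed to return (an
            assumption about h_generic_calc_GetMSBs8x16, not a copy of it):
            the top bit of each of the 16 bytes, with the w64lo bytes landing
            in result bits 0..7 and the w64hi bytes in bits 8..15, matching
            the RDI(hi)/RSI(lo) marshalling above. */
# if 0
         { ULong w64hi = 0x8000000000000080ULL; /* bytes 15 and 8: MSB set */
           ULong w64lo = 0x0000000000000080ULL; /* byte 0: MSB set */
           UInt  i, msbs = 0;
           for (i = 0; i < 8; i++) {
              if (w64lo & (0x80ULL << (8*i))) msbs |= 1u << i;
              if (w64hi & (0x80ULL << (8*i))) msbs |= 1u << (i+8);
           }
           vassert(msbs == 0x8101u);
         }
# endif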
1833 default:
1834 break;
1837 /* Deal with unary 64-bit SIMD ops. */
1838 HWord fn = 0;
1839 switch (e->Iex.Unop.op) {
1840 case Iop_CmpNEZ32x2:
1841 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1842 case Iop_CmpNEZ16x4:
1843 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1844 case Iop_CmpNEZ8x8:
1845 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1846 default:
1847 fn = (HWord)0; break;
1849 if (fn != (HWord)0) {
1850 /* Note: the following assumes all helpers are of
1851 signature
1852 ULong fn ( ULong ), and they are
1853 not marked as regparm functions.  */
1855 HReg dst = newVRegI(env);
1856 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1857 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1858 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1859 mk_RetLoc_simple(RLPri_Int) ));
1860 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1861 return dst;
1864 break;
1867 /* --------- GET --------- */
1868 case Iex_Get: {
1869 if (ty == Ity_I64) {
1870 HReg dst = newVRegI(env);
1871 addInstr(env, AMD64Instr_Alu64R(
1872 Aalu_MOV,
1873 AMD64RMI_Mem(
1874 AMD64AMode_IR(e->Iex.Get.offset,
1875 hregAMD64_RBP())),
1876 dst));
1877 return dst;
1879 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1880 HReg dst = newVRegI(env);
1881 addInstr(env, AMD64Instr_LoadEX(
1882 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1883 False,
1884 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1885 dst));
1886 return dst;
1888 break;
1891 case Iex_GetI: {
1892 AMD64AMode* am
1893 = genGuestArrayOffset(
1894 env, e->Iex.GetI.descr,
1895 e->Iex.GetI.ix, e->Iex.GetI.bias );
1896 HReg dst = newVRegI(env);
1897 if (ty == Ity_I8) {
1898 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1899 return dst;
1901 if (ty == Ity_I64) {
1902 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1903 return dst;
1905 break;
1908 /* --------- CCALL --------- */
1909 case Iex_CCall: {
1910 HReg dst = newVRegI(env);
1911 vassert(ty == e->Iex.CCall.retty);
1913 /* be very restrictive for now. Only 64-bit ints allowed for
1914 args, and 64 or 32 bits for return type. */
1915 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1916 goto irreducible;
1918 /* Marshal args, do the call. */
1919 UInt addToSp = 0;
1920 RetLoc rloc = mk_RetLoc_INVALID();
1921 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1922 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1923 vassert(is_sane_RetLoc(rloc));
1924 vassert(rloc.pri == RLPri_Int);
1925 vassert(addToSp == 0);
1927 /* Move to dst, and zero out the top 32 bits if the result type is
1928 Ity_I32. Probably overkill, but still .. */
1929 if (e->Iex.CCall.retty == Ity_I64)
1930 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1931 else
1932 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1934 return dst;
1937 /* --------- LITERAL --------- */
1938 /* 64/32/16/8-bit literals */
1939 case Iex_Const:
1940 if (ty == Ity_I64) {
1941 HReg r = newVRegI(env);
1942 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1943 return r;
1944 } else {
1945 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1946 HReg r = newVRegI(env);
1947 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1948 return r;
1951 /* --------- MULTIPLEX --------- */
1952 case Iex_ITE: { // VFD
1953 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1954 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1955 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1956 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1957 HReg dst = newVRegI(env);
1958 addInstr(env, mk_iMOVsd_RR(r1,dst));
1959 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
1960 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1961 return dst;
1963 break;
1966 /* --------- TERNARY OP --------- */
1967 case Iex_Triop: {
1968 IRTriop *triop = e->Iex.Triop.details;
1969 /* C3210 flags following FPU partial remainder (fprem), both
1970 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1971 if (triop->op == Iop_PRemC3210F64
1972 || triop->op == Iop_PRem1C3210F64) {
1973 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1974 HReg arg1 = iselDblExpr(env, triop->arg2);
1975 HReg arg2 = iselDblExpr(env, triop->arg3);
1976 HReg dst = newVRegI(env);
1977 addInstr(env, AMD64Instr_A87Free(2));
1979 /* one arg -> top of x87 stack */
1980 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1981 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1983 /* other arg -> top of x87 stack */
1984 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1985 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1987 switch (triop->op) {
1988 case Iop_PRemC3210F64:
1989 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1990 break;
1991 case Iop_PRem1C3210F64:
1992 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1993 break;
1994 default:
1995 vassert(0);
1997 /* Ignore the result, and instead make off with the FPU's
1998 C3210 flags (in the status word). */
1999 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
2000 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
2001 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
2002 return dst;
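         /* Illustrative note, not part of the original code: 0x4700 keeps
            exactly the x87 condition bits of the status word read back by
            A87StSW -- C0 (bit 8), C1 (bit 9), C2 (bit 10), C3 (bit 14). */
# if 0
         { UShort FSW_C0 = 0x0100, FSW_C1 = 0x0200,
                  FSW_C2 = 0x0400, FSW_C3 = 0x4000;
           vassert((UShort)(FSW_C0 | FSW_C1 | FSW_C2 | FSW_C3) == 0x4700);
         }
# endif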
2004 break;
2007 default:
2008 break;
2009 } /* switch (e->tag) */
2011 /* We get here if no pattern matched. */
2012 irreducible:
2013 ppIRExpr(e);
2014 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2018 /*---------------------------------------------------------*/
2019 /*--- ISEL: Integer expression auxiliaries ---*/
2020 /*---------------------------------------------------------*/
2022 /* --------------------- AMODEs --------------------- */
2024 /* Return an AMode which computes the value of the specified
2025 expression, possibly also adding insns to the code list as a
2026 result. The expression may only be a 64-bit one.  */
2029 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2031 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2032 vassert(sane_AMode(am));
2033 return am;
2036 /* DO NOT CALL THIS DIRECTLY ! */
2037 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2039 MatchInfo mi;
2040 DECLARE_PATTERN(p_complex);
2041 IRType ty = typeOfIRExpr(env->type_env,e);
2042 vassert(ty == Ity_I64);
2044 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2045 /* bind0 bind1 bind2 bind3 */
2046 DEFINE_PATTERN(p_complex,
2047 binop( Iop_Add64,
2048 binop( Iop_Add64,
2049 bind(0),
2050 binop(Iop_Shl64, bind(1), bind(2))
2052 bind(3)
2055 if (matchIRExpr(&mi, p_complex, e)) {
2056 const IRExpr* expr1 = mi.bindee[0];
2057 const IRExpr* expr2 = mi.bindee[1];
2058 const IRExpr* imm8 = mi.bindee[2];
2059 const IRExpr* simm32 = mi.bindee[3];
2060 if (imm8->tag == Iex_Const
2061 && imm8->Iex.Const.con->tag == Ico_U8
2062 && imm8->Iex.Const.con->Ico.U8 < 4
2063 /* imm8 is OK, now check simm32 */
2064 && simm32->tag == Iex_Const
2065 && simm32->Iex.Const.con->tag == Ico_U64
2066 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2067 UInt shift = imm8->Iex.Const.con->Ico.U8;
2068 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2069 HReg r1 = iselIntExpr_R(env, expr1);
2070 HReg r2 = iselIntExpr_R(env, expr2);
2071 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2072 return AMD64AMode_IRRS(offset, r1, r2, shift);
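      /* Worked example, illustrative only: an IR tree such as
            Add64( Add64(t_base, Shl64(t_index, 3:I8)), 0x28:I64 )
         (hypothetical temps t_base/t_index) matches this pattern and turns
         into the amode 0x28(base,index,8), i.e. */
# if 0
      { AMD64AMode* example = AMD64AMode_IRRS(0x28, r1, r2, 3);
        (void)example;
      }
# endif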
2076 /* Add64(expr1, Shl64(expr2, imm)) */
2077 if (e->tag == Iex_Binop
2078 && e->Iex.Binop.op == Iop_Add64
2079 && e->Iex.Binop.arg2->tag == Iex_Binop
2080 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2081 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2082 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2083 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2084 if (shift == 1 || shift == 2 || shift == 3) {
2085 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2086 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2087 return AMD64AMode_IRRS(0, r1, r2, shift);
2091 /* Add64(expr,i) */
2092 if (e->tag == Iex_Binop
2093 && e->Iex.Binop.op == Iop_Add64
2094 && e->Iex.Binop.arg2->tag == Iex_Const
2095 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2096 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2097 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2098 return AMD64AMode_IR(
2099 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2104 /* Doesn't match anything in particular. Generate it into
2105 a register and use that. */
2107 HReg r1 = iselIntExpr_R(env, e);
2108 return AMD64AMode_IR(0, r1);
2113 /* --------------------- RMIs --------------------- */
2115 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2116 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2118 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2120 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2121 /* sanity checks ... */
2122 switch (rmi->tag) {
2123 case Armi_Imm:
2124 return rmi;
2125 case Armi_Reg:
2126 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2127 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2128 return rmi;
2129 case Armi_Mem:
2130 vassert(sane_AMode(rmi->Armi.Mem.am));
2131 return rmi;
2132 default:
2133 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2137 /* DO NOT CALL THIS DIRECTLY ! */
2138 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2140 IRType ty = typeOfIRExpr(env->type_env,e);
2141 vassert(ty == Ity_I64 || ty == Ity_I32
2142 || ty == Ity_I16 || ty == Ity_I8);
2144 /* special case: immediate 64/32/16/8 */
2145 if (e->tag == Iex_Const) {
2146 switch (e->Iex.Const.con->tag) {
2147 case Ico_U64:
2148 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2149 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2151 break;
2152 case Ico_U32:
2153 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2154 case Ico_U16:
2155 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2156 case Ico_U8:
2157 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2158 default:
2159 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2163 /* special case: 64-bit GET */
2164 if (e->tag == Iex_Get && ty == Ity_I64) {
2165 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2166 hregAMD64_RBP()));
2169 /* special case: 64-bit load from memory */
2170 if (e->tag == Iex_Load && ty == Ity_I64
2171 && e->Iex.Load.end == Iend_LE) {
2172 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2173 return AMD64RMI_Mem(am);
2176 /* default case: calculate into a register and return that */
2178 HReg r = iselIntExpr_R ( env, e );
2179 return AMD64RMI_Reg(r);
2184 /* --------------------- RIs --------------------- */
2186 /* Calculate an expression into an AMD64RI operand. As with
2187 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2188 bits. */
2190 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2192 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2193 /* sanity checks ... */
2194 switch (ri->tag) {
2195 case Ari_Imm:
2196 return ri;
2197 case Ari_Reg:
2198 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2199 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2200 return ri;
2201 default:
2202 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2206 /* DO NOT CALL THIS DIRECTLY ! */
2207 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2209 IRType ty = typeOfIRExpr(env->type_env,e);
2210 vassert(ty == Ity_I64 || ty == Ity_I32
2211 || ty == Ity_I16 || ty == Ity_I8);
2213 /* special case: immediate */
2214 if (e->tag == Iex_Const) {
2215 switch (e->Iex.Const.con->tag) {
2216 case Ico_U64:
2217 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2218 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2220 break;
2221 case Ico_U32:
2222 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2223 case Ico_U16:
2224 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2225 case Ico_U8:
2226 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2227 default:
2228 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2232 /* default case: calculate into a register and return that */
2234 HReg r = iselIntExpr_R ( env, e );
2235 return AMD64RI_Reg(r);
2240 /* --------------------- RMs --------------------- */
2242 /* Similarly, calculate an expression into an AMD64RM operand. As
2243 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2244 bits. */
2246 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2248 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2249 /* sanity checks ... */
2250 switch (rm->tag) {
2251 case Arm_Reg:
2252 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2253 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2254 return rm;
2255 case Arm_Mem:
2256 vassert(sane_AMode(rm->Arm.Mem.am));
2257 return rm;
2258 default:
2259 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2263 /* DO NOT CALL THIS DIRECTLY ! */
2264 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2266 IRType ty = typeOfIRExpr(env->type_env,e);
2267 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2269 /* special case: 64-bit GET */
2270 if (e->tag == Iex_Get && ty == Ity_I64) {
2271 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2272 hregAMD64_RBP()));
2275 /* special case: load from memory */
2277 /* default case: calculate into a register and return that */
2279 HReg r = iselIntExpr_R ( env, e );
2280 return AMD64RM_Reg(r);
2285 /* --------------------- CONDCODE as %rflag test --------------------- */
2287 /* Generate code to evaluate a bit-typed expression, returning the
2288 condition code which would be set if the expression had
2289 notionally returned 1.
2291 Note that iselCondCode_C and iselCondCode_R are mutually recursive. For
2292 future changes to either of them, take care not to introduce an infinite
2293 loop involving the two of them.  */
2295 static AMD64CondCode iselCondCode_C ( ISelEnv* env, const IRExpr* e )
2297 /* Uh, there's nothing we can sanity check here, unfortunately. */
2298 return iselCondCode_C_wrk(env,e);
2301 /* DO NOT CALL THIS DIRECTLY ! */
2302 static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e )
2304 vassert(e);
2305 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2307 /* var */
2308 if (e->tag == Iex_RdTmp) {
2309 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2310 addInstr(env, AMD64Instr_Test64(1,r64));
2311 return Acc_NZ;
2314 /* Constant 1:Bit */
2315 if (e->tag == Iex_Const) {
2316 HReg r;
2317 vassert(e->Iex.Const.con->tag == Ico_U1);
2318 vassert(e->Iex.Const.con->Ico.U1 == True
2319 || e->Iex.Const.con->Ico.U1 == False);
2320 r = newVRegI(env);
2321 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2322 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2323 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2326 /* Not1(...) */
2327 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2328 /* Generate code for the arg, and negate the test condition */
2329 return 1 ^ iselCondCode_C(env, e->Iex.Unop.arg);
2332 /* --- patterns rooted at: 64to1 --- */
2334 /* 64to1 */
2335 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2336 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2337 addInstr(env, AMD64Instr_Test64(1,reg));
2338 return Acc_NZ;
2341 /* --- patterns rooted at: 32to1 --- */
2343 /* 32to1 */
2344 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2345 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2346 addInstr(env, AMD64Instr_Test64(1,reg));
2347 return Acc_NZ;
2350 /* --- patterns rooted at: CmpNEZ8 --- */
2352 /* CmpNEZ8(x) */
2353 if (e->tag == Iex_Unop
2354 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2355 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2356 addInstr(env, AMD64Instr_Test64(0xFF,r));
2357 return Acc_NZ;
2360 /* --- patterns rooted at: CmpNEZ16 --- */
2362 /* CmpNEZ16(x) */
2363 if (e->tag == Iex_Unop
2364 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2365 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2366 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2367 return Acc_NZ;
2370 /* --- patterns rooted at: CmpNEZ32 --- */
2372 if (e->tag == Iex_Unop
2373 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2374 IRExpr* arg = e->Iex.Unop.arg;
2375 if (arg->tag == Iex_Binop
2376 && (arg->Iex.Binop.op == Iop_Or32
2377 || arg->Iex.Binop.op == Iop_And32)) {
2378 /* CmpNEZ32(Or32(x,y)) */
2379 /* CmpNEZ32(And32(x,y)) */
2380 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2381 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2382 HReg tmp = newVRegI(env);
2383 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2384 addInstr(env, AMD64Instr_Alu32R(
2385 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2386 rmi1, tmp));
2387 return Acc_NZ;
2389 /* CmpNEZ32(x) */
2390 HReg r1 = iselIntExpr_R(env, arg);
2391 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2392 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2393 return Acc_NZ;
2396 /* --- patterns rooted at: CmpNEZ64 --- */
2398 if (e->tag == Iex_Unop
2399 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2400 IRExpr* arg = e->Iex.Unop.arg;
2401 if (arg->tag == Iex_Binop
2402 && (arg->Iex.Binop.op == Iop_Or64
2403 || arg->Iex.Binop.op == Iop_And64)) {
2404 /* CmpNEZ64(Or64(x,y)) */
2405 /* CmpNEZ64(And64(x,y)) */
2406 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2407 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2408 HReg tmp = newVRegI(env);
2409 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2410 addInstr(env, AMD64Instr_Alu64R(
2411 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2412 rmi1, tmp));
2413 return Acc_NZ;
2415 /* CmpNEZ64(x) */
2416 HReg r1 = iselIntExpr_R(env, arg);
2417 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2418 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2419 return Acc_NZ;
2422 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2424 /* CmpEQ8 / CmpNE8 */
2425 if (e->tag == Iex_Binop
2426 && (e->Iex.Binop.op == Iop_CmpEQ8
2427 || e->Iex.Binop.op == Iop_CmpNE8
2428 || e->Iex.Binop.op == Iop_CasCmpEQ8
2429 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2430 if (isZeroU8(e->Iex.Binop.arg2)) {
2431 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2432 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2433 switch (e->Iex.Binop.op) {
2434 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2435 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2436 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,0:I8)");
2438 } else {
2439 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2440 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2441 HReg r = newVRegI(env);
2442 addInstr(env, mk_iMOVsd_RR(r1,r));
2443 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2444 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2445 switch (e->Iex.Binop.op) {
2446 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2447 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2448 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,expr)");
2453 /* CmpEQ16 / CmpNE16 */
2454 if (e->tag == Iex_Binop
2455 && (e->Iex.Binop.op == Iop_CmpEQ16
2456 || e->Iex.Binop.op == Iop_CmpNE16
2457 || e->Iex.Binop.op == Iop_CasCmpEQ16
2458 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2459 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2460 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2461 HReg r = newVRegI(env);
2462 addInstr(env, mk_iMOVsd_RR(r1,r));
2463 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2464 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2465 switch (e->Iex.Binop.op) {
2466 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2467 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2468 default: vpanic("iselCondCode_C(amd64): CmpXX16");
2472 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2473 Saves a "movq %rax, %tmp" compared to the default route. */
2474 if (e->tag == Iex_Binop
2475 && e->Iex.Binop.op == Iop_CmpNE64
2476 && e->Iex.Binop.arg1->tag == Iex_CCall
2477 && e->Iex.Binop.arg2->tag == Iex_Const) {
2478 IRExpr* cal = e->Iex.Binop.arg1;
2479 IRExpr* con = e->Iex.Binop.arg2;
2480 HReg tmp = newVRegI(env);
2481 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2482 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2483 vassert(con->Iex.Const.con->tag == Ico_U64);
2484 /* Marshal args, do the call. */
2485 UInt addToSp = 0;
2486 RetLoc rloc = mk_RetLoc_INVALID();
2487 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2488 cal->Iex.CCall.cee,
2489 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2490 vassert(is_sane_RetLoc(rloc));
2491 vassert(rloc.pri == RLPri_Int);
2492 vassert(addToSp == 0);
2493 /* */
2494 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2495 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2496 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2497 return Acc_NZ;
2500 /* Cmp*64*(x,y) */
2501 if (e->tag == Iex_Binop
2502 && (e->Iex.Binop.op == Iop_CmpEQ64
2503 || e->Iex.Binop.op == Iop_CmpNE64
2504 || e->Iex.Binop.op == Iop_CmpLT64S
2505 || e->Iex.Binop.op == Iop_CmpLT64U
2506 || e->Iex.Binop.op == Iop_CmpLE64S
2507 || e->Iex.Binop.op == Iop_CmpLE64U
2508 || e->Iex.Binop.op == Iop_CasCmpEQ64
2509 || e->Iex.Binop.op == Iop_CasCmpNE64
2510 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2511 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2512 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2513 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2514 switch (e->Iex.Binop.op) {
2515 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2516 case Iop_CmpNE64:
2517 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2518 case Iop_CmpLT64S: return Acc_L;
2519 case Iop_CmpLT64U: return Acc_B;
2520 case Iop_CmpLE64S: return Acc_LE;
2521 case Iop_CmpLE64U: return Acc_BE;
2522 default: vpanic("iselCondCode_C(amd64): CmpXX64");
2526 /* Cmp*32*(x,y) */
2527 if (e->tag == Iex_Binop
2528 && (e->Iex.Binop.op == Iop_CmpEQ32
2529 || e->Iex.Binop.op == Iop_CmpNE32
2530 || e->Iex.Binop.op == Iop_CmpLT32S
2531 || e->Iex.Binop.op == Iop_CmpLT32U
2532 || e->Iex.Binop.op == Iop_CmpLE32S
2533 || e->Iex.Binop.op == Iop_CmpLE32U
2534 || e->Iex.Binop.op == Iop_CasCmpEQ32
2535 || e->Iex.Binop.op == Iop_CasCmpNE32
2536 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2537 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2538 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2539 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2540 switch (e->Iex.Binop.op) {
2541 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2542 case Iop_CmpNE32:
2543 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2544 case Iop_CmpLT32S: return Acc_L;
2545 case Iop_CmpLT32U: return Acc_B;
2546 case Iop_CmpLE32S: return Acc_LE;
2547 case Iop_CmpLE32U: return Acc_BE;
2548 default: vpanic("iselCondCode_C(amd64): CmpXX32");
2552 /* And1(x,y), Or1(x,y) */
2553 if (e->tag == Iex_Binop
2554 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2555 // Get the result in an int reg, then test the least significant bit.
2556 HReg tmp = iselCondCode_R(env, e);
2557 addInstr(env, AMD64Instr_Test64(1, tmp));
2558 return Acc_NZ;
2561 ppIRExpr(e);
2562 vpanic("iselCondCode_C(amd64)");
2566 /* --------------------- CONDCODE as int reg --------------------- */
2568 /* Generate code to evaluate a bit-typed expression, returning the resulting
2569 value in bit 0 of an integer register. WARNING: all of the other bits in the
2570 register can be arbitrary. Callers must mask them off or otherwise ignore
2571 them, as necessary.
2573 Note that iselCondCode_C and iselCondCode_R are mutually recursive. For
2574 future changes to either of them, take care not to introduce an infinite
2575 loop involving the two of them.  */
2577 static HReg iselCondCode_R ( ISelEnv* env, const IRExpr* e )
2579 /* Uh, there's nothing we can sanity check here, unfortunately. */
2580 return iselCondCode_R_wrk(env,e);
2583 /* DO NOT CALL THIS DIRECTLY ! */
2584 static HReg iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e )
2586 vassert(e);
2587 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2589 /* var */
2590 if (e->tag == Iex_RdTmp) {
2591 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2594 /* And1(x,y), Or1(x,y) */
2595 if (e->tag == Iex_Binop
2596 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2597 HReg x_as_64 = iselCondCode_R(env, e->Iex.Binop.arg1);
2598 HReg y_as_64 = iselCondCode_R(env, e->Iex.Binop.arg2);
2599 HReg res = newVRegI(env);
2600 addInstr(env, mk_iMOVsd_RR(y_as_64, res));
2601 AMD64AluOp aop = e->Iex.Binop.op == Iop_And1 ? Aalu_AND : Aalu_OR;
2602 addInstr(env, AMD64Instr_Alu64R(aop, AMD64RMI_Reg(x_as_64), res));
2603 return res;
2606 /* Anything else, we hand off to iselCondCode_C and force the value into a
2607 register. */
2608 HReg res = newVRegI(env);
2609 AMD64CondCode cc = iselCondCode_C(env, e);
2610 addInstr(env, AMD64Instr_Set64(cc, res));
2611 return res;
2613 // PJF old debug code? - unreachable
2615 ppIRExpr(e);
2616 vpanic("iselCondCode_R(amd64)");
2621 /*---------------------------------------------------------*/
2622 /*--- ISEL: Integer expressions (128 bit) ---*/
2623 /*---------------------------------------------------------*/
2625 /* Compute a 128-bit value into a register pair, which is returned as
2626 the first two parameters. As with iselIntExpr_R, these may be
2627 either real or virtual regs; in any case they must not be changed
2628 by subsequent code emitted by the caller. */
2630 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2631 ISelEnv* env, const IRExpr* e )
2633 iselInt128Expr_wrk(rHi, rLo, env, e);
2634 # if 0
2635 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2636 # endif
2637 vassert(hregClass(*rHi) == HRcInt64);
2638 vassert(hregIsVirtual(*rHi));
2639 vassert(hregClass(*rLo) == HRcInt64);
2640 vassert(hregIsVirtual(*rLo));
2643 /* DO NOT CALL THIS DIRECTLY ! */
2644 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2645 ISelEnv* env, const IRExpr* e )
2647 vassert(e);
2648 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2650 /* read 128-bit IRTemp */
2651 if (e->tag == Iex_RdTmp) {
2652 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2653 return;
2656 /* --------- BINARY ops --------- */
2657 if (e->tag == Iex_Binop) {
2658 switch (e->Iex.Binop.op) {
2659 /* 64 x 64 -> 128 multiply */
2660 case Iop_MullU64:
2661 case Iop_MullS64: {
2662 /* get one operand into %rax, and the other into an R/M.
2663 Need to make an educated guess about which operand is
2664 better placed where. */
2665 HReg tLo = newVRegI(env);
2666 HReg tHi = newVRegI(env);
2667 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2668 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2669 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2670 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2671 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2672 /* Result is now in RDX:RAX. Tell the caller. */
2673 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2674 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2675 *rHi = tHi;
2676 *rLo = tLo;
2677 return;
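         /* Reference semantics of the RDX:RAX result -- a sketch only, using
            the GCC/Clang __int128 extension, which VEX itself does not rely
            on.  For MullS64 both operands are sign-extended to 128 bits
            before the multiply. */
# if 0
         { ULong a = 0xDEADBEEFCAFEBABEULL, b = 0x0123456789ABCDEFULL;
           unsigned __int128 p = (unsigned __int128)a * b;
           ULong lo = (ULong)p;          /* what MulL leaves in RAX */
           ULong hi = (ULong)(p >> 64);  /* what MulL leaves in RDX */
           (void)lo; (void)hi;
         }
# endif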
2680 /* 128 x 64 -> (64(rem),64(div)) division */
2681 case Iop_DivModU128to64:
2682 case Iop_DivModS128to64: {
2683 /* Get the 128-bit operand into rdx:rax, and the other into
2684 any old R/M. */
2685 HReg sHi, sLo;
2686 HReg tLo = newVRegI(env);
2687 HReg tHi = newVRegI(env);
2688 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2689 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2690 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2691 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2692 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2693 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2694 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2695 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2696 *rHi = tHi;
2697 *rLo = tLo;
2698 return;
2701 /* 64HLto128(e1,e2) */
2702 case Iop_64HLto128:
2703 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2704 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2705 return;
2707 default:
2708 break;
2710 } /* if (e->tag == Iex_Binop) */
2712 ppIRExpr(e);
2713 vpanic("iselInt128Expr");
2717 /*---------------------------------------------------------*/
2718 /*--- ISEL: Floating point expressions (32 bit) ---*/
2719 /*---------------------------------------------------------*/
2721 /* Nothing interesting here; really just wrappers for
2722 64-bit stuff. */
2724 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2726 HReg r = iselFltExpr_wrk( env, e );
2727 # if 0
2728 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2729 # endif
2730 vassert(hregClass(r) == HRcVec128);
2731 vassert(hregIsVirtual(r));
2732 return r;
2735 /* DO NOT CALL THIS DIRECTLY */
2736 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2738 IRType ty = typeOfIRExpr(env->type_env,e);
2739 vassert(ty == Ity_F32);
2741 if (e->tag == Iex_RdTmp) {
2742 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2745 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2746 AMD64AMode* am;
2747 HReg res = newVRegV(env);
2748 vassert(e->Iex.Load.ty == Ity_F32);
2749 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2750 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2751 return res;
2754 if (e->tag == Iex_Binop
2755 && e->Iex.Binop.op == Iop_F64toF32) {
2756 /* Although the result is still held in a standard SSE register,
2757 we need to round it to reflect the loss of accuracy/range
2758 entailed in casting it to a 32-bit float. */
2759 HReg dst = newVRegV(env);
2760 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2761 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2762 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2763 set_SSE_rounding_default( env );
2764 return dst;
2767 if (e->tag == Iex_Get) {
2768 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2769 hregAMD64_RBP() );
2770 HReg res = newVRegV(env);
2771 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2772 return res;
2775 if (e->tag == Iex_Unop
2776 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2777 /* Given an I32, produce an IEEE754 float with the same bit
2778 pattern. */
2779 HReg dst = newVRegV(env);
2780 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2781 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2782 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2783 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2784 return dst;
2787 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2788 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2789 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2790 HReg dst = newVRegV(env);
2792 /* 'arg' now holds the value to be rounded. The first thing to do
2793 is set the FPU's rounding mode accordingly. */
2795 /* Set host x87 rounding mode */
2796 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2798 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2799 addInstr(env, AMD64Instr_A87Free(1));
2800 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2801 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2802 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2803 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2805 /* Restore default x87 rounding. */
2806 set_FPU_rounding_default( env );
2808 return dst;
2811 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2812 /* Sigh ... very rough code. Could do much better. */
2813 /* Get the 128-bit literal 00---0 10---0 into a register
2814 and xor it with the value to be negated. */
2815 HReg r1 = newVRegI(env);
2816 HReg dst = newVRegV(env);
2817 HReg tmp = newVRegV(env);
2818 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2819 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2820 addInstr(env, mk_vMOVsd_RR(src,tmp));
2821 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2822 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2823 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2824 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2825 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2826 add_to_rsp(env, 16);
2827 return dst;
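      /* Illustrative sketch, not part of the original code: the Imm64/push/
         XOR sequence simply flips bit 31 (the IEEE754 single-precision sign
         bit) of the low lane, leaving the rest of the register alone. */
# if 0
      { UInt bits    = 0x40490FDBu;        /* approx.  3.14159f */
        UInt negated = bits ^ (1u << 31);  /* approx. -3.14159f */
        vassert(negated == 0xC0490FDBu);
      }
# endif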
2830 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2831 IRQop *qop = e->Iex.Qop.details;
2832 HReg dst = newVRegV(env);
2833 HReg argX = iselFltExpr(env, qop->arg2);
2834 HReg argY = iselFltExpr(env, qop->arg3);
2835 HReg argZ = iselFltExpr(env, qop->arg4);
2836 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
2837 vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
2838 if (dst.u32 != argX.u32)
2839 addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
2840 addInstr(env, AMD64Instr_Avx32FLo(Asse_VFMADD213, argY, argZ, dst));
2841 return dst;
2843 /* XXXROUNDINGFIXME */
2844 /* set roundingmode here */
2845 /* subq $16, %rsp -- make a space*/
2846 sub_from_rsp(env, 16);
2847 /* Prepare 4 arg regs:
2848 leaq 0(%rsp), %rdi
2849 leaq 4(%rsp), %rsi
2850 leaq 8(%rsp), %rdx
2851 leaq 12(%rsp), %rcx  */
2853 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2854 hregAMD64_RDI()));
2855 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2856 hregAMD64_RSI()));
2857 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2858 hregAMD64_RDX()));
2859 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2860 hregAMD64_RCX()));
2861 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2862 movss %argX, 0(%rsi)
2863 movss %argY, 0(%rdx)
2864 movss %argZ, 0(%rcx)  */
2866 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2867 AMD64AMode_IR(0, hregAMD64_RSI())));
2868 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2869 AMD64AMode_IR(0, hregAMD64_RDX())));
2870 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2871 AMD64AMode_IR(0, hregAMD64_RCX())));
2873 /* call the helper, preferring the FMA4 variant and falling back to the
2874 generic one.  (The FMA3 case was handled above without a helper.) */
2875 #if defined(VGA_amd64)
2876 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
2877 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2878 (ULong)(HWord)h_amd64_calc_MAddF32_fma4,
2879 4, mk_RetLoc_simple(RLPri_None) ));
2880 }else
2881 #endif
2883 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2884 (ULong)(HWord)h_generic_calc_MAddF32,
2885 4, mk_RetLoc_simple(RLPri_None) ));
2888 /* fetch the result from memory at 0(%rsp); %rsp itself is
2889 preserved across the helper call. */
2890 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2891 AMD64AMode_IR(0, hregAMD64_RSP())));
2892 /* and finally, clear the space */
2893 add_to_rsp(env, 16);
2894 return dst;
2897 if (e->tag == Iex_ITE) { // VFD
2898 HReg r1, r0, dst;
2899 vassert(ty == Ity_F32);
2900 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
2901 r1 = iselFltExpr(env, e->Iex.ITE.iftrue);
2902 r0 = iselFltExpr(env, e->Iex.ITE.iffalse);
2903 dst = newVRegV(env);
2904 addInstr(env, mk_vMOVsd_RR(r1,dst));
2905 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
2906 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
2907 return dst;
2910 ppIRExpr(e);
2911 vpanic("iselFltExpr_wrk");
2915 /*---------------------------------------------------------*/
2916 /*--- ISEL: Floating point expressions (64 bit) ---*/
2917 /*---------------------------------------------------------*/
2919 /* Compute a 64-bit floating point value into the lower half of an xmm
2920 register, the identity of which is returned. As with
2921 iselIntExpr_R, the returned reg will be virtual, and it must not be
2922 changed by subsequent code emitted by the caller.  */
2925 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2927       Type                 S (1 bit)   E (11 bits)   F (52 bits)
2928       ----                 ---------   -----------   -----------
2929       signalling NaN       u           2047 (max)    .0uuuuu---u
2930                                                      (with at least
2931                                                       one 1 bit)
2932       quiet NaN            u           2047 (max)    .1uuuuu---u
2934       negative infinity    1           2047 (max)    .000000---0
2936       positive infinity    0           2047 (max)    .000000---0
2938       negative zero        1           0             .000000---0
2940       positive zero        0           0             .000000---0   */
2943 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2945 HReg r = iselDblExpr_wrk( env, e );
2946 # if 0
2947 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2948 # endif
2949 vassert(hregClass(r) == HRcVec128);
2950 vassert(hregIsVirtual(r));
2951 return r;
2954 /* DO NOT CALL THIS DIRECTLY */
2955 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2957 IRType ty = typeOfIRExpr(env->type_env,e);
2958 vassert(e);
2959 vassert(ty == Ity_F64);
2961 if (e->tag == Iex_RdTmp) {
2962 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2965 if (e->tag == Iex_Const) {
2966 union { ULong u64; Double f64; } u;
2967 HReg res = newVRegV(env);
2968 HReg tmp = newVRegI(env);
2969 vassert(sizeof(u) == 8);
2970 vassert(sizeof(u.u64) == 8);
2971 vassert(sizeof(u.f64) == 8);
2973 if (e->Iex.Const.con->tag == Ico_F64) {
2974 u.f64 = e->Iex.Const.con->Ico.F64;
2976 else if (e->Iex.Const.con->tag == Ico_F64i) {
2977 u.u64 = e->Iex.Const.con->Ico.F64i;
2979 else
2980 vpanic("iselDblExpr(amd64): const");
2982 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2983 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2984 addInstr(env, AMD64Instr_SseLdSt(
2985 True/*load*/, 8, res,
2986 AMD64AMode_IR(0, hregAMD64_RSP())
2988 add_to_rsp(env, 8);
2989 return res;
2992 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2993 AMD64AMode* am;
2994 HReg res = newVRegV(env);
2995 vassert(e->Iex.Load.ty == Ity_F64);
2996 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2997 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2998 return res;
3001 if (e->tag == Iex_Get) {
3002 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
3003 hregAMD64_RBP() );
3004 HReg res = newVRegV(env);
3005 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
3006 return res;
3009 if (e->tag == Iex_GetI) {
3010 AMD64AMode* am
3011 = genGuestArrayOffset(
3012 env, e->Iex.GetI.descr,
3013 e->Iex.GetI.ix, e->Iex.GetI.bias );
3014 HReg res = newVRegV(env);
3015 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
3016 return res;
3019 if (e->tag == Iex_Triop) {
3020 IRTriop *triop = e->Iex.Triop.details;
3021 AMD64SseOp op = Asse_INVALID;
3022 switch (triop->op) {
3023 case Iop_AddF64: op = Asse_ADDF; break;
3024 case Iop_SubF64: op = Asse_SUBF; break;
3025 case Iop_MulF64: op = Asse_MULF; break;
3026 case Iop_DivF64: op = Asse_DIVF; break;
3027 default: break;
3029 if (op != Asse_INVALID) {
3030 HReg dst = newVRegV(env);
3031 HReg argL = iselDblExpr(env, triop->arg2);
3032 HReg argR = iselDblExpr(env, triop->arg3);
3033 addInstr(env, mk_vMOVsd_RR(argL, dst));
3034 /* XXXROUNDINGFIXME */
3035 /* set roundingmode here */
3036 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3037 return dst;
3041 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
3042 IRQop *qop = e->Iex.Qop.details;
3043 HReg dst = newVRegV(env);
3044 HReg argX = iselDblExpr(env, qop->arg2);
3045 HReg argY = iselDblExpr(env, qop->arg3);
3046 HReg argZ = iselDblExpr(env, qop->arg4);
3047 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
3048 vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
3049 if (dst.u32 != argX.u32)
3050 addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
3051 addInstr(env, AMD64Instr_Avx64FLo(Asse_VFMADD213, argY, argZ, dst));
3052 return dst;
3055 /* XXXROUNDINGFIXME */
3056 /* set roundingmode here */
3057 /* subq $32, %rsp -- make a space*/
3058 sub_from_rsp(env, 32);
3059 /* Prepare 4 arg regs:
3060 leaq 0(%rsp), %rdi
3061 leaq 8(%rsp), %rsi
3062 leaq 16(%rsp), %rdx
3063 leaq 24(%rsp), %rcx  */
3065 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
3066 hregAMD64_RDI()));
3067 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
3068 hregAMD64_RSI()));
3069 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
3070 hregAMD64_RDX()));
3071 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
3072 hregAMD64_RCX()));
3073 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
3074 movsd %argX, 0(%rsi)
3075 movsd %argY, 0(%rdx)
3076 movsd %argZ, 0(%rcx)  */
3078 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
3079 AMD64AMode_IR(0, hregAMD64_RSI())));
3080 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
3081 AMD64AMode_IR(0, hregAMD64_RDX())));
3082 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
3083 AMD64AMode_IR(0, hregAMD64_RCX())));
3085 /* call the helper, preferring the FMA4 variant and falling back to the
3086 generic one.  (The FMA3 case was handled above without a helper.) */
3087 #if defined(VGA_amd64)
3088 if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
3089 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
3090 (ULong)(HWord)h_amd64_calc_MAddF64_fma4,
3091 4, mk_RetLoc_simple(RLPri_None) ));
3092 }else
3093 #endif
3095 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
3096 (ULong)(HWord)h_generic_calc_MAddF64,
3097 4, mk_RetLoc_simple(RLPri_None) ));
3100 /* fetch the result from memory at 0(%rsp); %rsp itself is
3101 preserved across the helper call. */
3102 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
3103 AMD64AMode_IR(0, hregAMD64_RSP())));
3104 /* and finally, clear the space */
3105 add_to_rsp(env, 32);
3106 return dst;
3109 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
3110 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3111 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3112 HReg dst = newVRegV(env);
3114 /* 'arg' now holds the value to be rounded. The first thing to do
3115 is set the FPU's rounding mode accordingly. */
3117 /* Set host x87 rounding mode */
3118 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3120 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3121 addInstr(env, AMD64Instr_A87Free(1));
3122 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3123 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
3124 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3125 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3127 /* Restore default x87 rounding. */
3128 set_FPU_rounding_default( env );
3130 return dst;
3133 IRTriop *triop = e->Iex.Triop.details;
3134 if (e->tag == Iex_Triop
3135 && (triop->op == Iop_ScaleF64
3136 || triop->op == Iop_AtanF64
3137 || triop->op == Iop_Yl2xF64
3138 || triop->op == Iop_Yl2xp1F64
3139 || triop->op == Iop_PRemF64
3140 || triop->op == Iop_PRem1F64)
3142 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3143 HReg arg1 = iselDblExpr(env, triop->arg2);
3144 HReg arg2 = iselDblExpr(env, triop->arg3);
3145 HReg dst = newVRegV(env);
3146 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3147 || triop->op == Iop_PRemF64
3148 || triop->op == Iop_PRem1F64);
3149 addInstr(env, AMD64Instr_A87Free(2));
3151 /* one arg -> top of x87 stack */
3152 addInstr(env, AMD64Instr_SseLdSt(
3153 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3154 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3156 /* other arg -> top of x87 stack */
3157 addInstr(env, AMD64Instr_SseLdSt(
3158 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3159 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3161 /* do it */
3162 /* XXXROUNDINGFIXME */
3163 /* set roundingmode here */
3164 switch (triop->op) {
3165 case Iop_ScaleF64:
3166 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3167 break;
3168 case Iop_AtanF64:
3169 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3170 break;
3171 case Iop_Yl2xF64:
3172 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3173 break;
3174 case Iop_Yl2xp1F64:
3175 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3176 break;
3177 case Iop_PRemF64:
3178 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3179 break;
3180 case Iop_PRem1F64:
3181 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3182 break;
3183 default:
3184 vassert(0);
3187 /* save result */
3188 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3189 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3190 return dst;
3193 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3194 HReg dst = newVRegV(env);
3195 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3196 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3197 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3198 set_SSE_rounding_default( env );
3199 return dst;
3202 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3203 HReg dst = newVRegV(env);
3204 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3205 set_SSE_rounding_default( env );
3206 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3207 return dst;
3210 if (e->tag == Iex_Unop
3211 && (e->Iex.Unop.op == Iop_NegF64
3212 || e->Iex.Unop.op == Iop_AbsF64)) {
3213 /* Sigh ... very rough code. Could do much better. */
3214 /* Get the 128-bit literal 00---0 10---0 into a register
3215 and xor/nand it with the value to be negated. */
3216 HReg r1 = newVRegI(env);
3217 HReg dst = newVRegV(env);
3218 HReg tmp = newVRegV(env);
3219 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3220 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3221 addInstr(env, mk_vMOVsd_RR(src,tmp));
3222 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3223 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3224 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3225 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3227 if (e->Iex.Unop.op == Iop_NegF64)
3228 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3229 else
3230 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3232 add_to_rsp(env, 16);
3233 return dst;
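      /* Illustrative sketch, not part of the original code: with the mask
         1<<63 in 'dst', XOR flips the sign bit of the low lane (NegF64),
         while ANDN (dst = ~dst & src) clears it (AbsF64). */
# if 0
      { ULong bits = 0xC000000000000000ULL;                        /* -2.0 */
        vassert((bits ^ (1ULL << 63))  == 0x4000000000000000ULL);  /*  2.0 */
        vassert((bits & ~(1ULL << 63)) == 0x4000000000000000ULL);  /*  2.0 */
      }
# endif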
3236 if (e->tag == Iex_Binop) {
3237 A87FpOp fpop = Afp_INVALID;
3238 switch (e->Iex.Binop.op) {
3239 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3240 case Iop_SinF64: fpop = Afp_SIN; break;
3241 case Iop_CosF64: fpop = Afp_COS; break;
3242 case Iop_TanF64: fpop = Afp_TAN; break;
3243 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3244 default: break;
3246 if (fpop != Afp_INVALID) {
3247 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3248 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3249 HReg dst = newVRegV(env);
3250 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3251 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3252 addInstr(env, AMD64Instr_A87Free(nNeeded));
3253 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3254 /* XXXROUNDINGFIXME */
3255 /* set roundingmode here */
3256 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3257 codes. I don't think that matters, since this insn
3258 selector never generates such an instruction intervening
3259 between a flag-setting instruction and a flag-using
3260 instruction. */
3261 addInstr(env, AMD64Instr_A87FpOp(fpop));
3262 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3263 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3264 return dst;
3268 if (e->tag == Iex_Unop) {
3269 switch (e->Iex.Unop.op) {
3270 //.. case Iop_I32toF64: {
3271 //.. HReg dst = newVRegF(env);
3272 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3273 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3274 //.. set_FPU_rounding_default(env);
3275 //.. addInstr(env, X86Instr_FpLdStI(
3276 //.. True/*load*/, 4, dst,
3277 //.. X86AMode_IR(0, hregX86_ESP())));
3278 //.. add_to_esp(env, 4);
3279 //.. return dst;
3280 //.. }
3281 case Iop_ReinterpI64asF64: {
3282 /* Given an I64, produce an IEEE754 double with the same
3283 bit pattern. */
3284 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3285 HReg dst = newVRegV(env);
3286 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3287 /* paranoia */
3288 set_SSE_rounding_default(env);
3289 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3290 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3291 return dst;
3293 case Iop_F32toF64: {
3294 HReg f32;
3295 HReg f64 = newVRegV(env);
3296 /* this shouldn't be necessary, but be paranoid ... */
3297 set_SSE_rounding_default(env);
3298 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3299 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3300 return f64;
3302 default:
3303 break;
3307 /* --------- MULTIPLEX --------- */
3308 if (e->tag == Iex_ITE) { // VFD
3309 HReg r1, r0, dst;
3310 vassert(ty == Ity_F64);
3311 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3312 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3313 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3314 dst = newVRegV(env);
3315 addInstr(env, mk_vMOVsd_RR(r1,dst));
3316 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
3317 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3318 return dst;
3321 ppIRExpr(e);
3322 vpanic("iselDblExpr_wrk");
3326 /*---------------------------------------------------------*/
3327 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3328 /*---------------------------------------------------------*/
3330 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3332 HReg r = iselVecExpr_wrk( env, e );
3333 # if 0
3334 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3335 # endif
3336 vassert(hregClass(r) == HRcVec128);
3337 vassert(hregIsVirtual(r));
3338 return r;
3342 /* DO NOT CALL THIS DIRECTLY */
3343 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3345 HWord fn = 0; /* address of helper fn, if required */
3346 Bool arg1isEReg = False;
3347 AMD64SseOp op = Asse_INVALID;
3348 vassert(e);
3349 IRType ty = typeOfIRExpr(env->type_env, e);
3350 vassert(ty == Ity_V128);
3351 UInt laneBits = 0;
3353 if (e->tag == Iex_RdTmp) {
3354 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3357 if (e->tag == Iex_Get) {
3358 HReg dst = newVRegV(env);
3359 addInstr(env, AMD64Instr_SseLdSt(
3360 True/*load*/,
3362 dst,
3363 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3366 return dst;
3369 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3370 HReg dst = newVRegV(env);
3371 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3372 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3373 return dst;
3376 if (e->tag == Iex_Const) {
3377 HReg dst = newVRegV(env);
3378 vassert(e->Iex.Const.con->tag == Ico_V128);
3379 switch (e->Iex.Const.con->Ico.V128) {
3380 case 0x0000:
3381 dst = generate_zeroes_V128(env);
3382 break;
3383 case 0xFFFF:
3384 dst = generate_ones_V128(env);
3385 break;
3386 default: {
3387 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3388 /* do push_uimm64 twice, first time for the high-order half. */
3389 push_uimm64(env, bitmask8_to_bytemask64(
3390 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3392 push_uimm64(env, bitmask8_to_bytemask64(
3393 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3395 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3396 add_to_rsp(env, 16);
3397 break;
3400 return dst;
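      /* Hedged sketch of the expansion bitmask8_to_bytemask64 (defined
         earlier in this file) is expected to perform: bit i of the 8-bit
         mask becomes byte i of the result, either 0x00 or 0xFF.  This is an
         assumption about that helper, not a copy of it. */
# if 0
      { UShort mask8 = 0xA5;
        ULong  expanded = 0;
        UInt   i;
        for (i = 0; i < 8; i++)
           if (mask8 & (1 << i)) expanded |= 0xFFULL << (8*i);
        vassert(expanded == 0xFF00FF0000FF00FFULL);
      }
# endif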
3403 if (e->tag == Iex_Unop) {
3404 switch (e->Iex.Unop.op) {
3406 case Iop_NotV128: {
3407 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3408 return do_sse_NotV128(env, arg);
3411 case Iop_CmpNEZ64x2: {
3412 /* We can use SSE2 instructions for this. */
3413 /* Ideally, we want to do a 64Ix2 comparison against zero of
3414 the operand. Problem is no such insn exists. Solution
3415 therefore is to do a 32Ix4 comparison instead, and bitwise-
3416 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3417 let the not'd result of this initial comparison be a:b:c:d.
3418 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3419 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3420 giving the required result.
3422 The required selection sequence is 2,3,0,1, which
3423 according to Intel's documentation means the pshufd
3424 literal value is 0xB1, that is,
3425 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)  */
3427 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3428 HReg tmp = generate_zeroes_V128(env);
3429 HReg dst = newVRegV(env);
3430 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3431 tmp = do_sse_NotV128(env, tmp);
3432 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3433 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3434 return dst;
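         /* Per-lane reference for CmpNEZ64x2, illustrative only: this is
            what the CMPEQ32 / NOT / PSHUFD-0xB1 / OR sequence is intended
            to compute for each 64-bit lane. */
# if 0
         { ULong lane = 0x0000000100000000ULL;      /* nonzero 64-bit lane */
           ULong ref  = (lane != 0) ? ~0ULL : 0ULL;
           vassert(ref == ~0ULL);
         }
# endif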
3437 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3438 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3439 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3440 do_CmpNEZ_vector:
3442 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3443 HReg tmp = newVRegV(env);
3444 HReg zero = generate_zeroes_V128(env);
3445 HReg dst;
3446 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3447 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3448 dst = do_sse_NotV128(env, tmp);
3449 return dst;
3452 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3453 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3454 do_32Fx4_unary:
3456 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3457 HReg dst = newVRegV(env);
3458 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3459 return dst;
3462 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3463 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3464 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3465 do_32F0x4_unary:
3467 /* A bit subtle. We have to copy the arg to the result
3468 register first, because actually doing the SSE scalar insn
3469 leaves the upper 3/4 of the destination register
3470 unchanged. Whereas the required semantics of these
3471 primops is that the upper 3/4 is simply copied in from the
3472 argument. */
3473 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3474 HReg dst = newVRegV(env);
3475 addInstr(env, mk_vMOVsd_RR(arg, dst));
3476 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3477 return dst;
3480 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3481 do_64F0x2_unary:
3483 /* A bit subtle. We have to copy the arg to the result
3484 register first, because actually doing the SSE scalar insn
3485 leaves the upper half of the destination register
3486 unchanged. Whereas the required semantics of these
3487 primops is that the upper half is simply copied in from the
3488 argument. */
3489 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3490 HReg dst = newVRegV(env);
3491 addInstr(env, mk_vMOVsd_RR(arg, dst));
3492 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3493 return dst;
3496 case Iop_32UtoV128: {
3497 // FIXME maybe just use MOVQ here?
3498 HReg dst = newVRegV(env);
3499 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3500 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3501 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3502 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3503 return dst;
3506 case Iop_64UtoV128: {
3507 // FIXME maybe just use MOVQ here?
3508 HReg dst = newVRegV(env);
3509 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3510 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3511 addInstr(env, AMD64Instr_Push(rmi));
3512 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3513 add_to_rsp(env, 8);
3514 return dst;
3517 case Iop_V256toV128_0:
3518 case Iop_V256toV128_1: {
3519 HReg vHi, vLo;
3520 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3521 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3524 case Iop_F16toF32x4: {
3525 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3526 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3527 HReg dst = newVRegV(env);
3528 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3529 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3530 return dst;
3532 break;
3535 default:
3536 break;
3537 } /* switch (e->Iex.Unop.op) */
3538 } /* if (e->tag == Iex_Unop) */
3540 if (e->tag == Iex_Binop) {
3541 switch (e->Iex.Binop.op) {
3543 case Iop_Sqrt64Fx2:
3544 case Iop_Sqrt32Fx4: {
3545 /* :: (rmode, vec) -> vec */
3546 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3547 HReg dst = newVRegV(env);
3548 /* XXXROUNDINGFIXME */
3549 /* set roundingmode here */
3550 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3551 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3552 (Asse_SQRTF, arg, dst));
3553 return dst;
3556 /* FIXME: could we generate MOVQ here? */
3557 case Iop_SetV128lo64: {
3558 HReg dst = newVRegV(env);
3559 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3560 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3561 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3562 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3563 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3564 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3565 return dst;
3568 /* FIXME: could we generate MOVD here? */
3569 case Iop_SetV128lo32: {
3570 HReg dst = newVRegV(env);
3571 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3572 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3573 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3574 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3575 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3576 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3577 return dst;
3580 case Iop_64HLtoV128: {
3581 const IRExpr* arg1 = e->Iex.Binop.arg1;
3582 const IRExpr* arg2 = e->Iex.Binop.arg2;
3583 HReg dst = newVRegV(env);
3584 HReg tmp = newVRegV(env);
3585 HReg qHi = iselIntExpr_R(env, arg1);
3586 // If the args are trivially the same (tmp or const), use the same
3587 // source register for both, and only one movq since those are
3588 // (relatively) expensive.
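// Either way the net effect is dst = (arg1 << 64) | arg2: arg1 (qHi)
// ends up in bits 127:64 and arg2 (qLo) in bits 63:0.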
3589 if (areAtomsAndEqual(arg1, arg2)) {
3590 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3591 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3592 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3593 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3594 } else {
3595 HReg qLo = iselIntExpr_R(env, arg2);
3596 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3597 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3598 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3599 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3601 return dst;
3604 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3605 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3606 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3607 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3608 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3609 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3610 do_32Fx4:
3612 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3613 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3614 HReg dst = newVRegV(env);
3615 addInstr(env, mk_vMOVsd_RR(argL, dst));
3616 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3617 return dst;
3620 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3621 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3622 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3623 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3624 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3625 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3626 do_64Fx2:
3628 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3629 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3630 HReg dst = newVRegV(env);
3631 addInstr(env, mk_vMOVsd_RR(argL, dst));
3632 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3633 return dst;
3636 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3637 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3638 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3639 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3640 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3641 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3642 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3643 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3644 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3645 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3646 do_32F0x4: {
3647 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3648 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3649 HReg dst = newVRegV(env);
3650 addInstr(env, mk_vMOVsd_RR(argL, dst));
3651 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3652 return dst;
3655 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3656 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3657 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3658 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3659 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3660 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3661 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3662 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3663 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3664 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3665 do_64F0x2: {
3666 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3667 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3668 HReg dst = newVRegV(env);
3669 addInstr(env, mk_vMOVsd_RR(argL, dst));
3670 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3671 return dst;
3674 case Iop_PermOrZero8x16:
3675 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3676 op = Asse_PSHUFB;
3677 goto do_SseReRg;
3679 // Otherwise we'll have to generate a call to
3680 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3681 // host which doesn't have SSSE3, in which case we don't expect this
3682 // IROp to enter the compilation pipeline in the first place.
3683 break;
3685 case Iop_PwExtUSMulQAdd8x16:
3686 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3687 op = Asse_PMADDUBSW;
3688 goto do_SseReRg;
3690 break;
3692 case Iop_QNarrowBin32Sto16Sx8:
3693 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3694 case Iop_QNarrowBin16Sto8Sx16:
3695 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3696 case Iop_QNarrowBin16Sto8Ux16:
3697 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3699 case Iop_InterleaveHI8x16:
3700 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3701 case Iop_InterleaveHI16x8:
3702 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3703 case Iop_InterleaveHI32x4:
3704 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3705 case Iop_InterleaveHI64x2:
3706 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3708 case Iop_InterleaveLO8x16:
3709 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3710 case Iop_InterleaveLO16x8:
3711 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3712 case Iop_InterleaveLO32x4:
3713 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3714 case Iop_InterleaveLO64x2:
3715 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3717 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3718 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3719 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3720 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3721 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3722 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3723 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3724 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3725 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3726 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3727 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3728 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3729 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3730 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3731 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3732 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3733 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3734 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3735 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3736 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3737 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3738 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3739 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3740 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3741 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3742 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3743 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3744 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3745 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3746 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3747 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3748 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3749 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3750 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
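/* For the cases above that set arg1isEReg (the pack and unpack ops),
   the operands are presented to the two-operand SSE instruction in
   swapped order: arg2 is copied into dst and arg1 is supplied as the
   E (source) operand, so the instruction sees the operands in the
   order the IROp requires. */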
3751 do_SseReRg: {
3752 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3753 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3754 HReg dst = newVRegV(env);
3755 if (arg1isEReg) {
3756 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3757 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3758 } else {
3759 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3760 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3762 return dst;
3765 case Iop_ShlN8x16: laneBits = 8; op = Asse_SHL16; goto do_SseShift;
3766 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3767 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3768 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3769 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3770 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3771 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3772 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3773 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3774 do_SseShift: {
3775 HReg dst = newVRegV(env);
3776 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3777 /* If it's a shift by an in-range immediate, generate a single
3778 instruction (or, for 8-bit lanes, the short faked-up sequence below). */
3779 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3780 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3781 vassert(c->tag == Ico_U8);
3782 UInt shift = c->Ico.U8;
3783 if (shift < laneBits) {
3784 if (laneBits == 8) {
3785 /* This instruction doesn't exist so we need to fake it using
3786 Asse_SHL16 and Asse_SHR16.
3788 We'd like to shift every byte in the 16-byte register to
3789 the left by some amount.
3791 Instead, we will make a copy and shift all the 16-bit words
3792 to the *right* by 8 and then to the left by 8 plus the
3793 shift amount. That will get us the correct answer for the
3794 upper 8 bits of each 16-bit word and zero elsewhere.
3796 Then we will shift all the 16-bit words in the original to
3797 the left by 8 plus the shift amount and then to the right
3798 by 8. This will get the correct answer for the lower 8
3799 bits of each 16-bit word and zero elsewhere.
3801 Finally, we will OR those two results together.
3803 Since the shift amount is a known constant here, all of the
3804 16-bit shifts below are emitted as shift-by-immediate instructions.
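A worked example with shift = 1 and the 16-bit word 0xAABB: the copy
computes ((0xAABB >> 8) << 9) & 0xFFFF = 0x5400, i.e. the correctly
shifted 0x54 in the upper byte; the original computes
((0xAABB << 9) & 0xFFFF) >> 8 = 0x0076, i.e. 0x76 in the lower byte;
OR-ing the two gives 0x5476, the per-byte left-shift by 1 of 0xAA and 0xBB.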
3806 AMD64SseOp reverse_op = op;
3807 switch (op) {
3808 case Asse_SHL16:
3809 reverse_op = Asse_SHR16;
3810 break;
3811 default:
3812 vpanic("Iop_ShlN8x16");
3814 HReg hi = newVRegV(env);
3815 addInstr(env, mk_vMOVsd_RR(greg, hi));
3816 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
3817 addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, hi));
3818 addInstr(env, mk_vMOVsd_RR(greg, dst));
3819 addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, dst));
3820 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
3821 addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
3822 return dst;
3824 addInstr(env, mk_vMOVsd_RR(greg, dst));
3825 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3826 return dst;
3829 /* Otherwise we have to do it the longwinded way. */
3830 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3831 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3832 HReg ereg = newVRegV(env);
3833 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3834 addInstr(env, AMD64Instr_Push(rmi));
3835 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
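/* The two pushes put the shift amount in the qword at the new %rsp and
   zero in the qword above it, so the 16-byte load leaves ereg with the
   count in bits 63:0 (and zero above); the SSE shift-by-register forms
   read their count from those low 64 bits. */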
3836 if (laneBits == 8) {
3837 /* This instruction doesn't exist so we need to fake it, in the same
3838 way as above.
3840 AMD64SseOp reverse_op = op;
3841 switch (op) {
3842 case Asse_SHL16:
3843 reverse_op = Asse_SHR16;
3844 break;
3845 default:
3846 vpanic("Iop_ShlN8x16");
3848 HReg hi = newVRegV(env);
3849 addInstr(env, mk_vMOVsd_RR(greg, hi));
3850 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
3851 addInstr(env, AMD64Instr_SseShiftN(op, 8, hi));
3852 addInstr(env, AMD64Instr_SseReRg(op, ereg, hi));
3853 addInstr(env, mk_vMOVsd_RR(greg, dst));
3854 addInstr(env, AMD64Instr_SseShiftN(op, 8, dst));
3855 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3856 addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
3857 addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
3858 return dst;
3860 addInstr(env, mk_vMOVsd_RR(greg, dst));
3861 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3862 add_to_rsp(env, 16);
3863 return dst;
3866 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3867 goto do_SseAssistedBinary;
3868 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3869 goto do_SseAssistedBinary;
3870 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3871 goto do_SseAssistedBinary;
3872 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3873 goto do_SseAssistedBinary;
3874 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3875 goto do_SseAssistedBinary;
3876 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3877 goto do_SseAssistedBinary;
3878 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3879 goto do_SseAssistedBinary;
3880 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3881 goto do_SseAssistedBinary;
3882 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3883 goto do_SseAssistedBinary;
3884 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3885 goto do_SseAssistedBinary;
3886 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3887 goto do_SseAssistedBinary;
3888 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3889 goto do_SseAssistedBinary;
3890 case Iop_QNarrowBin32Sto16Ux8:
3891 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3892 goto do_SseAssistedBinary;
3893 case Iop_NarrowBin16to8x16:
3894 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3895 goto do_SseAssistedBinary;
3896 case Iop_NarrowBin32to16x8:
3897 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3898 goto do_SseAssistedBinary;
3899 do_SseAssistedBinary: {
3900 /* RRRufff! RRRufff code is what we're generating here. Oh
3901 well. */
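/* Layout of the scratch area: the result V128 lives at argp+0, argL at
   argp+16 and argR at argp+32. The three leaq's below pass those
   addresses to the helper in %rdi, %rsi and %rdx; the helper writes its
   result through the %rdi pointer, which is then reloaded into dst. */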
3902 vassert(fn != 0);
3903 HReg dst = newVRegV(env);
3904 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3905 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3906 HReg argp = newVRegI(env);
3907 /* subq $112, %rsp -- make a space*/
3908 sub_from_rsp(env, 112);
3909 /* leaq 48(%rsp), %r_argp -- point into it */
3910 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3911 argp));
3912 /* andq $-16, %r_argp -- 16-align the pointer */
3913 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3914 AMD64RMI_Imm( ~(UInt)15 ),
3915 argp));
3916 /* Prepare 3 arg regs:
3917 leaq 0(%r_argp), %rdi
3918 leaq 16(%r_argp), %rsi
3919 leaq 32(%r_argp), %rdx
3921 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3922 hregAMD64_RDI()));
3923 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3924 hregAMD64_RSI()));
3925 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3926 hregAMD64_RDX()));
3927 /* Store the two args, at (%rsi) and (%rdx):
3928 movupd %argL, 0(%rsi)
3929 movupd %argR, 0(%rdx)
3931 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3932 AMD64AMode_IR(0, hregAMD64_RSI())));
3933 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3934 AMD64AMode_IR(0, hregAMD64_RDX())));
3935 /* call the helper */
3936 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3937 3, mk_RetLoc_simple(RLPri_None) ));
3938 /* fetch the result from memory, using %r_argp, which the
3939 register allocator will keep alive across the call. */
3940 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3941 AMD64AMode_IR(0, argp)));
3942 /* and finally, clear the space */
3943 add_to_rsp(env, 112);
3944 return dst;
3947 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3948 goto do_SseAssistedVectorAndScalar;
3949 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3950 goto do_SseAssistedVectorAndScalar;
3951 do_SseAssistedVectorAndScalar: {
3952 /* RRRufff! RRRufff code is what we're generating here. Oh
3953 well. */
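/* Same scheme as do_SseAssistedBinary above, except that only the
   vector argument goes via memory (result at argp+0 in %rdi, argL at
   argp+16 in %rsi) while the scalar argument is passed directly
   in %rdx. */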
3954 vassert(fn != 0);
3955 HReg dst = newVRegV(env);
3956 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3957 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3958 HReg argp = newVRegI(env);
3959 /* subq $112, %rsp -- make a space*/
3960 sub_from_rsp(env, 112);
3961 /* leaq 48(%rsp), %r_argp -- point into it */
3962 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3963 argp));
3964 /* andq $-16, %r_argp -- 16-align the pointer */
3965 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3966 AMD64RMI_Imm( ~(UInt)15 ),
3967 argp));
3968 /* Prepare 2 vector arg regs:
3969 leaq 0(%r_argp), %rdi
3970 leaq 16(%r_argp), %rsi
3972 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3973 hregAMD64_RDI()));
3974 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3975 hregAMD64_RSI()));
3976 /* Store the vector arg, at (%rsi):
3977 movupd %argL, 0(%rsi)
3979 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3980 AMD64AMode_IR(0, hregAMD64_RSI())));
3981 /* And get the scalar value into rdx */
3982 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3984 /* call the helper */
3985 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3986 3, mk_RetLoc_simple(RLPri_None) ));
3987 /* fetch the result from memory, using %r_argp, which the
3988 register allocator will keep alive across the call. */
3989 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3990 AMD64AMode_IR(0, argp)));
3991 /* and finally, clear the space */
3992 add_to_rsp(env, 112);
3993 return dst;
3996 case Iop_I32StoF32x4:
3997 case Iop_F32toI32Sx4: {
3998 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3999 HReg dst = newVRegV(env);
4000 AMD64SseOp mop
4001 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
4002 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4003 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
4004 set_SSE_rounding_default(env);
4005 return dst;
4008 // Half-float vector conversion
4009 case Iop_F32toF16x8: {
4010 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4011 HReg srcHi, srcLo;
4012 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
4013 HReg dstHi = newVRegV(env);
4014 HReg dstLo = newVRegV(env);
4015 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
4016 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
4017 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
4018 set_SSE_rounding_default(env);
4019 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
4020 // need to compact all that into one register. There's probably a
4021 // more elegant way to do this, but ..
4022 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4023 // dstHi is now 127:64 = useful data, 63:0 = zero
4024 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4025 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
4026 // dstLo is now 127:64 = zero, 63:0 = useful data
4027 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
4028 return dstLo;
4030 break;
4033 default:
4034 break;
4035 } /* switch (e->Iex.Binop.op) */
4036 } /* if (e->tag == Iex_Binop) */
4038 if (e->tag == Iex_Triop) {
4039 IRTriop *triop = e->Iex.Triop.details;
4040 switch (triop->op) {
4042 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
4043 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
4044 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
4045 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
4046 do_64Fx2_w_rm:
4048 HReg argL = iselVecExpr(env, triop->arg2);
4049 HReg argR = iselVecExpr(env, triop->arg3);
4050 HReg dst = newVRegV(env);
4051 addInstr(env, mk_vMOVsd_RR(argL, dst));
4052 /* XXXROUNDINGFIXME */
4053 /* set roundingmode here */
4054 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
4055 return dst;
4058 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
4059 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
4060 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
4061 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
4062 do_32Fx4_w_rm:
4064 HReg argL = iselVecExpr(env, triop->arg2);
4065 HReg argR = iselVecExpr(env, triop->arg3);
4066 HReg dst = newVRegV(env);
4067 addInstr(env, mk_vMOVsd_RR(argL, dst));
4068 /* XXXROUNDINGFIXME */
4069 /* set roundingmode here */
4070 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
4071 return dst;
4074 default:
4075 break;
4076 } /* switch (triop->op) */
4077 } /* if (e->tag == Iex_Triop) */
4079 if (e->tag == Iex_ITE) { // VFD
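/* Copy the 'iftrue' value into dst first, then conditionally overwrite
   it with the 'iffalse' value: the condition is inverted (cc ^ 1) so
   the overwrite happens exactly when the guard is false. */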
4080 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
4081 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
4082 HReg dst = newVRegV(env);
4083 addInstr(env, mk_vMOVsd_RR(r1,dst));
4084 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
4085 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
4086 return dst;
4089 //vec_fail:
4090 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
4091 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4092 ppIRExpr(e);
4093 vpanic("iselVecExpr_wrk");
4097 /*---------------------------------------------------------*/
4098 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
4099 /*---------------------------------------------------------*/
4101 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4102 ISelEnv* env, const IRExpr* e )
4104 iselDVecExpr_wrk( rHi, rLo, env, e );
4105 # if 0
4106 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
4107 # endif
4108 vassert(hregClass(*rHi) == HRcVec128);
4109 vassert(hregClass(*rLo) == HRcVec128);
4110 vassert(hregIsVirtual(*rHi));
4111 vassert(hregIsVirtual(*rLo));
4115 /* DO NOT CALL THIS DIRECTLY */
4116 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4117 ISelEnv* env, const IRExpr* e )
4119 HWord fn = 0; /* address of helper fn, if required */
4120 vassert(e);
4121 IRType ty = typeOfIRExpr(env->type_env, e);
4122 vassert(ty == Ity_V256);
4123 UInt laneBits = 0;
4125 AMD64SseOp op = Asse_INVALID;
4127 /* read 256-bit IRTemp */
4128 if (e->tag == Iex_RdTmp) {
4129 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
4130 return;
4133 if (e->tag == Iex_Get) {
4134 HReg vHi = newVRegV(env);
4135 HReg vLo = newVRegV(env);
4136 HReg rbp = hregAMD64_RBP();
4137 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
4138 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
4139 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
4140 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
4141 *rHi = vHi;
4142 *rLo = vLo;
4143 return;
4146 if (e->tag == Iex_Load) {
4147 HReg vHi = newVRegV(env);
4148 HReg vLo = newVRegV(env);
4149 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
4150 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4151 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4152 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
4153 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
4154 *rHi = vHi;
4155 *rLo = vLo;
4156 return;
4159 if (e->tag == Iex_Const) {
4160 vassert(e->Iex.Const.con->tag == Ico_V256);
4161 switch (e->Iex.Const.con->Ico.V256) {
4162 case 0x00000000: {
4163 HReg vHi = generate_zeroes_V128(env);
4164 HReg vLo = newVRegV(env);
4165 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
4166 *rHi = vHi;
4167 *rLo = vLo;
4168 return;
4170 case 0xFFFFFFFF: {
4171 HReg vHi = generate_ones_V128(env);
4172 HReg vLo = newVRegV(env);
4173 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
4174 *rHi = vHi;
4175 *rLo = vLo;
4176 return;
4178 default:
4179 break; /* give up. Until such time as is necessary. */
4183 if (e->tag == Iex_Unop) {
4184 switch (e->Iex.Unop.op) {
4186 case Iop_NotV256: {
4187 HReg argHi, argLo;
4188 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4189 *rHi = do_sse_NotV128(env, argHi);
4190 *rLo = do_sse_NotV128(env, argLo);
4191 return;
4194 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
4195 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
4196 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
4197 do_32Fx8_unary:
4199 HReg argHi, argLo;
4200 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4201 HReg dstHi = newVRegV(env);
4202 HReg dstLo = newVRegV(env);
4203 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4204 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4205 *rHi = dstHi;
4206 *rLo = dstLo;
4207 return;
4210 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4211 do_64Fx4_unary:
4213 HReg argHi, argLo;
4214 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4215 HReg dstHi = newVRegV(env);
4216 HReg dstLo = newVRegV(env);
4217 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4218 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4219 *rHi = dstHi;
4220 *rLo = dstLo;
4221 return;
4224 case Iop_CmpNEZ64x4: {
4225 /* We can use SSE2 instructions for this. */
4226 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4227 (obviously). See comment on Iop_CmpNEZ64x2 for
4228 explanation of what's going on here. */
4229 HReg argHi, argLo;
4230 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4231 HReg tmpHi = generate_zeroes_V128(env);
4232 HReg tmpLo = newVRegV(env);
4233 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4234 HReg dstHi = newVRegV(env);
4235 HReg dstLo = newVRegV(env);
4236 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4237 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4238 tmpHi = do_sse_NotV128(env, tmpHi);
4239 tmpLo = do_sse_NotV128(env, tmpLo);
4240 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4241 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4242 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4243 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4244 *rHi = dstHi;
4245 *rLo = dstLo;
4246 return;
4249 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4250 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4251 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4252 do_CmpNEZ_vector:
4254 HReg argHi, argLo;
4255 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4256 HReg tmpHi = newVRegV(env);
4257 HReg tmpLo = newVRegV(env);
4258 HReg zero = generate_zeroes_V128(env);
4259 HReg dstHi, dstLo;
4260 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4261 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4262 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4263 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4264 dstHi = do_sse_NotV128(env, tmpHi);
4265 dstLo = do_sse_NotV128(env, tmpLo);
4266 *rHi = dstHi;
4267 *rLo = dstLo;
4268 return;
4271 case Iop_F16toF32x8: {
4272 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4273 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4274 HReg srcCopy = newVRegV(env);
4275 HReg dstHi = newVRegV(env);
4276 HReg dstLo = newVRegV(env);
4277 // Copy src, since we'll need to modify it.
4278 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4279 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4280 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4281 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4282 *rHi = dstHi;
4283 *rLo = dstLo;
4284 return;
4286 break;
4289 default:
4290 break;
4291 } /* switch (e->Iex.Unop.op) */
4292 } /* if (e->tag == Iex_Unop) */
4294 if (e->tag == Iex_Binop) {
4295 switch (e->Iex.Binop.op) {
4297 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4298 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4299 do_64Fx4:
4301 HReg argLhi, argLlo, argRhi, argRlo;
4302 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4303 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4304 HReg dstHi = newVRegV(env);
4305 HReg dstLo = newVRegV(env);
4306 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4307 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4308 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4309 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4310 *rHi = dstHi;
4311 *rLo = dstLo;
4312 return;
4315 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4316 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4317 do_32Fx8:
4319 HReg argLhi, argLlo, argRhi, argRlo;
4320 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4321 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4322 HReg dstHi = newVRegV(env);
4323 HReg dstLo = newVRegV(env);
4324 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4325 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4326 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4327 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4328 *rHi = dstHi;
4329 *rLo = dstLo;
4330 return;
4333 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4334 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4335 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4336 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4337 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4338 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4339 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4340 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4341 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4342 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4343 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4344 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4345 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4346 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4347 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4348 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4349 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4350 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4351 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4352 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4353 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4354 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4355 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4356 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4357 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4358 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4359 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4360 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4361 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4362 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4363 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4364 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4365 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4366 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4367 do_SseReRg:
4369 HReg argLhi, argLlo, argRhi, argRlo;
4370 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4371 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4372 HReg dstHi = newVRegV(env);
4373 HReg dstLo = newVRegV(env);
4374 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4375 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4376 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4377 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4378 *rHi = dstHi;
4379 *rLo = dstLo;
4380 return;
4383 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4384 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4385 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4386 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4387 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4388 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4389 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4390 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4391 do_SseShift: {
4392 HReg dstHi = newVRegV(env);
4393 HReg dstLo = newVRegV(env);
4394 HReg gregHi, gregLo;
4395 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4396 /* If it's a shift by an in-range immediate, generate a single
4397 shift instruction for each 128-bit half. */
4398 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4399 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4400 vassert(c->tag == Ico_U8);
4401 UInt shift = c->Ico.U8;
4402 if (shift < laneBits) {
4403 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4404 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4405 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4406 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4407 *rHi = dstHi;
4408 *rLo = dstLo;
4409 return;
4412 /* Otherwise we have to do it the longwinded way. */
4413 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4414 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4415 HReg ereg = newVRegV(env);
4416 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4417 addInstr(env, AMD64Instr_Push(rmi));
4418 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4419 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4420 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4421 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4422 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4423 add_to_rsp(env, 16);
4424 *rHi = dstHi;
4425 *rLo = dstLo;
4426 return;
4429 case Iop_V128HLtoV256: {
4430 // Curiously, there doesn't seem to be any benefit to be had here by
4431 // checking whether arg1 and arg2 are the same, in the style of how
4432 // (eg) 64HLtoV128 is handled elsewhere in this file.
4433 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4434 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4435 return;
4438 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4439 goto do_SseAssistedBinary;
4440 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4441 goto do_SseAssistedBinary;
4442 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4443 goto do_SseAssistedBinary;
4444 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4445 goto do_SseAssistedBinary;
4446 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4447 goto do_SseAssistedBinary;
4448 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4449 goto do_SseAssistedBinary;
4450 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4451 goto do_SseAssistedBinary;
4452 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4453 goto do_SseAssistedBinary;
4454 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4455 goto do_SseAssistedBinary;
4456 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4457 goto do_SseAssistedBinary;
4458 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4459 goto do_SseAssistedBinary;
4460 do_SseAssistedBinary: {
4461 /* RRRufff! RRRufff code is what we're generating here. Oh
4462 well. */
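/* The (128-bit) helper is called twice, once per half: the first call
   uses argp+0/argp+16/argp+32 as result/argL/argR for the high halves,
   the second uses argp+48/argp+64/argp+80 for the low halves. */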
4463 vassert(fn != 0);
4464 HReg dstHi = newVRegV(env);
4465 HReg dstLo = newVRegV(env);
4466 HReg argLhi, argLlo, argRhi, argRlo;
4467 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4468 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4469 HReg argp = newVRegI(env);
4470 /* subq $160, %rsp -- make a space*/
4471 sub_from_rsp(env, 160);
4472 /* leaq 48(%rsp), %r_argp -- point into it */
4473 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4474 argp));
4475 /* andq $-16, %r_argp -- 16-align the pointer */
4476 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4477 AMD64RMI_Imm( ~(UInt)15 ),
4478 argp));
4479 /* Prepare 3 arg regs:
4480 leaq 0(%r_argp), %rdi
4481 leaq 16(%r_argp), %rsi
4482 leaq 32(%r_argp), %rdx
4484 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4485 hregAMD64_RDI()));
4486 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4487 hregAMD64_RSI()));
4488 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4489 hregAMD64_RDX()));
4490 /* Store the two high args, at (%rsi) and (%rdx):
4491 movupd %argLhi, 0(%rsi)
4492 movupd %argRhi, 0(%rdx)
4494 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4495 AMD64AMode_IR(0, hregAMD64_RSI())));
4496 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4497 AMD64AMode_IR(0, hregAMD64_RDX())));
4498 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4499 movupd %argLlo, 48(%rsi)
4500 movupd %argRlo, 48(%rdx)
4502 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4503 AMD64AMode_IR(48, hregAMD64_RSI())));
4504 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4505 AMD64AMode_IR(48, hregAMD64_RDX())));
4506 /* call the helper */
4507 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4508 mk_RetLoc_simple(RLPri_None) ));
4509 /* Prepare 3 arg regs:
4510 leaq 48(%r_argp), %rdi
4511 leaq 64(%r_argp), %rsi
4512 leaq 80(%r_argp), %rdx
4514 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4515 hregAMD64_RDI()));
4516 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4517 hregAMD64_RSI()));
4518 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4519 hregAMD64_RDX()));
4520 /* call the helper */
4521 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4522 mk_RetLoc_simple(RLPri_None) ));
4523 /* fetch the result from memory, using %r_argp, which the
4524 register allocator will keep alive across the call. */
4525 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4526 AMD64AMode_IR(0, argp)));
4527 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4528 AMD64AMode_IR(48, argp)));
4529 /* and finally, clear the space */
4530 add_to_rsp(env, 160);
4531 *rHi = dstHi;
4532 *rLo = dstLo;
4533 return;
4536 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4537 goto do_SseAssistedBinary256;
4538 do_SseAssistedBinary256: {
4539 /* RRRufff! RRRufff code is what we're generating here. Oh
4540 well. */
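/* Unlike do_SseAssistedBinary above, the helper here takes whole
   256-bit values: the result occupies argp+0..31, argL argp+32..63 and
   argR argp+64..95, each stored and reloaded as two 128-bit halves
   (low half first). */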
4541 vassert(fn != 0);
4542 HReg dstHi = newVRegV(env);
4543 HReg dstLo = newVRegV(env);
4544 HReg argLhi, argLlo, argRhi, argRlo;
4545 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4546 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4547 HReg argp = newVRegI(env);
4548 /* subq $160, %rsp -- make a space*/
4549 sub_from_rsp(env, 160);
4550 /* leaq 48(%rsp), %r_argp -- point into it */
4551 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4552 argp));
4553 /* andq $-16, %r_argp -- 16-align the pointer */
4554 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4555 AMD64RMI_Imm( ~(UInt)15 ),
4556 argp));
4557 /* Prepare 3 arg regs:
4558 leaq 0(%r_argp), %rdi
4559 leaq 32(%r_argp), %rsi
4560 leaq 64(%r_argp), %rdx
4562 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4563 hregAMD64_RDI()));
4564 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4565 hregAMD64_RSI()));
4566 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4567 hregAMD64_RDX()));
4568 /* Store the two 256-bit args, each as two 128-bit halves, at (%rsi) and (%rdx):
4569 movupd %argLlo, 0(%rsi)
4570 movupd %argLhi, 16(%rsi)
4571 movupd %argRlo, 0(%rdx)
4572 movupd %argRhi, 16(%rdx)
4574 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4575 AMD64AMode_IR(0, hregAMD64_RSI())));
4576 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4577 AMD64AMode_IR(16, hregAMD64_RSI())));
4578 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4579 AMD64AMode_IR(0, hregAMD64_RDX())));
4580 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4581 AMD64AMode_IR(16, hregAMD64_RDX())));
4582 /* call the helper */
4583 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4584 mk_RetLoc_simple(RLPri_None) ));
4585 /* fetch the result from memory, using %r_argp, which the
4586 register allocator will keep alive across the call. */
4587 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4588 AMD64AMode_IR(0, argp)));
4589 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4590 AMD64AMode_IR(16, argp)));
4591 /* and finally, clear the space */
4592 add_to_rsp(env, 160);
4593 *rHi = dstHi;
4594 *rLo = dstLo;
4595 return;
4598 case Iop_I32StoF32x8:
4599 case Iop_F32toI32Sx8: {
4600 HReg argHi, argLo;
4601 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4602 HReg dstHi = newVRegV(env);
4603 HReg dstLo = newVRegV(env);
4604 AMD64SseOp mop
4605 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4606 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4607 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4608 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4609 set_SSE_rounding_default(env);
4610 *rHi = dstHi;
4611 *rLo = dstLo;
4612 return;
4615 default:
4616 break;
4617 } /* switch (e->Iex.Binop.op) */
4618 } /* if (e->tag == Iex_Binop) */
4620 if (e->tag == Iex_Triop) {
4621 IRTriop *triop = e->Iex.Triop.details;
4622 switch (triop->op) {
4624 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4625 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4626 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4627 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4628 do_64Fx4_w_rm:
4630 HReg argLhi, argLlo, argRhi, argRlo;
4631 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4632 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4633 HReg dstHi = newVRegV(env);
4634 HReg dstLo = newVRegV(env);
4635 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4636 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4637 /* XXXROUNDINGFIXME */
4638 /* set roundingmode here */
4639 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4640 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4641 *rHi = dstHi;
4642 *rLo = dstLo;
4643 return;
4646 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4647 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4648 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4649 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4650 do_32Fx8_w_rm:
4652 HReg argLhi, argLlo, argRhi, argRlo;
4653 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4654 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4655 HReg dstHi = newVRegV(env);
4656 HReg dstLo = newVRegV(env);
4657 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4658 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4659 /* XXXROUNDINGFIXME */
4660 /* set roundingmode here */
4661 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4662 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4663 *rHi = dstHi;
4664 *rLo = dstLo;
4665 return;
4668 default:
4669 break;
4670 } /* switch (triop->op) */
4671 } /* if (e->tag == Iex_Triop) */
4674 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4675 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4676 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4677 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4678 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4679 // If the args are trivially the same (tmp or const), use the same
4680 // source register for all four, and only one movq since those are
4681 // (relatively) expensive.
4682 if (areAtomsAndEqual(arg1, arg2)
4683 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4684 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4685 HReg tmp = newVRegV(env);
4686 HReg dst = newVRegV(env);
4687 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4688 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4689 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4690 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4691 *rHi = dst;
4692 *rLo = dst;
4693 } else {
4694 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
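/* The two sequences below compute dstHi = (q3 << 64) | q2 and
   dstLo = (q1 << 64) | q0, using the same movq/shift/or idiom as
   Iop_64HLtoV128 earlier in this file. */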
4695 HReg q3 = iselIntExpr_R(env, arg1);
4696 HReg q2 = iselIntExpr_R(env, arg2);
4697 HReg q1 = iselIntExpr_R(env, arg3);
4698 HReg q0 = iselIntExpr_R(env, arg4);
4699 HReg tmp = newVRegV(env);
4700 HReg dstHi = newVRegV(env);
4701 HReg dstLo = newVRegV(env);
4702 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4703 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4704 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4705 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4706 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4707 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4708 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4709 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4710 *rHi = dstHi;
4711 *rLo = dstLo;
4713 return;
4716 if (e->tag == Iex_ITE) {
4717 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4718 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4719 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4720 HReg dstHi = newVRegV(env);
4721 HReg dstLo = newVRegV(env);
4722 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4723 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4724 AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
4725 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4726 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4727 *rHi = dstHi;
4728 *rLo = dstLo;
4729 return;
4732 //avx_fail:
4733 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4734 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4735 ppIRExpr(e);
4736 vpanic("iselDVecExpr_wrk");
4740 /*---------------------------------------------------------*/
4741 /*--- ISEL: Statements ---*/
4742 /*---------------------------------------------------------*/
4744 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4746 if (vex_traceflags & VEX_TRACE_VCODE) {
4747 vex_printf("\n-- ");
4748 ppIRStmt(stmt);
4749 vex_printf("\n");
4752 switch (stmt->tag) {
4754 /* --------- LOADG (guarded load) --------- */
4755 case Ist_LoadG: {
4756 IRLoadG* lg = stmt->Ist.LoadG.details;
4757 if (lg->end != Iend_LE)
4758 goto stmt_fail;
4760 UChar szB = 0; /* invalid */
4761 switch (lg->cvt) {
4762 case ILGop_Ident32: szB = 4; break;
4763 case ILGop_Ident64: szB = 8; break;
4764 case ILGop_IdentV128: szB = 16; break;
4765 default: break;
4767 if (szB == 0)
4768 goto stmt_fail;
4770 AMD64AMode* amAddr
4771 = iselIntExpr_AMode(env, lg->addr);
4772 HReg rAlt
4773 = szB == 16 ? iselVecExpr(env, lg->alt)
4774 : iselIntExpr_R(env, lg->alt);
4775 HReg rDst
4776 = lookupIRTemp(env, lg->dst);
4778 /* Get the alt value into the dst. We'll do a conditional load
4779 which overwrites it -- or not -- with loaded data. */
4780 if (szB == 16) {
4781 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4782 } else {
4783 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4785 AMD64CondCode cc = iselCondCode_C(env, lg->guard);
4786 if (szB == 16) {
4787 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4788 } else {
4789 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4791 return;
4794 /* --------- STOREG (guarded store) --------- */
4795 case Ist_StoreG: {
4796 IRStoreG* sg = stmt->Ist.StoreG.details;
4797 if (sg->end != Iend_LE)
4798 goto stmt_fail;
4800 UChar szB = 0; /* invalid */
4801 switch (typeOfIRExpr(env->type_env, sg->data)) {
4802 case Ity_I32: szB = 4; break;
4803 case Ity_I64: szB = 8; break;
4804 case Ity_V128: szB = 16; break;
4805 default: break;
4807 if (szB == 0)
4808 goto stmt_fail;
4810 AMD64AMode* amAddr
4811 = iselIntExpr_AMode(env, sg->addr);
4812 HReg rSrc
4813 = szB == 16 ? iselVecExpr(env, sg->data)
4814 : iselIntExpr_R(env, sg->data);
4815 AMD64CondCode cc
4816 = iselCondCode_C(env, sg->guard);
4817 if (szB == 16) {
4818 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4819 } else {
4820 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4822 return;
4825 /* --------- STORE --------- */
4826 case Ist_Store: {
4827 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4828 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4829 IREndness end = stmt->Ist.Store.end;
4831 if (tya != Ity_I64 || end != Iend_LE)
4832 goto stmt_fail;
4834 if (tyd == Ity_I64) {
4835 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4836 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4837 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4838 return;
4840 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4841 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4842 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4843 addInstr(env, AMD64Instr_Store(
4844 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4845 r,am));
4846 return;
4848 if (tyd == Ity_F64) {
4849 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4850 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4851 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4852 return;
4854 if (tyd == Ity_F32) {
4855 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4856 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4857 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4858 return;
4860 if (tyd == Ity_V128) {
4861 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4862 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4863 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4864 return;
4866 if (tyd == Ity_V256) {
4867 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4868 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4869 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4870 HReg vHi, vLo;
4871 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4872 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4873 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4874 return;
4876 break;
4879 /* --------- PUT --------- */
4880 case Ist_Put: {
4881 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4882 if (ty == Ity_I64) {
4883 /* We're going to write to memory, so compute the RHS into an
4884 AMD64RI. */
4885 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4886 addInstr(env,
4887 AMD64Instr_Alu64M(
4888 Aalu_MOV,
4889 ri,
4890 AMD64AMode_IR(stmt->Ist.Put.offset,
4891 hregAMD64_RBP())
4892 ));
4893 return;
4895 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4896 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4897 addInstr(env, AMD64Instr_Store(
4898 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4899 r,
4900 AMD64AMode_IR(stmt->Ist.Put.offset,
4901 hregAMD64_RBP())));
4902 return;
4904 if (ty == Ity_F32) {
4905 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4906 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4907 set_SSE_rounding_default(env); /* paranoia */
4908 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4909 return;
4911 if (ty == Ity_F64) {
4912 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4913 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4914 hregAMD64_RBP() );
4915 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4916 return;
4918 if (ty == Ity_V128) {
4919 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4920 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4921 hregAMD64_RBP());
4922 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4923 return;
4925 if (ty == Ity_V256) {
4926 HReg vHi, vLo;
4927 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4928 HReg rbp = hregAMD64_RBP();
4929 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4930 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4931 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4932 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4933 return;
4935 break;
4938 /* --------- Indexed PUT --------- */
4939 case Ist_PutI: {
4940 IRPutI *puti = stmt->Ist.PutI.details;
4942 AMD64AMode* am
4943 = genGuestArrayOffset(
4944 env, puti->descr,
4945 puti->ix, puti->bias );
4947 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4948 if (ty == Ity_F64) {
4949 HReg val = iselDblExpr(env, puti->data);
4950 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4951 return;
4953 if (ty == Ity_I8) {
4954 HReg r = iselIntExpr_R(env, puti->data);
4955 addInstr(env, AMD64Instr_Store( 1, r, am ));
4956 return;
4958 if (ty == Ity_I64) {
4959 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4960 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4961 return;
4963 break;
4966 /* --------- TMP --------- */
4967 case Ist_WrTmp: {
4968 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4969 IRType ty = typeOfIRTemp(env->type_env, tmp);
4971 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4972 compute it into an AMode and then use LEA. This usually
4973 produces fewer instructions, often because (for memcheck
4974 created IR) we get t = address-expression, (t is later used
4975 twice) and so doing this naturally turns address-expression
4976 back into an AMD64 amode. */
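/* For example (illustrative temps): t3 = Add64(t1, 0x10:I64) becomes a
   single 'leaq 16(%r_t1), %r_t3' instead of a move followed by an add. */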
4977 if (ty == Ity_I64
4978 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4979 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4980 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4981 HReg dst = lookupIRTemp(env, tmp);
4982 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4983 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4984 value into a register. Just emit a normal reg-reg move
4985 so reg-alloc can coalesce it away in the usual way. */
4986 HReg src = am->Aam.IR.reg;
4987 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4988 } else {
4989 addInstr(env, AMD64Instr_Lea64(am,dst));
4991 return;
4994 if (ty == Ity_I64 || ty == Ity_I32
4995 || ty == Ity_I16 || ty == Ity_I8) {
4996 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4997 HReg dst = lookupIRTemp(env, tmp);
4998 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4999 return;
5001 if (ty == Ity_I128) {
5002 HReg rHi, rLo, dstHi, dstLo;
5003 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
5004 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
5005 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
5006 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
5007 return;
5008 }
5009 if (ty == Ity_I1) {
5010 AMD64CondCode cond = iselCondCode_C(env, stmt->Ist.WrTmp.data);
5011 HReg dst = lookupIRTemp(env, tmp);
5012 addInstr(env, AMD64Instr_Set64(cond, dst));
5013 return;
5014 }
5015 if (ty == Ity_F64) {
5016 HReg dst = lookupIRTemp(env, tmp);
5017 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
5018 addInstr(env, mk_vMOVsd_RR(src, dst));
5019 return;
5020 }
5021 if (ty == Ity_F32) {
5022 HReg dst = lookupIRTemp(env, tmp);
5023 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
5024 addInstr(env, mk_vMOVsd_RR(src, dst));
5025 return;
5026 }
5027 if (ty == Ity_V128) {
5028 HReg dst = lookupIRTemp(env, tmp);
5029 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
5030 addInstr(env, mk_vMOVsd_RR(src, dst));
5031 return;
5032 }
5033 if (ty == Ity_V256) {
5034 HReg rHi, rLo, dstHi, dstLo;
5035 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
5036 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
5037 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
5038 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
5039 return;
5040 }
5041 break;
5042 }
5044 /* --------- Call to DIRTY helper --------- */
5045 case Ist_Dirty: {
5046 IRDirty* d = stmt->Ist.Dirty.details;
5048 /* Figure out the return type, if any. */
5049 IRType retty = Ity_INVALID;
5050 if (d->tmp != IRTemp_INVALID)
5051 retty = typeOfIRTemp(env->type_env, d->tmp);
5053 /* Throw out any return types we don't know about. */
5054 Bool retty_ok = False;
5055 switch (retty) {
5056 case Ity_INVALID: /* function doesn't return anything */
5057 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
5058 case Ity_V128: case Ity_V256:
5059 retty_ok = True; break;
5060 default:
5061 break;
5062 }
5063 if (!retty_ok)
5064 break; /* will go to stmt_fail: */
5066 /* Marshal args, do the call, and set the return value to
5067 0x555..555 if this is a conditional call that returns a value
5068 and the call is skipped. */
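/* In outline: doHelperCall marshals the arguments, emits the
   (possibly guarded) call, and reports back via rloc where the
   result was left -- in %rax for integer returns, or in an
   %rsp-relative slot for V128/V256 returns -- and via addToSp how
   many bytes of stack must be popped once the result has been
   fetched.  The cases below act on that information. */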
5069 UInt addToSp = 0;
5070 RetLoc rloc = mk_RetLoc_INVALID();
5071 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
5072 vassert(is_sane_RetLoc(rloc));
5074 /* Now figure out what to do with the returned value, if any. */
5075 switch (retty) {
5076 case Ity_INVALID: {
5077 /* No return value. Nothing to do. */
5078 vassert(d->tmp == IRTemp_INVALID);
5079 vassert(rloc.pri == RLPri_None);
5080 vassert(addToSp == 0);
5081 return;
5082 }
5083 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
5084 /* The returned value is in %rax. Park it in the register
5085 associated with tmp. */
5086 vassert(rloc.pri == RLPri_Int);
5087 vassert(addToSp == 0);
5088 HReg dst = lookupIRTemp(env, d->tmp);
5089 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
5090 return;
5091 }
5092 case Ity_V128: {
5093 /* The returned value is on the stack, and rloc.spOff
5094 tells us where. Fish it off the stack and then move
5095 the stack pointer upwards to clear it, as directed by
5096 doHelperCall. */
5097 vassert(rloc.pri == RLPri_V128SpRel);
5098 vassert(addToSp >= 16);
5099 HReg dst = lookupIRTemp(env, d->tmp);
5100 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
5101 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
5102 add_to_rsp(env, addToSp);
5103 return;
5104 }
5105 case Ity_V256: {
5106 /* See comments for Ity_V128. */
5107 vassert(rloc.pri == RLPri_V256SpRel);
5108 vassert(addToSp >= 32);
5109 HReg dstLo, dstHi;
5110 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
5111 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
5112 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
5113 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
5114 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
5115 add_to_rsp(env, addToSp);
5116 return;
5117 }
5118 default:
5119 /*NOTREACHED*/
5120 vassert(0);
5121 }
5122 break;
5123 }
5125 /* --------- MEM FENCE --------- */
5126 case Ist_MBE:
5127 switch (stmt->Ist.MBE.event) {
5128 case Imbe_Fence:
5129 addInstr(env, AMD64Instr_MFence());
5130 return;
5131 default:
5132 break;
5133 }
5134 break;
5136 /* --------- ACAS --------- */
5137 case Ist_CAS:
5138 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
5139 /* "normal" singleton CAS */
5140 UChar sz;
5141 IRCAS* cas = stmt->Ist.CAS.details;
5142 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
5143 /* get: cas->expd into %rax, and cas->data into %rbx */
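/* Sketch of the expected sequence, assuming ACAS turns into a
   lock-prefixed cmpxchg: rOld is preloaded with the expected value,
   then "lock cmpxchg{b,w,l,q} %bl/%bx/%ebx/%rbx, (am)" compares
   %rax with memory and, on success, stores %rbx there.  On failure
   (ZF clear) the actual memory value is left in %rax, and the
   Acc_NZ cmov below copies it into rOld. */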
5144 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
5145 HReg rData = iselIntExpr_R(env, cas->dataLo);
5146 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
5147 HReg rOld = lookupIRTemp(env, cas->oldLo);
5148 vassert(cas->expdHi == NULL);
5149 vassert(cas->dataHi == NULL);
5150 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
5151 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
5152 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
5153 switch (ty) {
5154 case Ity_I64: sz = 8; break;
5155 case Ity_I32: sz = 4; break;
5156 case Ity_I16: sz = 2; break;
5157 case Ity_I8: sz = 1; break;
5158 default: goto unhandled_cas;
5159 }
5160 addInstr(env, AMD64Instr_ACAS(am, sz));
5161 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
5162 return;
5163 } else {
5164 /* double CAS */
5165 UChar sz;
5166 IRCAS* cas = stmt->Ist.CAS.details;
5167 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
5168 /* only 32-bit and 64-bit allowed in this case */
5169 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
5170 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
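/* Sketch, assuming DACAS turns into lock cmpxchg8b/16b: the
   instruction compares %rdx:%rax (the expected pair) with memory
   and, if equal, stores %rcx:%rbx (the new data); otherwise it
   loads the actual pair into %rdx:%rax, which the Acc_NZ cmovs
   below copy into rOldHi:rOldLo.  cmpxchg16b needs the CX16
   capability, hence the hwcaps check for the 64-bit case. */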
5171 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
5172 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
5173 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
5174 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
5175 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
5176 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
5177 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
5178 switch (ty) {
5179 case Ity_I64:
5180 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
5181 goto unhandled_cas; /* we'd have to generate
5182 cmpxchg16b, but the host
5183 doesn't support that */
5184 sz = 8;
5185 break;
5186 case Ity_I32:
5187 sz = 4;
5188 break;
5189 default:
5190 goto unhandled_cas;
5191 }
5192 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
5193 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
5194 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
5195 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
5196 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
5197 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
5198 addInstr(env, AMD64Instr_DACAS(am, sz));
5199 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5200 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5201 return;
5202 }
5203 unhandled_cas:
5204 break;
5206 /* --------- INSTR MARK --------- */
5207 /* Doesn't generate any executable code ... */
5208 case Ist_IMark:
5209 return;
5211 /* --------- ABI HINT --------- */
5212 /* These have no meaning (denotation in the IR) and so we ignore
5213 them ... if any actually made it this far. */
5214 case Ist_AbiHint:
5215 return;
5217 /* --------- NO-OP --------- */
5218 case Ist_NoOp:
5219 return;
5221 /* --------- EXIT --------- */
5222 case Ist_Exit: {
5223 if (stmt->Ist.Exit.dst->tag != Ico_U64)
5224 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5226 AMD64CondCode cc = iselCondCode_C(env, stmt->Ist.Exit.guard);
5227 AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
5228 hregAMD64_RBP());
5230 /* Case: boring transfer to known address */
5231 if (stmt->Ist.Exit.jk == Ijk_Boring) {
5232 if (env->chainingAllowed) {
5233 /* .. almost always true .. */
5234 /* Skip the event check at the dst if this is a forwards
5235 edge. */
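/* "Forwards" here means the destination lies beyond this
   superblock's highest guest address (env->max_ga).  The idea is
   that any cycle in guest control flow must contain at least one
   non-forwards edge, which goes via the slow entry point and so
   still performs an event check; forwards edges can therefore
   safely chain to the fast entry point and skip it. */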
5236 Bool toFastEP
5237 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
5238 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
5239 addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
5240 amRIP, cc, toFastEP));
5241 } else {
5242 /* .. very occasionally .. */
5243 /* We can't use chaining, so ask for an assisted transfer,
5244 as that's the only alternative that is allowable. */
5245 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5246 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
5247 }
5248 return;
5249 }
5251 /* Case: assisted transfer to arbitrary address */
5252 switch (stmt->Ist.Exit.jk) {
5253 /* Keep this list in sync with that in iselNext below */
5254 case Ijk_ClientReq:
5255 case Ijk_EmWarn:
5256 case Ijk_NoDecode:
5257 case Ijk_NoRedir:
5258 case Ijk_SigSEGV:
5259 case Ijk_SigBUS:
5260 case Ijk_SigTRAP:
5261 case Ijk_Sys_syscall:
5262 case Ijk_Sys_int210:
5263 case Ijk_InvalICache:
5264 case Ijk_Yield:
5265 {
5266 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5267 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
5268 return;
5269 }
5270 default:
5271 break;
5272 }
5274 /* Do we ever expect to see any other kind? */
5275 goto stmt_fail;
5276 }
5278 default: break;
5279 }
5280 stmt_fail:
5281 ppIRStmt(stmt);
5282 vpanic("iselStmt(amd64)");
5283 }
5286 /*---------------------------------------------------------*/
5287 /*--- ISEL: Basic block terminators (Nexts) ---*/
5288 /*---------------------------------------------------------*/
5290 static void iselNext ( ISelEnv* env,
5291 IRExpr* next, IRJumpKind jk, Int offsIP )
5292 {
5293 if (vex_traceflags & VEX_TRACE_VCODE) {
5294 vex_printf( "\n-- PUT(%d) = ", offsIP);
5295 ppIRExpr( next );
5296 vex_printf( "; exit-");
5297 ppIRJumpKind(jk);
5298 vex_printf( "\n");
5299 }
5301 /* Case: boring transfer to known address */
5302 if (next->tag == Iex_Const) {
5303 IRConst* cdst = next->Iex.Const.con;
5304 vassert(cdst->tag == Ico_U64);
5305 if (jk == Ijk_Boring || jk == Ijk_Call) {
5306 /* Boring transfer to known address */
5307 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5308 if (env->chainingAllowed) {
5309 /* .. almost always true .. */
5310 /* Skip the event check at the dst if this is a forwards
5311 edge. */
5312 Bool toFastEP
5313 = ((Addr64)cdst->Ico.U64) > env->max_ga;
5314 if (0) vex_printf("%s", toFastEP ? "X" : ".");
5315 addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5316 amRIP, Acc_ALWAYS,
5317 toFastEP));
5318 } else {
5319 /* .. very occasionally .. */
5320 /* We can't use chaining, so ask for an indirect transfer,
5321 as that's the cheapest alternative that is
5322 allowable. */
5323 HReg r = iselIntExpr_R(env, next);
5324 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5325 Ijk_Boring));
5326 }
5327 return;
5328 }
5329 }
5331 /* Case: call/return (==boring) transfer to any address */
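/* Rough distinction: XIndir is a plain indirect transfer back to
   the dispatcher and is only usable when chaining is allowed,
   whereas XAssisted additionally hands the dispatcher a jump kind
   (here Ijk_Boring) saying why the translation exited.  See
   host_amd64_defs.c for the definitive semantics. */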
5332 switch (jk) {
5333 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5334 HReg r = iselIntExpr_R(env, next);
5335 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5336 if (env->chainingAllowed) {
5337 addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5338 } else {
5339 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5340 Ijk_Boring));
5341 }
5342 return;
5343 }
5344 default:
5345 break;
5346 }
5348 /* Case: assisted transfer to arbitrary address */
5349 switch (jk) {
5350 /* Keep this list in sync with that for Ist_Exit above */
5351 case Ijk_ClientReq:
5352 case Ijk_EmWarn:
5353 case Ijk_NoDecode:
5354 case Ijk_NoRedir:
5355 case Ijk_SigSEGV:
5356 case Ijk_SigBUS:
5357 case Ijk_SigTRAP:
5358 case Ijk_Sys_syscall:
5359 case Ijk_Sys_int210:
5360 case Ijk_InvalICache:
5361 case Ijk_Yield: {
5362 HReg r = iselIntExpr_R(env, next);
5363 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5364 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5365 return;
5366 }
5367 default:
5368 break;
5369 }
5371 vex_printf( "\n-- PUT(%d) = ", offsIP);
5372 ppIRExpr( next );
5373 vex_printf( "; exit-");
5374 ppIRJumpKind(jk);
5375 vex_printf( "\n");
5376 vassert(0); // are we expecting any other kind?
5377 }
5380 /*---------------------------------------------------------*/
5381 /*--- Insn selector top-level ---*/
5382 /*---------------------------------------------------------*/
5384 /* Translate an entire SB to amd64 code. */
5386 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5387 VexArch arch_host,
5388 const VexArchInfo* archinfo_host,
5389 const VexAbiInfo* vbi/*UNUSED*/,
5390 Int offs_Host_EvC_Counter,
5391 Int offs_Host_EvC_FailAddr,
5392 Bool chainingAllowed,
5393 Bool addProfInc,
5394 Addr max_ga )
5395 {
5396 Int i, j;
5397 HReg hreg, hregHI;
5398 ISelEnv* env;
5399 UInt hwcaps_host = archinfo_host->hwcaps;
5400 AMD64AMode *amCounter, *amFailAddr;
5402 /* sanity ... */
5403 vassert(arch_host == VexArchAMD64);
5404 vassert(0 == (hwcaps_host
5405 & ~(VEX_HWCAPS_AMD64_SSE3
5406 | VEX_HWCAPS_AMD64_SSSE3
5407 | VEX_HWCAPS_AMD64_CX16
5408 | VEX_HWCAPS_AMD64_LZCNT
5409 | VEX_HWCAPS_AMD64_AVX
5410 | VEX_HWCAPS_AMD64_RDTSCP
5411 | VEX_HWCAPS_AMD64_BMI
5412 | VEX_HWCAPS_AMD64_AVX2
5413 | VEX_HWCAPS_AMD64_F16C
5414 | VEX_HWCAPS_AMD64_RDRAND
5415 | VEX_HWCAPS_AMD64_RDSEED
5416 | VEX_HWCAPS_AMD64_FMA3
5417 | VEX_HWCAPS_AMD64_FMA4)));
5419 /* Check that the host's endianness is as expected. */
5420 vassert(archinfo_host->endness == VexEndnessLE);
5422 /* Make up an initial environment to use. */
5423 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5424 env->vreg_ctr = 0;
5426 /* Set up output code array. */
5427 env->code = newHInstrArray();
5429 /* Copy BB's type env. */
5430 env->type_env = bb->tyenv;
5432 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
5433 change as we go along. */
5434 env->n_vregmap = bb->tyenv->types_used;
5435 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5436 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
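/* Convention: for Ity_I128 and Ity_V256 temps, vregmap holds the
   low-half register and vregmapHI the high half; the
   lookupIRTempPair calls in iselStmt above rely on this pairing. */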
5438 /* and finally ... */
5439 env->chainingAllowed = chainingAllowed;
5440 env->hwcaps = hwcaps_host;
5441 env->max_ga = max_ga;
5443 /* For each IR temporary, allocate a suitably-kinded virtual
5444 register. */
5445 j = 0;
5446 for (i = 0; i < env->n_vregmap; i++) {
5447 hregHI = hreg = INVALID_HREG;
5448 switch (bb->tyenv->types[i]) {
5449 case Ity_I1:
5450 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5451 hreg = mkHReg(True, HRcInt64, 0, j++);
5452 break;
5453 case Ity_I128:
5454 hreg = mkHReg(True, HRcInt64, 0, j++);
5455 hregHI = mkHReg(True, HRcInt64, 0, j++);
5456 break;
5457 case Ity_F32:
5458 case Ity_F64:
5459 case Ity_V128:
5460 hreg = mkHReg(True, HRcVec128, 0, j++);
5461 break;
5462 case Ity_V256:
5463 hreg = mkHReg(True, HRcVec128, 0, j++);
5464 hregHI = mkHReg(True, HRcVec128, 0, j++);
5465 break;
5466 default:
5467 ppIRType(bb->tyenv->types[i]);
5468 vpanic("iselBB(amd64): IRTemp type");
5469 }
5470 env->vregmap[i] = hreg;
5471 env->vregmapHI[i] = hregHI;
5472 }
5473 env->vreg_ctr = j;
5475 /* The very first instruction must be an event check. */
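/* Roughly: EvCheck decrements the down-counter held at amCounter
   and, if it goes negative, jumps to the dispatcher address stored
   at amFailAddr; otherwise it falls through into the block.  This
   bounds how long generated code can run before control returns to
   the scheduler. */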
5476 amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
5477 amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5478 addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
5480 /* Possibly a block counter increment (for profiling). At this
5481 point we don't know the address of the counter, so just pretend
5482 it is zero. It will have to be patched later, but before this
5483 translation is used, by a call to LibVEX_patchProfCtr. */
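/* ProfInc is expected to expand to an increment of a 64-bit counter
   at a placeholder address; LibVEX_patchProfCtr later overwrites
   that placeholder with the real counter address. */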
5484 if (addProfInc) {
5485 addInstr(env, AMD64Instr_ProfInc());
5486 }
5488 /* Ok, finally we can iterate over the statements. */
5489 for (i = 0; i < bb->stmts_used; i++)
5490 if (bb->stmts[i])
5491 iselStmt(env, bb->stmts[i]);
5493 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5495 /* record the number of vregs we used. */
5496 env->code->n_vregs = env->vreg_ctr;
5497 return env->code;
5498 }
5501 /*---------------------------------------------------------------*/
5502 /*--- end host_amd64_isel.c ---*/
5503 /*---------------------------------------------------------------*/