2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_ir.h"
36 #include "libvex.h"
38 #include "ir_match.h"
39 #include "main_util.h"
40 #include "main_globals.h"
41 #include "host_generic_regs.h"
42 #include "host_generic_simd64.h"
43 #include "host_generic_simd128.h"
44 #include "host_generic_simd256.h"
45 #include "host_generic_maddf.h"
46 #include "host_amd64_defs.h"
49 /*---------------------------------------------------------*/
50 /*--- x87/SSE control word stuff ---*/
51 /*---------------------------------------------------------*/
53 /* Vex-generated code expects to run with the FPU set as follows: all
54 exceptions masked, round-to-nearest, precision = 53 bits. This
55 corresponds to an FPU control word value of 0x027F.
57 Similarly the SSE control word (%mxcsr) should be 0x1F80.
59 %fpucw and %mxcsr should have these values on entry to
60 Vex-generated code, and those values should be
61 unchanged at exit.
64 #define DEFAULT_FPUCW 0x027F
66 #define DEFAULT_MXCSR 0x1F80
68 /* debugging only, do not use */
69 /* define DEFAULT_FPUCW 0x037F */
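/* For reference: 0x027F masks all six x87 exceptions (low bits),
   selects 53-bit precision (PC field, bits 9:8 = 10b) and
   round-to-nearest (RC field, bits 11:10 = 00b); the debug value
   0x037F differs only in selecting 64-bit precision.  0x1F80 in
   %mxcsr masks all six SSE exceptions (bits 12:7) and leaves the
   rounding-control field (bits 14:13) at 00b, round-to-nearest. */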
72 /*---------------------------------------------------------*/
73 /*--- misc helpers ---*/
74 /*---------------------------------------------------------*/
76 /* These are duplicated in guest-amd64/toIR.c */
77 static IRExpr* unop ( IROp op, IRExpr* a )
79 return IRExpr_Unop(op, a);
82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
84 return IRExpr_Binop(op, a1, a2);
87 static IRExpr* bind ( Int binder )
89 return IRExpr_Binder(binder);
92 static Bool isZeroU8 ( const IRExpr* e )
94 return e->tag == Iex_Const
95 && e->Iex.Const.con->tag == Ico_U8
96 && e->Iex.Const.con->Ico.U8 == 0;
100 /*---------------------------------------------------------*/
101 /*--- ISelEnv ---*/
102 /*---------------------------------------------------------*/
104 /* This carries around:
106 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
107 might encounter. This is computed before insn selection starts,
108 and does not change.
110 - A mapping from IRTemp to HReg. This tells the insn selector
111 which virtual register is associated with each IRTemp
112 temporary. This is computed before insn selection starts, and
113 does not change. We expect this mapping to map precisely the
114 same set of IRTemps as the type mapping does.
116 - vregmap holds the primary register for the IRTemp.
117 - vregmapHI is only used for 128-bit integer-typed
118 IRTemps. It holds the identity of a second
119 64-bit virtual HReg, which holds the high half
120 of the value.
122 - The host subarchitecture we are selecting insns for.
123 This is set at the start and does not change.
125 - The code array, that is, the insns selected so far.
127 - A counter, for generating new virtual registers.
129 - A Bool for indicating whether we may generate chain-me
130 instructions for control flow transfers, or whether we must use
131 XAssisted.
133 - The maximum guest address of any guest insn in this block.
134 Actually, the address of the highest-addressed byte from any insn
135 in this block. It is set at the start and does not change. This is
136 used for detecting jumps which are definitely forward-edges from
137 this block, and therefore can be made (chained) to the fast entry
138 point of the destination, thereby avoiding the destination's
139 event check.
141 Note, this is all host-independent. (JRS 20050201: well, kinda
142 ... not completely. Compare with ISelEnv for X86.)
145 typedef
146 struct {
147 /* Constant -- these are set at the start and do not change. */
148 IRTypeEnv* type_env;
150 HReg* vregmap;
151 HReg* vregmapHI;
152 Int n_vregmap;
154 UInt hwcaps;
156 Bool chainingAllowed;
157 Addr64 max_ga;
159 /* These are modified as we go along. */
160 HInstrArray* code;
161 Int vreg_ctr;
163 ISelEnv;
166 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
168 vassert(tmp >= 0);
169 vassert(tmp < env->n_vregmap);
170 return env->vregmap[tmp];
173 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
174 ISelEnv* env, IRTemp tmp )
176 vassert(tmp >= 0);
177 vassert(tmp < env->n_vregmap);
178 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
179 *vrLO = env->vregmap[tmp];
180 *vrHI = env->vregmapHI[tmp];
183 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
185 addHInstr(env->code, instr);
186 if (vex_traceflags & VEX_TRACE_VCODE) {
187 ppAMD64Instr(instr, True);
188 vex_printf("\n");
192 static HReg newVRegI ( ISelEnv* env )
194 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
195 env->vreg_ctr++;
196 return reg;
199 static HReg newVRegV ( ISelEnv* env )
201 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
202 env->vreg_ctr++;
203 return reg;
207 /*---------------------------------------------------------*/
208 /*--- ISEL: Forward declarations ---*/
209 /*---------------------------------------------------------*/
211 /* These are organised as iselXXX and iselXXX_wrk pairs. The
212 iselXXX_wrk do the real work, but are not to be called directly.
213 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
214 checks that all returned registers are virtual. You should not
215 call the _wrk version directly.
217 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
218 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
220 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
221 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
223 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
224 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
226 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
227 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
229 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
230 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
232 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
233 ISelEnv* env, const IRExpr* e );
234 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, const IRExpr* e );
237 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
238 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
240 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
241 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
243 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
244 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
246 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
247 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
249 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
250 ISelEnv* env, const IRExpr* e );
251 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, const IRExpr* e );
255 /*---------------------------------------------------------*/
256 /*--- ISEL: Misc helpers ---*/
257 /*---------------------------------------------------------*/
259 static Bool sane_AMode ( AMD64AMode* am )
261 switch (am->tag) {
262 case Aam_IR:
263 return
264 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
265 && (hregIsVirtual(am->Aam.IR.reg)
266 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
267 case Aam_IRRS:
268 return
269 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
270 && hregIsVirtual(am->Aam.IRRS.base)
271 && hregClass(am->Aam.IRRS.index) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.index) );
273 default:
274 vpanic("sane_AMode: unknown amd64 amode tag");
279 /* Can the lower 32 bits be signedly widened to produce the whole
280 64-bit value? In other words, are the top 33 bits either all 0 or
281 all 1 ? */
282 static Bool fitsIn32Bits ( ULong x )
284 Long y1;
285 y1 = x << 32;
286 y1 >>=/*s*/ 32;
287 return toBool(x == y1);
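/* For example, 0xFFFFFFFF80000000ULL fits, being the sign-extension
   of 0x80000000, whereas 0x0000000080000000ULL does not: its top 33
   bits are neither all zero nor all one. */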
290 /* Is this a 64-bit zero expression? */
292 static Bool isZeroU64 ( const IRExpr* e )
294 return e->tag == Iex_Const
295 && e->Iex.Const.con->tag == Ico_U64
296 && e->Iex.Const.con->Ico.U64 == 0ULL;
299 static Bool isZeroU32 ( const IRExpr* e )
301 return e->tag == Iex_Const
302 && e->Iex.Const.con->tag == Ico_U32
303 && e->Iex.Const.con->Ico.U32 == 0;
306 /* Are both args atoms and the same? This is a copy of eqIRAtom
307 that omits the assertions that the args are indeed atoms. */
309 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
311 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
312 return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
313 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
314 return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
315 return False;
318 /* Make an int reg-reg move. */
320 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
322 vassert(hregClass(src) == HRcInt64);
323 vassert(hregClass(dst) == HRcInt64);
324 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
327 /* Make a vector (128 bit) reg-reg move. */
329 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
331 vassert(hregClass(src) == HRcVec128);
332 vassert(hregClass(dst) == HRcVec128);
333 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
336 /* Advance/retreat %rsp by n. */
338 static void add_to_rsp ( ISelEnv* env, Int n )
340 vassert(n > 0 && n < 256 && (n%8) == 0);
341 addInstr(env,
342 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
343 hregAMD64_RSP()));
346 static void sub_from_rsp ( ISelEnv* env, Int n )
348 vassert(n > 0 && n < 256 && (n%8) == 0);
349 addInstr(env,
350 AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
351 hregAMD64_RSP()));
354 /* Push a 64-bit constant on the stack. */
355 static void push_uimm64( ISelEnv* env, ULong uimm64 )
357 /* If uimm64 can be expressed as the sign extension of its
358 lower 32 bits, we can do it the easy way. */
359 Long simm64 = (Long)uimm64;
360 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
361 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
362 } else {
363 HReg tmp = newVRegI(env);
364 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
365 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
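/* For instance, push_uimm64(env, 0x12345678) becomes a single
   pushq $0x12345678, since the value is sign-extendable from 32
   bits, whereas push_uimm64(env, 0x1122334455667788) needs a
   64-bit immediate load into a scratch register followed by a
   pushq of that register. */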
370 /* Used only in doHelperCall. If possible, produce a single
371 instruction which computes 'e' into 'dst'. If not possible, return
372 NULL. */
374 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
375 HReg dst,
376 IRExpr* e )
378 /* Per comments in doHelperCall below, appearance of
379 Iex_VECRET implies ill-formed IR. */
380 vassert(e->tag != Iex_VECRET);
382 /* In this case we give out a copy of the BaseBlock pointer. */
383 if (UNLIKELY(e->tag == Iex_GSPTR)) {
384 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
387 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
389 if (e->tag == Iex_Const) {
390 vassert(e->Iex.Const.con->tag == Ico_U64);
391 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
392 return AMD64Instr_Alu64R(
393 Aalu_MOV,
394 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
395 dst
396 );
397 } else {
398 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
402 if (e->tag == Iex_RdTmp) {
403 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
404 return mk_iMOVsd_RR(src, dst);
407 if (e->tag == Iex_Get) {
408 vassert(e->Iex.Get.ty == Ity_I64);
409 return AMD64Instr_Alu64R(
410 Aalu_MOV,
411 AMD64RMI_Mem(
412 AMD64AMode_IR(e->Iex.Get.offset,
413 hregAMD64_RBP())),
414 dst);
417 if (e->tag == Iex_Unop
418 && e->Iex.Unop.op == Iop_32Uto64
419 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
420 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
421 return AMD64Instr_MovxLQ(False, src, dst);
424 if (0) { ppIRExpr(e); vex_printf("\n"); }
426 return NULL;
430 /* Do a complete function call. |guard| is a Ity_Bit expression
431 indicating whether or not the call happens. If guard==NULL, the
432 call is unconditional. |retloc| is set to indicate where the
433 return value is after the call. The caller (of this fn) must
434 generate code to add |stackAdjustAfterCall| to the stack pointer
435 after the call is done. */
437 static
438 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
439 /*OUT*/RetLoc* retloc,
440 ISelEnv* env,
441 IRExpr* guard,
442 IRCallee* cee, IRType retTy, IRExpr** args )
444 AMD64CondCode cc;
445 HReg argregs[6];
446 HReg tmpregs[6];
447 AMD64Instr* fastinstrs[6];
448 UInt n_args, i;
450 /* Set default returns. We'll update them later if needed. */
451 *stackAdjustAfterCall = 0;
452 *retloc = mk_RetLoc_INVALID();
454 /* These are used for cross-checking that IR-level constraints on
455 the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
456 UInt nVECRETs = 0;
457 UInt nGSPTRs = 0;
459 /* Marshal args for a call and do the call.
461 This function only deals with a tiny set of possibilities, which
462 cover all helpers in practice. The restrictions are that only
463 arguments in registers are supported, hence only 6x64 integer
464 bits in total can be passed. In fact the only supported arg
465 type is I64.
467 The return type can be I{64,32,16,8} or V{128,256}. In the
468 latter two cases, it is expected that |args| will contain the
469 special node IRExpr_VECRET(), in which case this routine
470 generates code to allocate space on the stack for the vector
471 return value. Since we are not passing any scalars on the
472 stack, it is enough to preallocate the return space before
473 marshalling any arguments, in this case.
475 |args| may also contain IRExpr_GSPTR(), in which case the
476 value in %rbp is passed as the corresponding argument.
478 Generating code which is both efficient and correct when
479 parameters are to be passed in registers is difficult, for the
480 reasons elaborated in detail in comments attached to
481 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
482 of the method described in those comments.
484 The problem is split into two cases: the fast scheme and the
485 slow scheme. In the fast scheme, arguments are computed
486 directly into the target (real) registers. This is only safe
487 when we can be sure that computation of each argument will not
488 trash any real registers set by computation of any other
489 argument.
491 In the slow scheme, all args are first computed into vregs, and
492 once they are all done, they are moved to the relevant real
493 regs. This always gives correct code, but it also gives a bunch
494 of vreg-to-rreg moves which are usually redundant but are hard
495 for the register allocator to get rid of.
497 To decide which scheme to use, all argument expressions are
498 first examined. If they are all so simple that it is clear they
499 will be evaluated without use of any fixed registers, use the
500 fast scheme, else use the slow scheme. Note also that only
501 unconditional calls may use the fast scheme, since having to
502 compute a condition expression could itself trash real
503 registers. Note that for simplicity, in the case where
504 IRExpr_VECRET() is present, we use the slow scheme. This is
505 motivated by the desire to avoid any possible complexity
506 w.r.t. nested calls.
508 Note this requires being able to examine an expression and
509 determine whether or not evaluation of it might use a fixed
510 register. That requires knowledge of how the rest of this insn
511 selector works. Currently just the following 3 are regarded as
512 safe -- hopefully they cover the majority of arguments in
513 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
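/* Illustrative example (names invented): a call whose args are a
   RdTmp, a 64-bit Get and a small constant qualifies for the fast
   scheme, since each can be computed straight into %rdi/%rsi/%rdx
   with one instruction (see iselIntExpr_single_instruction above).
   An arg such as Add64(t1,t2) forces the slow scheme, because it
   cannot be computed into its target register in one instruction. */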
516 /* Note that the cee->regparms field is meaningless on AMD64 host
517 (since there is only one calling convention) and so we always
518 ignore it. */
519 n_args = 0;
520 for (i = 0; args[i]; i++)
521 n_args++;
523 if (n_args > 6)
524 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
526 argregs[0] = hregAMD64_RDI();
527 argregs[1] = hregAMD64_RSI();
528 argregs[2] = hregAMD64_RDX();
529 argregs[3] = hregAMD64_RCX();
530 argregs[4] = hregAMD64_R8();
531 argregs[5] = hregAMD64_R9();
533 tmpregs[0] = tmpregs[1] = tmpregs[2] =
534 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
536 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
537 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
539 /* First decide which scheme (slow or fast) is to be used. First
540 assume the fast scheme, and select slow if any contraindications
541 (wow) appear. */
543 /* We'll need space on the stack for the return value. Avoid
544 possible complications with nested calls by using the slow
545 scheme. */
546 if (retTy == Ity_V128 || retTy == Ity_V256)
547 goto slowscheme;
549 if (guard) {
550 if (guard->tag == Iex_Const
551 && guard->Iex.Const.con->tag == Ico_U1
552 && guard->Iex.Const.con->Ico.U1 == True) {
553 /* unconditional */
554 } else {
555 /* Not manifestly unconditional -- be conservative. */
556 goto slowscheme;
560 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
561 use the slow scheme. Because this is tentative, we can't call
562 addInstr (that is, commit to) any instructions until we've
563 handled all the arguments. So park the resulting instructions
564 in a buffer and emit that if we're successful. */
566 /* FAST SCHEME */
567 /* In this loop, we process args that can be computed into the
568 destination (real) register with a single instruction, without
569 using any fixed regs. That also includes IRExpr_GSPTR(), but
570 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
571 never see IRExpr_VECRET() at this point, since the return-type
572 check above should ensure all those cases use the slow scheme
573 instead. */
574 vassert(n_args >= 0 && n_args <= 6);
575 for (i = 0; i < n_args; i++) {
576 IRExpr* arg = args[i];
577 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
578 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
580 fastinstrs[i]
581 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
582 if (fastinstrs[i] == NULL)
583 goto slowscheme;
586 /* Looks like we're in luck. Emit the accumulated instructions and
587 move on to doing the call itself. */
588 for (i = 0; i < n_args; i++)
589 addInstr(env, fastinstrs[i]);
591 /* Fast scheme only applies for unconditional calls. Hence: */
592 cc = Acc_ALWAYS;
594 goto handle_call;
597 /* SLOW SCHEME; move via temporaries */
598 slowscheme:
600 # if 0 /* debug only */
601 if (n_args > 0) {for (i = 0; args[i]; i++) {
602 ppIRExpr(args[i]); vex_printf(" "); }
603 vex_printf("\n");}
604 # endif
606 /* If we have a vector return type, allocate a place for it on the
607 stack and record its address. */
608 HReg r_vecRetAddr = INVALID_HREG;
609 if (retTy == Ity_V128) {
610 r_vecRetAddr = newVRegI(env);
611 sub_from_rsp(env, 16);
612 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
614 else if (retTy == Ity_V256) {
615 r_vecRetAddr = newVRegI(env);
616 sub_from_rsp(env, 32);
617 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
620 vassert(n_args >= 0 && n_args <= 6);
621 for (i = 0; i < n_args; i++) {
622 IRExpr* arg = args[i];
623 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
624 tmpregs[i] = newVRegI(env);
625 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
626 nGSPTRs++;
628 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
629 /* We stashed the address of the return slot earlier, so just
630 retrieve it now. */
631 vassert(!hregIsInvalid(r_vecRetAddr));
632 tmpregs[i] = r_vecRetAddr;
633 nVECRETs++;
635 else {
636 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
637 tmpregs[i] = iselIntExpr_R(env, args[i]);
641 /* Now we can compute the condition. We can't do it earlier
642 because the argument computations could trash the condition
643 codes. Be a bit clever to handle the common case where the
644 guard is 1:Bit. */
645 cc = Acc_ALWAYS;
646 if (guard) {
647 if (guard->tag == Iex_Const
648 && guard->Iex.Const.con->tag == Ico_U1
649 && guard->Iex.Const.con->Ico.U1 == True) {
650 /* unconditional -- do nothing */
651 } else {
652 cc = iselCondCode( env, guard );
656 /* Move the args to their final destinations. */
657 for (i = 0; i < n_args; i++) {
658 /* None of these insns, including any spill code that might
659 be generated, may alter the condition codes. */
660 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
664 /* Do final checks, set the return values, and generate the call
665 instruction proper. */
666 handle_call:
668 if (retTy == Ity_V128 || retTy == Ity_V256) {
669 vassert(nVECRETs == 1);
670 } else {
671 vassert(nVECRETs == 0);
674 vassert(nGSPTRs == 0 || nGSPTRs == 1);
676 vassert(*stackAdjustAfterCall == 0);
677 vassert(is_RetLoc_INVALID(*retloc));
678 switch (retTy) {
679 case Ity_INVALID:
680 /* Function doesn't return a value. */
681 *retloc = mk_RetLoc_simple(RLPri_None);
682 break;
683 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
684 *retloc = mk_RetLoc_simple(RLPri_Int);
685 break;
686 case Ity_V128:
687 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
688 *stackAdjustAfterCall = 16;
689 break;
690 case Ity_V256:
691 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
692 *stackAdjustAfterCall = 32;
693 break;
694 default:
695 /* IR can denote other possible return types, but we don't
696 handle those here. */
697 vassert(0);
700 /* Finally, generate the call itself. This needs the *retloc value
701 set in the switch above, which is why it's at the end. */
702 addInstr(env,
703 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
707 /* Given a guest-state array descriptor, an index expression and a
708 bias, generate an AMD64AMode holding the relevant guest state
709 offset. */
711 static
712 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
713 IRExpr* off, Int bias )
715 HReg tmp, roff;
716 Int elemSz = sizeofIRType(descr->elemTy);
717 Int nElems = descr->nElems;
719 /* Throw out any cases not generated by an amd64 front end. In
720 theory there might be a day where we need to handle them -- if
721 we ever run non-amd64-guest on amd64 host. */
723 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
724 vpanic("genGuestArrayOffset(amd64 host)");
726 /* Compute off into a reg, %off. Then return:
728 movq %off, %tmp
729 addq $bias, %tmp (if bias != 0)
730 andq $7, %tmp
731 ... base(%rbp, %tmp, shift) ...
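/* For example, with nElems == 8, elemSz == 8, off evaluating to 5
   and bias == 0, %tmp ends up holding 5 and the amode returned is
   base(%rbp,%tmp,8), i.e. guest state offset descr->base + 5*8
   relative to %rbp. */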
733 tmp = newVRegI(env);
734 roff = iselIntExpr_R(env, off);
735 addInstr(env, mk_iMOVsd_RR(roff, tmp));
736 if (bias != 0) {
737 /* Make sure the bias is sane, in the sense that there are
738 no significant bits above bit 30 in it. */
739 vassert(-10000 < bias && bias < 10000);
740 addInstr(env,
741 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
743 addInstr(env,
744 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
745 vassert(elemSz == 1 || elemSz == 8);
746 return
747 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
748 elemSz==8 ? 3 : 0);
752 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
753 static
754 void set_SSE_rounding_default ( ISelEnv* env )
756 /* pushq $DEFAULT_MXCSR
757 ldmxcsr 0(%rsp)
758 addq $8, %rsp
760 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
761 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
762 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
763 add_to_rsp(env, 8);
766 /* Mess with the FPU's rounding mode: set to the default rounding mode
767 (DEFAULT_FPUCW). */
768 static
769 void set_FPU_rounding_default ( ISelEnv* env )
771 /* movq $DEFAULT_FPUCW, -8(%rsp)
772 fldcw -8(%rsp)
774 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
775 addInstr(env, AMD64Instr_Alu64M(
776 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
777 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
781 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
782 expression denoting a value in the range 0 .. 3, indicating a round
783 mode encoded as per type IRRoundingMode. Set the SSE machinery to
784 have the same rounding.
786 static
787 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
789 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
790 both rounding bits == 0. If that wasn't the case, we couldn't
791 create a new rounding field simply by ORing the new value into
792 place. */
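/* A further point: this only works because IRRoundingMode uses the
   same 2-bit encoding as the hardware RC field -- 0 = nearest,
   1 = -infinity, 2 = +infinity, 3 = toward zero -- so the IR value
   can be shifted directly into %mxcsr bits 14:13. */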
794 /* movq $3, %reg
795 andq [[mode]], %reg -- shouldn't be needed; paranoia
796 shlq $13, %reg
797 orq $DEFAULT_MXCSR, %reg
798 pushq %reg
799 ldmxcsr 0(%rsp)
800 addq $8, %rsp
802 HReg reg = newVRegI(env);
803 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
804 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
805 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
806 iselIntExpr_RMI(env, mode), reg));
807 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
808 addInstr(env, AMD64Instr_Alu64R(
809 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
810 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
811 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
812 add_to_rsp(env, 8);
816 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
817 expression denoting a value in the range 0 .. 3, indicating a round
818 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
819 the same rounding.
821 static
822 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
824 HReg rrm = iselIntExpr_R(env, mode);
825 HReg rrm2 = newVRegI(env);
826 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
828 /* movq %rrm, %rrm2
829 andq $3, %rrm2 -- shouldn't be needed; paranoia
830 shlq $10, %rrm2
831 orq $DEFAULT_FPUCW, %rrm2
832 movq %rrm2, -8(%rsp)
833 fldcw -8(%rsp)
835 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
836 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
837 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
838 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
839 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
840 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
841 AMD64RI_Reg(rrm2), m8_rsp));
842 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
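/* As with set_SSE_rounding_mode above, this relies on IRRoundingMode
   matching the hardware RC encoding; the only difference is that the
   x87 RC field sits at bits 11:10 of the control word, hence the
   shift by 10 rather than 13. */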
846 /* Generate all-zeroes into a new vector register.
848 static HReg generate_zeroes_V128 ( ISelEnv* env )
850 HReg dst = newVRegV(env);
851 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
852 return dst;
855 /* Generate all-ones into a new vector register.
857 static HReg generate_ones_V128 ( ISelEnv* env )
859 HReg dst = newVRegV(env);
860 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
861 return dst;
865 /* Generate !src into a new vector register. Amazing that there isn't
866 a less crappy way to do this.
868 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
870 HReg dst = generate_ones_V128(env);
871 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
872 return dst;
876 /* Expand the given byte into a 64-bit word, by cloning each bit
877 8 times. */
878 static ULong bitmask8_to_bytemask64 ( UShort w8 )
880 vassert(w8 == (w8 & 0xFF));
881 ULong w64 = 0;
882 Int i;
883 for (i = 0; i < 8; i++) {
884 if (w8 & (1<<i))
885 w64 |= (0xFFULL << (8 * i));
887 return w64;
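/* For example, bitmask8_to_bytemask64(0x81) == 0xFF000000000000FFULL:
   bits 0 and 7 of the input select bytes 0 and 7 of the result. */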
891 /*---------------------------------------------------------*/
892 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
893 /*---------------------------------------------------------*/
895 /* Select insns for an integer-typed expression, and add them to the
896 code list. Return a reg holding the result. This reg will be a
897 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
898 want to modify it, ask for a new vreg, copy it in there, and modify
899 the copy. The register allocator will do its best to map both
900 vregs to the same real register, so the copies will often disappear
901 later in the game.
903 This should handle expressions of 64, 32, 16 and 8-bit type. All
904 results are returned in a 64-bit register. For 32-, 16- and 8-bit
905 expressions, the upper 32/48/56 bits are arbitrary, so you should
906 mask or sign extend partial values if necessary.
909 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
911 HReg r = iselIntExpr_R_wrk(env, e);
912 /* sanity checks ... */
913 # if 0
914 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
915 # endif
916 vassert(hregClass(r) == HRcInt64);
917 vassert(hregIsVirtual(r));
918 return r;
921 /* DO NOT CALL THIS DIRECTLY ! */
922 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
924 MatchInfo mi;
925 DECLARE_PATTERN(p_1Uto8_64to1);
926 DECLARE_PATTERN(p_LDle8_then_8Uto64);
927 DECLARE_PATTERN(p_LDle16_then_16Uto64);
929 IRType ty = typeOfIRExpr(env->type_env,e);
930 switch (ty) {
931 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
932 default: vassert(0);
935 switch (e->tag) {
937 /* --------- TEMP --------- */
938 case Iex_RdTmp: {
939 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
942 /* --------- LOAD --------- */
943 case Iex_Load: {
944 HReg dst = newVRegI(env);
945 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
947 /* We can't handle big-endian loads, nor load-linked. */
948 if (e->Iex.Load.end != Iend_LE)
949 goto irreducible;
951 if (ty == Ity_I64) {
952 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
953 AMD64RMI_Mem(amode), dst) );
954 return dst;
956 if (ty == Ity_I32) {
957 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
958 return dst;
960 if (ty == Ity_I16) {
961 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
962 return dst;
964 if (ty == Ity_I8) {
965 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
966 return dst;
968 break;
971 /* --------- BINARY OP --------- */
972 case Iex_Binop: {
973 AMD64AluOp aluOp;
974 AMD64ShiftOp shOp;
976 /* Pattern: Sub64(0,x) */
977 /* and: Sub32(0,x) */
978 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
979 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
980 HReg dst = newVRegI(env);
981 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
982 addInstr(env, mk_iMOVsd_RR(reg,dst));
983 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
984 return dst;
987 /* Is it an addition or logical style op? */
988 switch (e->Iex.Binop.op) {
989 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
990 aluOp = Aalu_ADD; break;
991 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
992 aluOp = Aalu_SUB; break;
993 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
994 aluOp = Aalu_AND; break;
995 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
996 aluOp = Aalu_OR; break;
997 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
998 aluOp = Aalu_XOR; break;
999 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1000 aluOp = Aalu_MUL; break;
1001 default:
1002 aluOp = Aalu_INVALID; break;
1004 /* For commutative ops we assume any literal
1005 values are on the second operand. */
1006 if (aluOp != Aalu_INVALID) {
1007 HReg dst = newVRegI(env);
1008 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1009 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1010 addInstr(env, mk_iMOVsd_RR(reg,dst));
1011 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1012 return dst;
1015 /* Perhaps a shift op? */
1016 switch (e->Iex.Binop.op) {
1017 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1018 shOp = Ash_SHL; break;
1019 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1020 shOp = Ash_SHR; break;
1021 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1022 shOp = Ash_SAR; break;
1023 default:
1024 shOp = Ash_INVALID; break;
1026 if (shOp != Ash_INVALID) {
1027 HReg dst = newVRegI(env);
1029 /* regL = the value to be shifted */
1030 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1031 addInstr(env, mk_iMOVsd_RR(regL,dst));
1033 /* Do any necessary widening for 32/16/8 bit operands */
1034 switch (e->Iex.Binop.op) {
1035 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1036 break;
1037 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1038 break;
1039 case Iop_Shr8:
1040 addInstr(env, AMD64Instr_Alu64R(
1041 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1042 break;
1043 case Iop_Shr16:
1044 addInstr(env, AMD64Instr_Alu64R(
1045 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1046 break;
1047 case Iop_Shr32:
1048 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1049 break;
1050 case Iop_Sar8:
1051 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1052 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1053 break;
1054 case Iop_Sar16:
1055 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1056 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1057 break;
1058 case Iop_Sar32:
1059 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1060 break;
1061 default:
1062 ppIROp(e->Iex.Binop.op);
1063 vassert(0);
1066 /* Now consider the shift amount. If it's a literal, we
1067 can do a much better job than the general case. */
1068 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1069 /* assert that the IR is well-typed */
1070 Int nshift;
1071 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1072 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1073 vassert(nshift >= 0);
1074 if (nshift > 0)
1075 /* Can't allow nshift==0 since that means %cl */
1076 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1077 } else {
1078 /* General case; we have to force the amount into %cl. */
1079 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1080 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1081 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1083 return dst;
1086 /* Handle misc other scalar ops. */
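/* Max32U is done with a 32-bit compare plus a conditional move: dst
   starts out as src1; the CMP sets the carry flag iff src1 <u src2,
   in which case the cmov (Acc_B) overwrites dst with src2, leaving
   dst = max(src1, src2) unsigned. */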
1087 if (e->Iex.Binop.op == Iop_Max32U) {
1088 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1089 HReg dst = newVRegI(env);
1090 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1091 addInstr(env, mk_iMOVsd_RR(src1, dst));
1092 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1093 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1094 return dst;
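/* For DivMod{S,U}64to32 below, the packed result is laid out as
   (remainder << 32) | quotient, each truncated to 32 bits; e.g.
   DivModU64to32(7, 2) produces 0x0000000100000003. */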
1097 if (e->Iex.Binop.op == Iop_DivModS64to32
1098 || e->Iex.Binop.op == Iop_DivModU64to32) {
1099 /* 64 x 32 -> (32(rem),32(div)) division */
1100 /* Get the 64-bit operand into edx:eax, and the other into
1101 any old R/M. */
1102 HReg rax = hregAMD64_RAX();
1103 HReg rdx = hregAMD64_RDX();
1104 HReg dst = newVRegI(env);
1105 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1106 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1107 /* Compute the left operand into a reg, and then
1108 put the top half in edx and the bottom in eax. */
1109 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1110 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1111 addInstr(env, mk_iMOVsd_RR(left64, rax));
1112 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1113 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1114 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1115 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1116 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1117 addInstr(env, mk_iMOVsd_RR(rax, dst));
1118 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1119 return dst;
1122 if (e->Iex.Binop.op == Iop_32HLto64) {
1123 HReg hi32 = newVRegI(env);
1124 HReg lo32 = newVRegI(env);
1125 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1126 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1127 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1128 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1129 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1130 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1131 addInstr(env, AMD64Instr_Alu64R(
1132 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1133 return hi32;
1136 if (e->Iex.Binop.op == Iop_16HLto32) {
1137 HReg hi16 = newVRegI(env);
1138 HReg lo16 = newVRegI(env);
1139 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1140 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1141 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1142 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1143 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1144 addInstr(env, AMD64Instr_Alu64R(
1145 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1148 return hi16;
1151 if (e->Iex.Binop.op == Iop_8HLto16) {
1152 HReg hi8 = newVRegI(env);
1153 HReg lo8 = newVRegI(env);
1154 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1155 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1156 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1157 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1158 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1159 addInstr(env, AMD64Instr_Alu64R(
1160 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1163 return hi8;
1166 if (e->Iex.Binop.op == Iop_MullS32
1167 || e->Iex.Binop.op == Iop_MullS16
1168 || e->Iex.Binop.op == Iop_MullS8
1169 || e->Iex.Binop.op == Iop_MullU32
1170 || e->Iex.Binop.op == Iop_MullU16
1171 || e->Iex.Binop.op == Iop_MullU8) {
1172 HReg a32 = newVRegI(env);
1173 HReg b32 = newVRegI(env);
1174 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1175 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1176 Int shift = 0;
1177 AMD64ShiftOp shr_op = Ash_SHR;
1178 switch (e->Iex.Binop.op) {
1179 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1180 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1181 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1182 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1183 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1184 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1185 default: vassert(0);
1188 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1189 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1190 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1191 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1192 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1194 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1195 return b32;
1198 if (e->Iex.Binop.op == Iop_CmpF64) {
1199 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1200 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1201 HReg dst = newVRegI(env);
1202 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1203 /* Mask out irrelevant parts of the result so as to conform
1204 to the CmpF64 definition. */
1205 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1206 return dst;
1209 if (e->Iex.Binop.op == Iop_F64toI32S
1210 || e->Iex.Binop.op == Iop_F64toI64S) {
1211 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1212 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1213 HReg dst = newVRegI(env);
1214 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1215 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1216 set_SSE_rounding_default(env);
1217 return dst;
1220 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1221 by using the equivalent 128-bit operation and ignoring the upper half
1222 of the result. */
1223 AMD64SseOp op = Asse_INVALID;
1224 Bool arg1isEReg = False;
1225 Bool preShift32R = False;
1226 switch (e->Iex.Binop.op) {
1227 // The following 3 could be done with 128 bit insns too, but
1228 // first require the inputs to be reformatted.
1229 //case Iop_QNarrowBin32Sto16Sx4:
1230 //op = Asse_PACKSSD; arg1isEReg = True; break;
1231 //case Iop_QNarrowBin16Sto8Sx8:
1232 //op = Asse_PACKSSW; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Ux8:
1234 //op = Asse_PACKUSW; arg1isEReg = True; break;
1236 case Iop_InterleaveHI8x8:
1237 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1238 break;
1239 case Iop_InterleaveHI16x4:
1240 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1241 break;
1242 case Iop_InterleaveHI32x2:
1243 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1244 break;
1245 case Iop_InterleaveLO8x8:
1246 op = Asse_UNPCKLB; arg1isEReg = True;
1247 break;
1248 case Iop_InterleaveLO16x4:
1249 op = Asse_UNPCKLW; arg1isEReg = True;
1250 break;
1251 case Iop_InterleaveLO32x2:
1252 op = Asse_UNPCKLD; arg1isEReg = True;
1253 break;
1255 case Iop_Add8x8: op = Asse_ADD8; break;
1256 case Iop_Add16x4: op = Asse_ADD16; break;
1257 case Iop_Add32x2: op = Asse_ADD32; break;
1258 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1259 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1260 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1261 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1262 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1263 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1264 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1265 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1266 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1267 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1268 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1269 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1270 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1271 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1272 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1273 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1274 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1275 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1276 case Iop_Mul16x4: op = Asse_MUL16; break;
1277 case Iop_Sub8x8: op = Asse_SUB8; break;
1278 case Iop_Sub16x4: op = Asse_SUB16; break;
1279 case Iop_Sub32x2: op = Asse_SUB32; break;
1280 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1281 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1282 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1283 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1284 default: break;
1286 if (op != Asse_INVALID) {
1287 /* This isn't pretty, but .. move each arg to the low half of an XMM
1288 register, do the operation on the whole register, and move the
1289 result back to an integer register. */
1290 const IRExpr* arg1 = e->Iex.Binop.arg1;
1291 const IRExpr* arg2 = e->Iex.Binop.arg2;
1292 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1293 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1294 HReg iarg1 = iselIntExpr_R(env, arg1);
1295 HReg iarg2 = iselIntExpr_R(env, arg2);
1296 HReg varg1 = newVRegV(env);
1297 HReg varg2 = newVRegV(env);
1298 HReg idst = newVRegI(env);
1299 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1300 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1301 if (arg1isEReg) {
1302 if (preShift32R) {
1303 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1304 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1306 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1307 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1308 } else {
1309 vassert(!preShift32R);
1310 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1311 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1313 return idst;
1316 UInt laneBits = 0;
1317 op = Asse_INVALID;
1318 switch (e->Iex.Binop.op) {
1319 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1320 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1321 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1322 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1323 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1324 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1325 default: break;
1327 if (op != Asse_INVALID) {
1328 const IRExpr* arg1 = e->Iex.Binop.arg1;
1329 const IRExpr* arg2 = e->Iex.Binop.arg2;
1330 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1331 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1332 HReg igreg = iselIntExpr_R(env, arg1);
1333 HReg vgreg = newVRegV(env);
1334 HReg idst = newVRegI(env);
1335 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1336 /* If it's a shift by an in-range immediate, generate a single
1337 instruction. */
1338 if (arg2->tag == Iex_Const) {
1339 IRConst* c = arg2->Iex.Const.con;
1340 vassert(c->tag == Ico_U8);
1341 UInt shift = c->Ico.U8;
1342 if (shift < laneBits) {
1343 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1344 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1345 return idst;
1348 /* Otherwise we have to do it the longwinded way. */
1349 HReg ishift = iselIntExpr_R(env, arg2);
1350 HReg vshift = newVRegV(env);
1351 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1352 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1353 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1354 return idst;
1357 if (e->Iex.Binop.op == Iop_Mul32x2) {
1358 const IRExpr* arg1 = e->Iex.Binop.arg1;
1359 const IRExpr* arg2 = e->Iex.Binop.arg2;
1360 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1361 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1362 HReg s1 = iselIntExpr_R(env, arg1);
1363 HReg s2 = iselIntExpr_R(env, arg2);
1364 HReg resLo = newVRegI(env);
1365 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1366 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1367 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1368 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1370 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1371 HReg resHi = newVRegI(env);
1372 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1373 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1374 HReg tmp = newVRegI(env);
1375 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1376 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1377 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1380 // final result = resHi | resLo
1381 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1382 return resLo;
1385 // A few remaining SIMD64 ops require helper functions, at least for
1386 // now.
1387 Bool second_is_UInt = False;
1388 HWord fn = 0;
1389 switch (e->Iex.Binop.op) {
1390 case Iop_CatOddLanes16x4:
1391 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1392 case Iop_CatEvenLanes16x4:
1393 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1394 case Iop_PermOrZero8x8:
1395 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1397 case Iop_QNarrowBin32Sto16Sx4:
1398 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1399 case Iop_QNarrowBin16Sto8Sx8:
1400 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1401 case Iop_QNarrowBin16Sto8Ux8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1404 case Iop_NarrowBin16to8x8:
1405 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1406 case Iop_NarrowBin32to16x4:
1407 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1409 case Iop_SarN8x8:
1410 fn = (HWord)h_generic_calc_SarN8x8;
1411 second_is_UInt = True;
1412 break;
1414 default:
1415 fn = (HWord)0; break;
1417 if (fn != (HWord)0) {
1418 /* Note: the following assumes all helpers are of signature
1419 ULong fn ( ULong, ULong ), and they are
1420 not marked as regparm functions.
1422 HReg dst = newVRegI(env);
1423 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1424 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1425 if (second_is_UInt)
1426 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1427 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1428 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1429 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1430 mk_RetLoc_simple(RLPri_Int) ));
1431 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1432 return dst;
1435 // Half-float vector conversion
1436 if (e->Iex.Binop.op == Iop_F32toF16x4
1437 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1438 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1439 HReg dstV = newVRegV(env);
1440 HReg dstI = newVRegI(env);
1441 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1442 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1443 set_SSE_rounding_default(env);
1444 addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1445 return dstI;
1448 break;
1451 /* --------- UNARY OP --------- */
1452 case Iex_Unop: {
1454 /* 1Uto8(64to1(expr64)) */
1456 DEFINE_PATTERN( p_1Uto8_64to1,
1457 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1458 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1459 const IRExpr* expr64 = mi.bindee[0];
1460 HReg dst = newVRegI(env);
1461 HReg src = iselIntExpr_R(env, expr64);
1462 addInstr(env, mk_iMOVsd_RR(src,dst) );
1463 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1464 AMD64RMI_Imm(1), dst));
1465 return dst;
1469 /* 8Uto64(LDle(expr64)) */
1471 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1472 unop(Iop_8Uto64,
1473 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1474 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1475 HReg dst = newVRegI(env);
1476 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1477 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1478 return dst;
1482 /* 16Uto64(LDle(expr64)) */
1484 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1485 unop(Iop_16Uto64,
1486 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1487 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1488 HReg dst = newVRegI(env);
1489 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1490 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1491 return dst;
1495 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1496 Use 32 bit arithmetic and let the default zero-extend rule
1497 do the 32Uto64 for free. */
1498 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1499 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1500 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1501 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1502 AMD64AluOp aluOp = Aalu_INVALID;
1503 switch (opi) {
1504 case Iop_Add32: aluOp = Aalu_ADD; break;
1505 case Iop_Sub32: aluOp = Aalu_SUB; break;
1506 case Iop_And32: aluOp = Aalu_AND; break;
1507 case Iop_Or32: aluOp = Aalu_OR; break;
1508 case Iop_Xor32: aluOp = Aalu_XOR; break;
1509 default: break;
1511 if (aluOp != Aalu_INVALID) {
1512 /* For commutative ops we assume any literal values are on
1513 the second operand. */
1514 HReg dst = newVRegI(env);
1515 HReg reg = iselIntExpr_R(env, argL);
1516 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1517 addInstr(env, mk_iMOVsd_RR(reg,dst));
1518 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1519 return dst;
1521 /* just fall through to normal handling for Iop_32Uto64 */
1524 /* Fallback cases */
1525 switch (e->Iex.Unop.op) {
1526 case Iop_32Uto64:
1527 case Iop_32Sto64: {
1528 HReg dst = newVRegI(env);
1529 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1530 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1531 src, dst) );
1532 return dst;
1534 case Iop_128HIto64: {
1535 HReg rHi, rLo;
1536 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1537 return rHi; /* and abandon rLo */
1539 case Iop_128to64: {
1540 HReg rHi, rLo;
1541 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1542 return rLo; /* and abandon rHi */
1544 case Iop_8Uto16:
1545 case Iop_8Uto32:
1546 case Iop_8Uto64:
1547 case Iop_16Uto64:
1548 case Iop_16Uto32: {
1549 HReg dst = newVRegI(env);
1550 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1551 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1552 || e->Iex.Unop.op==Iop_16Uto64 );
1553 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1554 addInstr(env, mk_iMOVsd_RR(src,dst) );
1555 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1556 AMD64RMI_Imm(mask), dst));
1557 return dst;
1559 case Iop_8Sto16:
1560 case Iop_8Sto64:
1561 case Iop_8Sto32:
1562 case Iop_16Sto32:
1563 case Iop_16Sto64: {
1564 HReg dst = newVRegI(env);
1565 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1566 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1567 || e->Iex.Unop.op==Iop_16Sto64 );
1568 UInt amt = srcIs16 ? 48 : 56;
1569 addInstr(env, mk_iMOVsd_RR(src,dst) );
1570 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1571 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1572 return dst;
1574 case Iop_Not8:
1575 case Iop_Not16:
1576 case Iop_Not32:
1577 case Iop_Not64: {
1578 HReg dst = newVRegI(env);
1579 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1580 addInstr(env, mk_iMOVsd_RR(src,dst) );
1581 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1582 return dst;
1584 case Iop_16HIto8:
1585 case Iop_32HIto16:
1586 case Iop_64HIto32: {
1587 HReg dst = newVRegI(env);
1588 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1589 Int shift = 0;
1590 switch (e->Iex.Unop.op) {
1591 case Iop_16HIto8: shift = 8; break;
1592 case Iop_32HIto16: shift = 16; break;
1593 case Iop_64HIto32: shift = 32; break;
1594 default: vassert(0);
1596 addInstr(env, mk_iMOVsd_RR(src,dst) );
1597 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1598 return dst;
1600 case Iop_1Uto64:
1601 case Iop_1Uto32:
1602 case Iop_1Uto8: {
1603 HReg dst = newVRegI(env);
1604 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1605 addInstr(env, AMD64Instr_Set64(cond,dst));
1606 return dst;
1608 case Iop_1Sto8:
1609 case Iop_1Sto16:
1610 case Iop_1Sto32:
1611 case Iop_1Sto64: {
1612 /* could do better than this, but for now ... */
1613 HReg dst = newVRegI(env);
1614 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1615 addInstr(env, AMD64Instr_Set64(cond,dst));
1616 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1617 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1618 return dst;
1620 case Iop_Ctz64: {
1621 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1622 HReg dst = newVRegI(env);
1623 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1624 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1625 return dst;
1627 case Iop_Clz64: {
1628 /* Count leading zeroes. Do 'bsrq' to establish the index
1629 of the highest set bit, and subtract that value from
1630 63. */
1631 HReg tmp = newVRegI(env);
1632 HReg dst = newVRegI(env);
1633 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1635 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1636 AMD64RMI_Imm(63), dst));
1637 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1638 AMD64RMI_Reg(tmp), dst));
1639 return dst;
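/* CmpwNEZ64 below computes (-x | x) >>s 63: the sign bit of (-x | x)
   is set exactly when x is nonzero, and the arithmetic shift
   broadcasts it, yielding 0 for x == 0 and all-ones otherwise. */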
1642 case Iop_CmpwNEZ64: {
1643 HReg dst = newVRegI(env);
1644 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645 addInstr(env, mk_iMOVsd_RR(src,dst));
1646 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1647 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1648 AMD64RMI_Reg(src), dst));
1649 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1650 return dst;
1653 case Iop_CmpwNEZ32: {
1654 HReg src = newVRegI(env);
1655 HReg dst = newVRegI(env);
1656 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1657 addInstr(env, mk_iMOVsd_RR(pre,src));
1658 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1659 addInstr(env, mk_iMOVsd_RR(src,dst));
1660 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1661 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1662 AMD64RMI_Reg(src), dst));
1663 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1664 return dst;
1667 case Iop_Left8:
1668 case Iop_Left16:
1669 case Iop_Left32:
1670 case Iop_Left64: {
1671 HReg dst = newVRegI(env);
1672 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1673 addInstr(env, mk_iMOVsd_RR(src, dst));
1674 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1675 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1676 return dst;
1679 case Iop_V128to32: {
1680 HReg dst = newVRegI(env);
1681 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1682 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1683 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1684 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1685 return dst;
1688 /* V128{HI}to64 */
1689 case Iop_V128to64: {
1690 HReg dst = newVRegI(env);
1691 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1692 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1693 return dst;
1695 case Iop_V128HIto64: {
1696 HReg dst = newVRegI(env);
1697 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1698 HReg vec2 = newVRegV(env);
1699 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1700 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1701 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1702 return dst;
1705 /* V256to64_{3,2,1,0} */
1706 case Iop_V256to64_0: case Iop_V256to64_1:
1707 case Iop_V256to64_2: case Iop_V256to64_3: {
1708 HReg vHi, vLo, vec;
1709 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1710 /* Do the first part of the selection by deciding which of
1711 the 128-bit registers to look at, and the second part using
1712 the same scheme as for V128{HI}to64 above. */
1713 Bool low64of128 = True;
1714 switch (e->Iex.Unop.op) {
1715 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1716 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1717 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1718 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1719 default: vassert(0);
1721 HReg dst = newVRegI(env);
1722 if (low64of128) {
1723 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1724 } else {
1725 HReg vec2 = newVRegV(env);
1726 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1727 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1728 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1730 return dst;
1733 /* ReinterpF64asI64(e) */
1734 /* Given an IEEE754 double, produce an I64 with the same bit
1735 pattern. */
1736 case Iop_ReinterpF64asI64: {
1737 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1738 HReg dst = newVRegI(env);
1739 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1740 /* paranoia */
1741 set_SSE_rounding_default(env);
1742 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1743 addInstr(env, AMD64Instr_Alu64R(
1744 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1745 return dst;
1748 /* ReinterpF32asI32(e) */
1749 /* Given an IEEE754 single, produce an I64 with the same bit
1750 pattern in the lower half. */
1751 case Iop_ReinterpF32asI32: {
1752 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1753 HReg dst = newVRegI(env);
1754 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1755 /* paranoia */
1756 set_SSE_rounding_default(env);
1757 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1758 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1759 return dst;
1762 case Iop_16to8:
1763 case Iop_32to8:
1764 case Iop_64to8:
1765 case Iop_32to16:
1766 case Iop_64to16:
1767 case Iop_64to32:
1768 /* These are no-ops. */
1769 return iselIntExpr_R(env, e->Iex.Unop.arg);
1771 case Iop_GetMSBs8x8: {
1772 /* Note: the following assumes the helper is of
1773 signature
1774 UInt fn ( ULong ), and is not a regparm fn. */
1776 HReg dst = newVRegI(env);
1777 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1778 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1779 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1780 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1781 1, mk_RetLoc_simple(RLPri_Int) ));
1782 /* MovxLQ is not exactly the right thing here. We just
1783 need to get the bottom 8 bits of RAX into dst, and zero
1784 out everything else. Assuming that the helper returns
1785 a UInt with the top 24 bits zeroed out, it'll do,
1786 though. */
1787 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1788 return dst;
1791 case Iop_GetMSBs8x16: {
1792 /* Note: the following assumes the helper is of signature
1793 UInt fn ( ULong w64hi, ULong w64Lo ),
1794 and is not a regparm fn. */
1795 HReg dst = newVRegI(env);
1796 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1797 HReg rsp = hregAMD64_RSP();
1798 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1799 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1800 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1801 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1802 16, vec, m16_rsp));
1803 /* hi 64 bits into RDI -- the first arg */
1804 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1805 AMD64RMI_Mem(m8_rsp),
1806 hregAMD64_RDI() )); /* 1st arg */
1807 /* lo 64 bits into RSI -- the 2nd arg */
1808 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1809 AMD64RMI_Mem(m16_rsp),
1810 hregAMD64_RSI() )); /* 2nd arg */
1811 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1812 2, mk_RetLoc_simple(RLPri_Int) ));
1813 /* MovxLQ is not exactly the right thing here. We just
1814 need to get the bottom 16 bits of RAX into dst, and zero
1815 out everything else. Assuming that the helper returns
1816 a UInt with the top 16 bits zeroed out, it'll do,
1817 though. */
1818 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1819 return dst;
1822 default:
1823 break;
1826 /* Deal with unary 64-bit SIMD ops. */
1827 HWord fn = 0;
1828 switch (e->Iex.Unop.op) {
1829 case Iop_CmpNEZ32x2:
1830 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1831 case Iop_CmpNEZ16x4:
1832 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1833 case Iop_CmpNEZ8x8:
1834 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1835 default:
1836 fn = (HWord)0; break;
1838 if (fn != (HWord)0) {
1839 /* Note: the following assumes all helpers are of
1840 signature
1841 ULong fn ( ULong ), and they are
1842 not marked as regparm functions. */
1844 HReg dst = newVRegI(env);
1845 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1846 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1847 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1848 mk_RetLoc_simple(RLPri_Int) ));
1849 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1850 return dst;
1853 break;
1856 /* --------- GET --------- */
1857 case Iex_Get: {
1858 if (ty == Ity_I64) {
1859 HReg dst = newVRegI(env);
1860 addInstr(env, AMD64Instr_Alu64R(
1861 Aalu_MOV,
1862 AMD64RMI_Mem(
1863 AMD64AMode_IR(e->Iex.Get.offset,
1864 hregAMD64_RBP())),
1865 dst));
1866 return dst;
1868 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1869 HReg dst = newVRegI(env);
1870 addInstr(env, AMD64Instr_LoadEX(
1871 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1872 False,
1873 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1874 dst));
1875 return dst;
1877 break;
1880 case Iex_GetI: {
1881 AMD64AMode* am
1882 = genGuestArrayOffset(
1883 env, e->Iex.GetI.descr,
1884 e->Iex.GetI.ix, e->Iex.GetI.bias );
1885 HReg dst = newVRegI(env);
1886 if (ty == Ity_I8) {
1887 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1888 return dst;
1890 if (ty == Ity_I64) {
1891 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1892 return dst;
1894 break;
1897 /* --------- CCALL --------- */
1898 case Iex_CCall: {
1899 HReg dst = newVRegI(env);
1900 vassert(ty == e->Iex.CCall.retty);
1902 /* be very restrictive for now. Only 64-bit ints allowed for
1903 args, and 64 or 32 bits for return type. */
1904 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1905 goto irreducible;
1907 /* Marshal args, do the call. */
1908 UInt addToSp = 0;
1909 RetLoc rloc = mk_RetLoc_INVALID();
1910 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1911 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1912 vassert(is_sane_RetLoc(rloc));
1913 vassert(rloc.pri == RLPri_Int);
1914 vassert(addToSp == 0);
1916 /* Move to dst, and zero out the top 32 bits if the result type is
1917 Ity_I32. Probably overkill, but still .. */
1918 if (e->Iex.CCall.retty == Ity_I64)
1919 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1920 else
1921 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1923 return dst;
1926 /* --------- LITERAL --------- */
1927 /* 64/32/16/8-bit literals */
1928 case Iex_Const:
1929 if (ty == Ity_I64) {
1930 HReg r = newVRegI(env);
1931 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1932 return r;
1933 } else {
1934 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1935 HReg r = newVRegI(env);
1936 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1937 return r;
1940 /* --------- MULTIPLEX --------- */
1941 case Iex_ITE: { // VFD
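      /* Evaluate both arms, start with the iftrue value in dst, and
         conditionally overwrite it with the iffalse value; cc ^ 1 is the
         complementary (condition-is-false) condition code. */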
1942 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1943 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1944 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1945 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1946 HReg dst = newVRegI(env);
1947 addInstr(env, mk_iMOVsd_RR(r1,dst));
1948 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1949 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1950 return dst;
1952 break;
1955 /* --------- TERNARY OP --------- */
1956 case Iex_Triop: {
1957 IRTriop *triop = e->Iex.Triop.details;
1958 /* C3210 flags following FPU partial remainder (fprem), both
1959 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1960 if (triop->op == Iop_PRemC3210F64
1961 || triop->op == Iop_PRem1C3210F64) {
1962 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1963 HReg arg1 = iselDblExpr(env, triop->arg2);
1964 HReg arg2 = iselDblExpr(env, triop->arg3);
1965 HReg dst = newVRegI(env);
1966 addInstr(env, AMD64Instr_A87Free(2));
1968 /* one arg -> top of x87 stack */
1969 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1970 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1972 /* other arg -> top of x87 stack */
1973 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1974 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1976 switch (triop->op) {
1977 case Iop_PRemC3210F64:
1978 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1979 break;
1980 case Iop_PRem1C3210F64:
1981 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1982 break;
1983 default:
1984 vassert(0);
1986 /* Ignore the result, and instead make off with the FPU's
1987 C3210 flags (in the status word). */
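         /* Store the x87 status word and mask it with 0x4700, keeping
            only C0..C2 (bits 8..10) and C3 (bit 14). */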
1988 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1989 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1990 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1991 return dst;
1993 break;
1996 default:
1997 break;
1998 } /* switch (e->tag) */
2000 /* We get here if no pattern matched. */
2001 irreducible:
2002 ppIRExpr(e);
2003 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2007 /*---------------------------------------------------------*/
2008 /*--- ISEL: Integer expression auxiliaries ---*/
2009 /*---------------------------------------------------------*/
2011 /* --------------------- AMODEs --------------------- */
2013 /* Return an AMode which computes the value of the specified
2014 expression, possibly also adding insns to the code list as a
2015 result. The expression may only be a 64-bit one. */
2018 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2020 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2021 vassert(sane_AMode(am));
2022 return am;
2025 /* DO NOT CALL THIS DIRECTLY ! */
2026 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2028 MatchInfo mi;
2029 DECLARE_PATTERN(p_complex);
2030 IRType ty = typeOfIRExpr(env->type_env,e);
2031 vassert(ty == Ity_I64);
2033 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2034 /* bind0 bind1 bind2 bind3 */
2035 DEFINE_PATTERN(p_complex,
2036 binop( Iop_Add64,
2037 binop( Iop_Add64,
2038 bind(0),
2039 binop(Iop_Shl64, bind(1), bind(2))
2041 bind(3)
2044 if (matchIRExpr(&mi, p_complex, e)) {
2045 const IRExpr* expr1 = mi.bindee[0];
2046 const IRExpr* expr2 = mi.bindee[1];
2047 const IRExpr* imm8 = mi.bindee[2];
2048 const IRExpr* simm32 = mi.bindee[3];
2049 if (imm8->tag == Iex_Const
2050 && imm8->Iex.Const.con->tag == Ico_U8
2051 && imm8->Iex.Const.con->Ico.U8 < 4
2052 /* imm8 is OK, now check simm32 */
2053 && simm32->tag == Iex_Const
2054 && simm32->Iex.Const.con->tag == Ico_U64
2055 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2056 UInt shift = imm8->Iex.Const.con->Ico.U8;
2057 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2058 HReg r1 = iselIntExpr_R(env, expr1);
2059 HReg r2 = iselIntExpr_R(env, expr2);
2060 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2061 return AMD64AMode_IRRS(offset, r1, r2, shift);
2065 /* Add64(expr1, Shl64(expr2, imm)) */
2066 if (e->tag == Iex_Binop
2067 && e->Iex.Binop.op == Iop_Add64
2068 && e->Iex.Binop.arg2->tag == Iex_Binop
2069 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2070 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2071 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2072 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2073 if (shift == 1 || shift == 2 || shift == 3) {
2074 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2075 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2076 return AMD64AMode_IRRS(0, r1, r2, shift);
2080 /* Add64(expr,i) */
2081 if (e->tag == Iex_Binop
2082 && e->Iex.Binop.op == Iop_Add64
2083 && e->Iex.Binop.arg2->tag == Iex_Const
2084 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2085 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2086 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2087 return AMD64AMode_IR(
2088 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2093 /* Doesn't match anything in particular. Generate it into
2094 a register and use that. */
2096 HReg r1 = iselIntExpr_R(env, e);
2097 return AMD64AMode_IR(0, r1);
2102 /* --------------------- RMIs --------------------- */
2104 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2105 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2107 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2109 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2110 /* sanity checks ... */
2111 switch (rmi->tag) {
2112 case Armi_Imm:
2113 return rmi;
2114 case Armi_Reg:
2115 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2116 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2117 return rmi;
2118 case Armi_Mem:
2119 vassert(sane_AMode(rmi->Armi.Mem.am));
2120 return rmi;
2121 default:
2122 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2126 /* DO NOT CALL THIS DIRECTLY ! */
2127 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2129 IRType ty = typeOfIRExpr(env->type_env,e);
2130 vassert(ty == Ity_I64 || ty == Ity_I32
2131 || ty == Ity_I16 || ty == Ity_I8);
2133 /* special case: immediate 64/32/16/8 */
2134 if (e->tag == Iex_Const) {
2135 switch (e->Iex.Const.con->tag) {
2136 case Ico_U64:
2137 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2138 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2140 break;
2141 case Ico_U32:
2142 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2143 case Ico_U16:
2144 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2145 case Ico_U8:
2146 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2147 default:
2148 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2152 /* special case: 64-bit GET */
2153 if (e->tag == Iex_Get && ty == Ity_I64) {
2154 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2155 hregAMD64_RBP()));
2158 /* special case: 64-bit load from memory */
2159 if (e->tag == Iex_Load && ty == Ity_I64
2160 && e->Iex.Load.end == Iend_LE) {
2161 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2162 return AMD64RMI_Mem(am);
2165 /* default case: calculate into a register and return that */
2167 HReg r = iselIntExpr_R ( env, e );
2168 return AMD64RMI_Reg(r);
2173 /* --------------------- RIs --------------------- */
2175 /* Calculate an expression into an AMD64RI operand. As with
2176 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2177 bits. */
2179 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2181 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2182 /* sanity checks ... */
2183 switch (ri->tag) {
2184 case Ari_Imm:
2185 return ri;
2186 case Ari_Reg:
2187 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2188 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2189 return ri;
2190 default:
2191 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2195 /* DO NOT CALL THIS DIRECTLY ! */
2196 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2198 IRType ty = typeOfIRExpr(env->type_env,e);
2199 vassert(ty == Ity_I64 || ty == Ity_I32
2200 || ty == Ity_I16 || ty == Ity_I8);
2202 /* special case: immediate */
2203 if (e->tag == Iex_Const) {
2204 switch (e->Iex.Const.con->tag) {
2205 case Ico_U64:
2206 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2207 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2209 break;
2210 case Ico_U32:
2211 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2212 case Ico_U16:
2213 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2214 case Ico_U8:
2215 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2216 default:
2217 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2221 /* default case: calculate into a register and return that */
2223 HReg r = iselIntExpr_R ( env, e );
2224 return AMD64RI_Reg(r);
2229 /* --------------------- RMs --------------------- */
2231 /* Similarly, calculate an expression into an AMD64RM operand. As
2232 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2233 bits. */
2235 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2237 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2238 /* sanity checks ... */
2239 switch (rm->tag) {
2240 case Arm_Reg:
2241 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2242 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2243 return rm;
2244 case Arm_Mem:
2245 vassert(sane_AMode(rm->Arm.Mem.am));
2246 return rm;
2247 default:
2248 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2252 /* DO NOT CALL THIS DIRECTLY ! */
2253 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2255 IRType ty = typeOfIRExpr(env->type_env,e);
2256 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2258 /* special case: 64-bit GET */
2259 if (e->tag == Iex_Get && ty == Ity_I64) {
2260 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2261 hregAMD64_RBP()));
2264 /* special case: load from memory */
2266 /* default case: calculate into a register and return that */
2268 HReg r = iselIntExpr_R ( env, e );
2269 return AMD64RM_Reg(r);
2274 /* --------------------- CONDCODE --------------------- */
2276 /* Generate code to evaluate a bit-typed expression, returning the
2277 condition code which would be set if the expression had notionally
2278 evaluated to 1. */
2280 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2282 /* Uh, there's nothing we can sanity check here, unfortunately. */
2283 return iselCondCode_wrk(env,e);
2286 /* DO NOT CALL THIS DIRECTLY ! */
2287 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2289 vassert(e);
2290 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2292 /* var */
2293 if (e->tag == Iex_RdTmp) {
2294 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2295 addInstr(env, AMD64Instr_Test64(1,r64));
2296 return Acc_NZ;
2299 /* Constant 1:Bit */
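   /* There is no always/never condition code as such, so fabricate one:
      xor r,r is guaranteed to set the Z flag, which makes Acc_Z an
      always-true condition and Acc_NZ an always-false one. */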
2300 if (e->tag == Iex_Const) {
2301 HReg r;
2302 vassert(e->Iex.Const.con->tag == Ico_U1);
2303 vassert(e->Iex.Const.con->Ico.U1 == True
2304 || e->Iex.Const.con->Ico.U1 == False);
2305 r = newVRegI(env);
2306 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2307 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2308 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2311 /* Not1(...) */
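   /* amd64 condition codes come in complementary pairs differing only in
      bit 0, so flipping that bit negates the condition. */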
2312 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2313 /* Generate code for the arg, and negate the test condition */
2314 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2317 /* --- patterns rooted at: 64to1 --- */
2319 /* 64to1 */
2320 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2321 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2322 addInstr(env, AMD64Instr_Test64(1,reg));
2323 return Acc_NZ;
2326 /* --- patterns rooted at: 32to1 --- */
2328 /* 32to1 */
2329 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2330 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2331 addInstr(env, AMD64Instr_Test64(1,reg));
2332 return Acc_NZ;
2335 /* --- patterns rooted at: CmpNEZ8 --- */
2337 /* CmpNEZ8(x) */
2338 if (e->tag == Iex_Unop
2339 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2340 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2341 addInstr(env, AMD64Instr_Test64(0xFF,r));
2342 return Acc_NZ;
2345 /* --- patterns rooted at: CmpNEZ16 --- */
2347 /* CmpNEZ16(x) */
2348 if (e->tag == Iex_Unop
2349 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2350 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2351 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2352 return Acc_NZ;
2355 /* --- patterns rooted at: CmpNEZ32 --- */
2357 if (e->tag == Iex_Unop
2358 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2359 IRExpr* arg = e->Iex.Unop.arg;
2360 if (arg->tag == Iex_Binop
2361 && (arg->Iex.Binop.op == Iop_Or32
2362 || arg->Iex.Binop.op == Iop_And32)) {
2363 /* CmpNEZ32(Or32(x,y)) */
2364 /* CmpNEZ32(And32(x,y)) */
2365 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2366 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2367 HReg tmp = newVRegI(env);
2368 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2369 addInstr(env, AMD64Instr_Alu32R(
2370 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2371 rmi1, tmp));
2372 return Acc_NZ;
2374 /* CmpNEZ32(x) */
2375 HReg r1 = iselIntExpr_R(env, arg);
2376 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2377 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2378 return Acc_NZ;
2381 /* --- patterns rooted at: CmpNEZ64 --- */
2383 if (e->tag == Iex_Unop
2384 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2385 IRExpr* arg = e->Iex.Unop.arg;
2386 if (arg->tag == Iex_Binop
2387 && (arg->Iex.Binop.op == Iop_Or64
2388 || arg->Iex.Binop.op == Iop_And64)) {
2389 /* CmpNEZ64(Or64(x,y)) */
2390 /* CmpNEZ64(And64(x,y)) */
2391 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2392 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2393 HReg tmp = newVRegI(env);
2394 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2395 addInstr(env, AMD64Instr_Alu64R(
2396 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2397 rmi1, tmp));
2398 return Acc_NZ;
2400 /* CmpNEZ64(x) */
2401 HReg r1 = iselIntExpr_R(env, arg);
2402 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2403 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2404 return Acc_NZ;
2407 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2409 /* CmpEQ8 / CmpNE8 */
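   /* A comparison against zero only needs a TEST of the low byte.
      Otherwise XOR the operands and mask to 8 bits, so that Z ends up set
      iff the low bytes were equal. */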
2410 if (e->tag == Iex_Binop
2411 && (e->Iex.Binop.op == Iop_CmpEQ8
2412 || e->Iex.Binop.op == Iop_CmpNE8
2413 || e->Iex.Binop.op == Iop_CasCmpEQ8
2414 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2415 if (isZeroU8(e->Iex.Binop.arg2)) {
2416 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2417 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2418 switch (e->Iex.Binop.op) {
2419 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2420 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2421 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2423 } else {
2424 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2425 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2426 HReg r = newVRegI(env);
2427 addInstr(env, mk_iMOVsd_RR(r1,r));
2428 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2429 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2430 switch (e->Iex.Binop.op) {
2431 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2432 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2433 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2438 /* CmpEQ16 / CmpNE16 */
2439 if (e->tag == Iex_Binop
2440 && (e->Iex.Binop.op == Iop_CmpEQ16
2441 || e->Iex.Binop.op == Iop_CmpNE16
2442 || e->Iex.Binop.op == Iop_CasCmpEQ16
2443 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2444 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2445 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2446 HReg r = newVRegI(env);
2447 addInstr(env, mk_iMOVsd_RR(r1,r));
2448 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2449 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2450 switch (e->Iex.Binop.op) {
2451 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2452 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2453 default: vpanic("iselCondCode(amd64): CmpXX16");
2457 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2458 Saves a "movq %rax, %tmp" compared to the default route. */
2459 if (e->tag == Iex_Binop
2460 && e->Iex.Binop.op == Iop_CmpNE64
2461 && e->Iex.Binop.arg1->tag == Iex_CCall
2462 && e->Iex.Binop.arg2->tag == Iex_Const) {
2463 IRExpr* cal = e->Iex.Binop.arg1;
2464 IRExpr* con = e->Iex.Binop.arg2;
2465 HReg tmp = newVRegI(env);
2466 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2467 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2468 vassert(con->Iex.Const.con->tag == Ico_U64);
2469 /* Marshal args, do the call. */
2470 UInt addToSp = 0;
2471 RetLoc rloc = mk_RetLoc_INVALID();
2472 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2473 cal->Iex.CCall.cee,
2474 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2475 vassert(is_sane_RetLoc(rloc));
2476 vassert(rloc.pri == RLPri_Int);
2477 vassert(addToSp == 0);
2478 /* */
2479 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2480 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2481 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2482 return Acc_NZ;
2485 /* Cmp*64*(x,y) */
2486 if (e->tag == Iex_Binop
2487 && (e->Iex.Binop.op == Iop_CmpEQ64
2488 || e->Iex.Binop.op == Iop_CmpNE64
2489 || e->Iex.Binop.op == Iop_CmpLT64S
2490 || e->Iex.Binop.op == Iop_CmpLT64U
2491 || e->Iex.Binop.op == Iop_CmpLE64S
2492 || e->Iex.Binop.op == Iop_CmpLE64U
2493 || e->Iex.Binop.op == Iop_CasCmpEQ64
2494 || e->Iex.Binop.op == Iop_CasCmpNE64
2495 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2496 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2497 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2498 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2499 switch (e->Iex.Binop.op) {
2500 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2501 case Iop_CmpNE64:
2502 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2503 case Iop_CmpLT64S: return Acc_L;
2504 case Iop_CmpLT64U: return Acc_B;
2505 case Iop_CmpLE64S: return Acc_LE;
2506 case Iop_CmpLE64U: return Acc_BE;
2507 default: vpanic("iselCondCode(amd64): CmpXX64");
2511 /* Cmp*32*(x,y) */
2512 if (e->tag == Iex_Binop
2513 && (e->Iex.Binop.op == Iop_CmpEQ32
2514 || e->Iex.Binop.op == Iop_CmpNE32
2515 || e->Iex.Binop.op == Iop_CmpLT32S
2516 || e->Iex.Binop.op == Iop_CmpLT32U
2517 || e->Iex.Binop.op == Iop_CmpLE32S
2518 || e->Iex.Binop.op == Iop_CmpLE32U
2519 || e->Iex.Binop.op == Iop_CasCmpEQ32
2520 || e->Iex.Binop.op == Iop_CasCmpNE32
2521 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2522 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2523 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2524 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2525 switch (e->Iex.Binop.op) {
2526 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2527 case Iop_CmpNE32:
2528 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2529 case Iop_CmpLT32S: return Acc_L;
2530 case Iop_CmpLT32U: return Acc_B;
2531 case Iop_CmpLE32S: return Acc_LE;
2532 case Iop_CmpLE32U: return Acc_BE;
2533 default: vpanic("iselCondCode(amd64): CmpXX32");
2537 /* And1(x,y), Or1(x,y) */
2538 /* FIXME: We could (and probably should) do a lot better here. If both args
2539 are in temps already then we can just emit a reg-reg And/Or directly,
2540 followed by the final Test. */
2541 if (e->tag == Iex_Binop
2542 && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) {
2543 // We could probably be cleverer about this. In the meantime ..
2544 HReg x_as_64 = newVRegI(env);
2545 AMD64CondCode cc_x = iselCondCode(env, e->Iex.Binop.arg1);
2546 addInstr(env, AMD64Instr_Set64(cc_x, x_as_64));
2547 HReg y_as_64 = newVRegI(env);
2548 AMD64CondCode cc_y = iselCondCode(env, e->Iex.Binop.arg2);
2549 addInstr(env, AMD64Instr_Set64(cc_y, y_as_64));
2550 AMD64AluOp aop = e->Iex.Binop.op == Iop_And1 ? Aalu_AND : Aalu_OR;
2551 addInstr(env, AMD64Instr_Alu64R(aop, AMD64RMI_Reg(x_as_64), y_as_64));
2552 addInstr(env, AMD64Instr_Test64(1, y_as_64));
2553 return Acc_NZ;
2556 ppIRExpr(e);
2557 vpanic("iselCondCode(amd64)");
2561 /*---------------------------------------------------------*/
2562 /*--- ISEL: Integer expressions (128 bit) ---*/
2563 /*---------------------------------------------------------*/
2565 /* Compute a 128-bit value into a register pair, which is returned as
2566 the first two parameters. As with iselIntExpr_R, these may be
2567 either real or virtual regs; in any case they must not be changed
2568 by subsequent code emitted by the caller. */
2570 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2571 ISelEnv* env, const IRExpr* e )
2573 iselInt128Expr_wrk(rHi, rLo, env, e);
2574 # if 0
2575 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2576 # endif
2577 vassert(hregClass(*rHi) == HRcInt64);
2578 vassert(hregIsVirtual(*rHi));
2579 vassert(hregClass(*rLo) == HRcInt64);
2580 vassert(hregIsVirtual(*rLo));
2583 /* DO NOT CALL THIS DIRECTLY ! */
2584 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2585 ISelEnv* env, const IRExpr* e )
2587 vassert(e);
2588 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2590 /* read 128-bit IRTemp */
2591 if (e->tag == Iex_RdTmp) {
2592 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2593 return;
2596 /* --------- BINARY ops --------- */
2597 if (e->tag == Iex_Binop) {
2598 switch (e->Iex.Binop.op) {
2599 /* 64 x 64 -> 128 multiply */
2600 case Iop_MullU64:
2601 case Iop_MullS64: {
2602 /* get one operand into %rax, and the other into a R/M.
2603 Need to make an educated guess about which operand is better
2604 placed where. */
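            /* mulq/imulq multiply %rax by the R/M operand and leave the
               128-bit product in %rdx:%rax. */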
2605 HReg tLo = newVRegI(env);
2606 HReg tHi = newVRegI(env);
2607 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2608 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2609 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2610 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2611 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2612 /* Result is now in RDX:RAX. Tell the caller. */
2613 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2614 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2615 *rHi = tHi;
2616 *rLo = tLo;
2617 return;
2620 /* 128 x 64 -> (64(rem),64(div)) division */
2621 case Iop_DivModU128to64:
2622 case Iop_DivModS128to64: {
2623 /* Get the 128-bit operand into rdx:rax, and the other into
2624 any old R/M. */
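            /* divq/idivq divide the 128-bit value in %rdx:%rax by the R/M
               operand, leaving the quotient in %rax and the remainder in
               %rdx. */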
2625 HReg sHi, sLo;
2626 HReg tLo = newVRegI(env);
2627 HReg tHi = newVRegI(env);
2628 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2629 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2630 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2631 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2632 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2633 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2634 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2635 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2636 *rHi = tHi;
2637 *rLo = tLo;
2638 return;
2641 /* 64HLto128(e1,e2) */
2642 case Iop_64HLto128:
2643 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2644 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2645 return;
2647 default:
2648 break;
2650 } /* if (e->tag == Iex_Binop) */
2652 ppIRExpr(e);
2653 vpanic("iselInt128Expr");
2657 /*---------------------------------------------------------*/
2658 /*--- ISEL: Floating point expressions (32 bit) ---*/
2659 /*---------------------------------------------------------*/
2661 /* Nothing interesting here; really just wrappers for
2662 64-bit stuff. */
2664 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2666 HReg r = iselFltExpr_wrk( env, e );
2667 # if 0
2668 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2669 # endif
2670 vassert(hregClass(r) == HRcVec128);
2671 vassert(hregIsVirtual(r));
2672 return r;
2675 /* DO NOT CALL THIS DIRECTLY */
2676 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2678 IRType ty = typeOfIRExpr(env->type_env,e);
2679 vassert(ty == Ity_F32);
2681 if (e->tag == Iex_RdTmp) {
2682 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2685 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2686 AMD64AMode* am;
2687 HReg res = newVRegV(env);
2688 vassert(e->Iex.Load.ty == Ity_F32);
2689 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2690 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2691 return res;
2694 if (e->tag == Iex_Binop
2695 && e->Iex.Binop.op == Iop_F64toF32) {
2696 /* Although the result is still held in a standard SSE register,
2697 we need to round it to reflect the loss of accuracy/range
2698 entailed in casting it to a 32-bit float. */
2699 HReg dst = newVRegV(env);
2700 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2701 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2702 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2703 set_SSE_rounding_default( env );
2704 return dst;
2707 if (e->tag == Iex_Get) {
2708 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2709 hregAMD64_RBP() );
2710 HReg res = newVRegV(env);
2711 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2712 return res;
2715 if (e->tag == Iex_Unop
2716 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2717 /* Given an I32, produce an IEEE754 float with the same bit
2718 pattern. */
2719 HReg dst = newVRegV(env);
2720 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2721 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2722 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2723 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2724 return dst;
2727 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2728 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2729 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2730 HReg dst = newVRegV(env);
2732 /* arg now holds the value to be rounded. The first thing to do
2733 is set the FPU's rounding mode accordingly. */
2735 /* Set host x87 rounding mode */
2736 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2738 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2739 addInstr(env, AMD64Instr_A87Free(1));
2740 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2741 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2742 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2743 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2745 /* Restore default x87 rounding. */
2746 set_FPU_rounding_default( env );
2748 return dst;
2751 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2752 /* Sigh ... very rough code. Could do much better. */
2753 /* Get the 128-bit literal 00---0 10---0 into a register
2754 and xor it with the value to be negated. */
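      /* Pushing 0 and then (1 << 31) builds, at (%rsp), a 128-bit constant
         with only bit 31 set; XORing it in flips just the sign bit of the
         float in lane 0. */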
2755 HReg r1 = newVRegI(env);
2756 HReg dst = newVRegV(env);
2757 HReg tmp = newVRegV(env);
2758 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2759 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2760 addInstr(env, mk_vMOVsd_RR(src,tmp));
2761 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2762 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2763 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2764 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2765 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2766 add_to_rsp(env, 16);
2767 return dst;
2770 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2771 IRQop *qop = e->Iex.Qop.details;
2772 HReg dst = newVRegV(env);
2773 HReg argX = iselFltExpr(env, qop->arg2);
2774 HReg argY = iselFltExpr(env, qop->arg3);
2775 HReg argZ = iselFltExpr(env, qop->arg4);
2776 /* XXXROUNDINGFIXME */
2777 /* set roundingmode here */
2778 /* subq $16, %rsp -- make a space*/
2779 sub_from_rsp(env, 16);
2780 /* Prepare 4 arg regs:
2781 leaq 0(%rsp), %rdi
2782 leaq 4(%rsp), %rsi
2783 leaq 8(%rsp), %rdx
2784 leaq 12(%rsp), %rcx */
2786 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2787 hregAMD64_RDI()));
2788 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2789 hregAMD64_RSI()));
2790 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2791 hregAMD64_RDX()));
2792 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2793 hregAMD64_RCX()));
2794 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2795 movss %argX, 0(%rsi)
2796 movss %argY, 0(%rdx)
2797 movss %argZ, 0(%rcx) */
2799 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2800 AMD64AMode_IR(0, hregAMD64_RSI())));
2801 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2802 AMD64AMode_IR(0, hregAMD64_RDX())));
2803 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2804 AMD64AMode_IR(0, hregAMD64_RCX())));
2805 /* call the helper */
2806 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2807 (ULong)(HWord)h_generic_calc_MAddF32,
2808 4, mk_RetLoc_simple(RLPri_None) ));
2809 /* fetch the result from memory at 0(%rsp), which is where the
2810 helper was asked to write it. */
2811 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2812 AMD64AMode_IR(0, hregAMD64_RSP())));
2813 /* and finally, clear the space */
2814 add_to_rsp(env, 16);
2815 return dst;
2818 ppIRExpr(e);
2819 vpanic("iselFltExpr_wrk");
2823 /*---------------------------------------------------------*/
2824 /*--- ISEL: Floating point expressions (64 bit) ---*/
2825 /*---------------------------------------------------------*/
2827 /* Compute a 64-bit floating point value into the lower half of an xmm
2828 register, the identity of which is returned. As with
2829 iselIntExpr_R, the returned reg will be virtual, and it must not be
2830 changed by subsequent code emitted by the caller. */
2833 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2835 Type S (1 bit) E (11 bits) F (52 bits)
2836 ---- --------- ----------- -----------
2837 signalling NaN u 2047 (max) .0uuuuu---u
2838 (with at least
2839 one 1 bit)
2840 quiet NaN u 2047 (max) .1uuuuu---u
2842 negative infinity 1 2047 (max) .000000---0
2844 positive infinity 0 2047 (max) .000000---0
2846 negative zero 1 0 .000000---0
2848 positive zero 0 0 .000000---0 */
2851 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2853 HReg r = iselDblExpr_wrk( env, e );
2854 # if 0
2855 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2856 # endif
2857 vassert(hregClass(r) == HRcVec128);
2858 vassert(hregIsVirtual(r));
2859 return r;
2862 /* DO NOT CALL THIS DIRECTLY */
2863 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2865 IRType ty = typeOfIRExpr(env->type_env,e);
2866 vassert(e);
2867 vassert(ty == Ity_F64);
2869 if (e->tag == Iex_RdTmp) {
2870 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2873 if (e->tag == Iex_Const) {
2874 union { ULong u64; Double f64; } u;
2875 HReg res = newVRegV(env);
2876 HReg tmp = newVRegI(env);
2877 vassert(sizeof(u) == 8);
2878 vassert(sizeof(u.u64) == 8);
2879 vassert(sizeof(u.f64) == 8);
2881 if (e->Iex.Const.con->tag == Ico_F64) {
2882 u.f64 = e->Iex.Const.con->Ico.F64;
2884 else if (e->Iex.Const.con->tag == Ico_F64i) {
2885 u.u64 = e->Iex.Const.con->Ico.F64i;
2887 else
2888 vpanic("iselDblExpr(amd64): const");
2890 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2891 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2892 addInstr(env, AMD64Instr_SseLdSt(
2893 True/*load*/, 8, res,
2894 AMD64AMode_IR(0, hregAMD64_RSP())
2896 add_to_rsp(env, 8);
2897 return res;
2900 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2901 AMD64AMode* am;
2902 HReg res = newVRegV(env);
2903 vassert(e->Iex.Load.ty == Ity_F64);
2904 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2905 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2906 return res;
2909 if (e->tag == Iex_Get) {
2910 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2911 hregAMD64_RBP() );
2912 HReg res = newVRegV(env);
2913 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2914 return res;
2917 if (e->tag == Iex_GetI) {
2918 AMD64AMode* am
2919 = genGuestArrayOffset(
2920 env, e->Iex.GetI.descr,
2921 e->Iex.GetI.ix, e->Iex.GetI.bias );
2922 HReg res = newVRegV(env);
2923 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2924 return res;
2927 if (e->tag == Iex_Triop) {
2928 IRTriop *triop = e->Iex.Triop.details;
2929 AMD64SseOp op = Asse_INVALID;
2930 switch (triop->op) {
2931 case Iop_AddF64: op = Asse_ADDF; break;
2932 case Iop_SubF64: op = Asse_SUBF; break;
2933 case Iop_MulF64: op = Asse_MULF; break;
2934 case Iop_DivF64: op = Asse_DIVF; break;
2935 default: break;
2937 if (op != Asse_INVALID) {
2938 HReg dst = newVRegV(env);
2939 HReg argL = iselDblExpr(env, triop->arg2);
2940 HReg argR = iselDblExpr(env, triop->arg3);
2941 addInstr(env, mk_vMOVsd_RR(argL, dst));
2942 /* XXXROUNDINGFIXME */
2943 /* set roundingmode here */
2944 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2945 return dst;
2949 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2950 IRQop *qop = e->Iex.Qop.details;
2951 HReg dst = newVRegV(env);
2952 HReg argX = iselDblExpr(env, qop->arg2);
2953 HReg argY = iselDblExpr(env, qop->arg3);
2954 HReg argZ = iselDblExpr(env, qop->arg4);
2955 /* XXXROUNDINGFIXME */
2956 /* set roundingmode here */
2957 /* subq $32, %rsp -- make a space*/
2958 sub_from_rsp(env, 32);
2959 /* Prepare 4 arg regs:
2960 leaq 0(%rsp), %rdi
2961 leaq 8(%rsp), %rsi
2962 leaq 16(%rsp), %rdx
2963 leaq 24(%rsp), %rcx */
2965 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2966 hregAMD64_RDI()));
2967 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2968 hregAMD64_RSI()));
2969 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2970 hregAMD64_RDX()));
2971 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2972 hregAMD64_RCX()));
2973 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2974 movsd %argX, 0(%rsi)
2975 movsd %argY, 0(%rdx)
2976 movsd %argZ, 0(%rcx) */
2978 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2979 AMD64AMode_IR(0, hregAMD64_RSI())));
2980 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2981 AMD64AMode_IR(0, hregAMD64_RDX())));
2982 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2983 AMD64AMode_IR(0, hregAMD64_RCX())));
2984 /* call the helper */
2985 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2986 (ULong)(HWord)h_generic_calc_MAddF64,
2987 4, mk_RetLoc_simple(RLPri_None) ));
2988 /* fetch the result from memory at 0(%rsp), which is where the
2989 helper was asked to write it. */
2990 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2991 AMD64AMode_IR(0, hregAMD64_RSP())));
2992 /* and finally, clear the space */
2993 add_to_rsp(env, 32);
2994 return dst;
2997 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2998 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2999 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3000 HReg dst = newVRegV(env);
3002 /* arg now holds the value to be rounded. The first thing to do
3003 is set the FPU's rounding mode accordingly. */
3005 /* Set host x87 rounding mode */
3006 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
3008 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3009 addInstr(env, AMD64Instr_A87Free(1));
3010 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3011 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
3012 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3013 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3015 /* Restore default x87 rounding. */
3016 set_FPU_rounding_default( env );
3018 return dst;
3021 IRTriop *triop = e->Iex.Triop.details;
3022 if (e->tag == Iex_Triop
3023 && (triop->op == Iop_ScaleF64
3024 || triop->op == Iop_AtanF64
3025 || triop->op == Iop_Yl2xF64
3026 || triop->op == Iop_Yl2xp1F64
3027 || triop->op == Iop_PRemF64
3028 || triop->op == Iop_PRem1F64)
3030 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3031 HReg arg1 = iselDblExpr(env, triop->arg2);
3032 HReg arg2 = iselDblExpr(env, triop->arg3);
3033 HReg dst = newVRegV(env);
3034 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3035 || triop->op == Iop_PRemF64
3036 || triop->op == Iop_PRem1F64);
3037 addInstr(env, AMD64Instr_A87Free(2));
3039 /* one arg -> top of x87 stack */
3040 addInstr(env, AMD64Instr_SseLdSt(
3041 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3042 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3044 /* other arg -> top of x87 stack */
3045 addInstr(env, AMD64Instr_SseLdSt(
3046 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3047 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3049 /* do it */
3050 /* XXXROUNDINGFIXME */
3051 /* set roundingmode here */
3052 switch (triop->op) {
3053 case Iop_ScaleF64:
3054 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3055 break;
3056 case Iop_AtanF64:
3057 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3058 break;
3059 case Iop_Yl2xF64:
3060 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3061 break;
3062 case Iop_Yl2xp1F64:
3063 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3064 break;
3065 case Iop_PRemF64:
3066 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3067 break;
3068 case Iop_PRem1F64:
3069 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3070 break;
3071 default:
3072 vassert(0);
3075 /* save result */
3076 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3077 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3078 return dst;
3081 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3082 HReg dst = newVRegV(env);
3083 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3084 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3085 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3086 set_SSE_rounding_default( env );
3087 return dst;
3090 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3091 HReg dst = newVRegV(env);
3092 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3093 set_SSE_rounding_default( env );
3094 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3095 return dst;
3098 if (e->tag == Iex_Unop
3099 && (e->Iex.Unop.op == Iop_NegF64
3100 || e->Iex.Unop.op == Iop_AbsF64)) {
3101 /* Sigh ... very rough code. Could do much better. */
3102 /* Get the 128-bit literal 00---0 10---0 into a register
3103 and xor/nand it with the value to be negated. */
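      /* The 128-bit constant built here has only bit 63 set.  XOR with it
         flips the sign bit (negation); ANDN computes ~constant & value and
         so clears the sign bit (absolute value). */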
3104 HReg r1 = newVRegI(env);
3105 HReg dst = newVRegV(env);
3106 HReg tmp = newVRegV(env);
3107 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3108 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3109 addInstr(env, mk_vMOVsd_RR(src,tmp));
3110 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3111 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3112 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3113 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3115 if (e->Iex.Unop.op == Iop_NegF64)
3116 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3117 else
3118 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3120 add_to_rsp(env, 16);
3121 return dst;
3124 if (e->tag == Iex_Binop) {
3125 A87FpOp fpop = Afp_INVALID;
3126 switch (e->Iex.Binop.op) {
3127 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3128 case Iop_SinF64: fpop = Afp_SIN; break;
3129 case Iop_CosF64: fpop = Afp_COS; break;
3130 case Iop_TanF64: fpop = Afp_TAN; break;
3131 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3132 default: break;
3134 if (fpop != Afp_INVALID) {
3135 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3136 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3137 HReg dst = newVRegV(env);
3138 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3139 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3140 addInstr(env, AMD64Instr_A87Free(nNeeded));
3141 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3142 /* XXXROUNDINGFIXME */
3143 /* set roundingmode here */
3144 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3145 codes. I don't think that matters, since this insn
3146 selector never generates such an instruction intervening
3147 between a flag-setting instruction and a flag-using
3148 instruction. */
3149 addInstr(env, AMD64Instr_A87FpOp(fpop));
3150 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3151 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3152 return dst;
3156 if (e->tag == Iex_Unop) {
3157 switch (e->Iex.Unop.op) {
3158 //.. case Iop_I32toF64: {
3159 //.. HReg dst = newVRegF(env);
3160 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3161 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3162 //.. set_FPU_rounding_default(env);
3163 //.. addInstr(env, X86Instr_FpLdStI(
3164 //.. True/*load*/, 4, dst,
3165 //.. X86AMode_IR(0, hregX86_ESP())));
3166 //.. add_to_esp(env, 4);
3167 //.. return dst;
3168 //.. }
3169 case Iop_ReinterpI64asF64: {
3170 /* Given an I64, produce an IEEE754 double with the same
3171 bit pattern. */
3172 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3173 HReg dst = newVRegV(env);
3174 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3175 /* paranoia */
3176 set_SSE_rounding_default(env);
3177 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3178 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3179 return dst;
3181 case Iop_F32toF64: {
3182 HReg f32;
3183 HReg f64 = newVRegV(env);
3184 /* this shouldn't be necessary, but be paranoid ... */
3185 set_SSE_rounding_default(env);
3186 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3187 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3188 return f64;
3190 default:
3191 break;
3195 /* --------- MULTIPLEX --------- */
3196 if (e->tag == Iex_ITE) { // VFD
3197 HReg r1, r0, dst;
3198 vassert(ty == Ity_F64);
3199 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3200 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3201 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3202 dst = newVRegV(env);
3203 addInstr(env, mk_vMOVsd_RR(r1,dst));
3204 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3205 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3206 return dst;
3209 ppIRExpr(e);
3210 vpanic("iselDblExpr_wrk");
3214 /*---------------------------------------------------------*/
3215 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3216 /*---------------------------------------------------------*/
3218 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3220 HReg r = iselVecExpr_wrk( env, e );
3221 # if 0
3222 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3223 # endif
3224 vassert(hregClass(r) == HRcVec128);
3225 vassert(hregIsVirtual(r));
3226 return r;
3230 /* DO NOT CALL THIS DIRECTLY */
3231 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3233 HWord fn = 0; /* address of helper fn, if required */
3234 Bool arg1isEReg = False;
3235 AMD64SseOp op = Asse_INVALID;
3236 vassert(e);
3237 IRType ty = typeOfIRExpr(env->type_env, e);
3238 vassert(ty == Ity_V128);
3239 UInt laneBits = 0;
3241 if (e->tag == Iex_RdTmp) {
3242 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3245 if (e->tag == Iex_Get) {
3246 HReg dst = newVRegV(env);
3247 addInstr(env, AMD64Instr_SseLdSt(
3248 True/*load*/,
3250 dst,
3251 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3254 return dst;
3257 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3258 HReg dst = newVRegV(env);
3259 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3260 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3261 return dst;
3264 if (e->tag == Iex_Const) {
3265 HReg dst = newVRegV(env);
3266 vassert(e->Iex.Const.con->tag == Ico_V128);
3267 switch (e->Iex.Const.con->Ico.V128) {
3268 case 0x0000:
3269 dst = generate_zeroes_V128(env);
3270 break;
3271 case 0xFFFF:
3272 dst = generate_ones_V128(env);
3273 break;
3274 default: {
3275 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3276 /* do push_uimm64 twice, first time for the high-order half. */
3277 push_uimm64(env, bitmask8_to_bytemask64(
3278 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3280 push_uimm64(env, bitmask8_to_bytemask64(
3281 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3283 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3284 add_to_rsp(env, 16);
3285 break;
3288 return dst;
3291 if (e->tag == Iex_Unop) {
3292 switch (e->Iex.Unop.op) {
3294 case Iop_NotV128: {
3295 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3296 return do_sse_NotV128(env, arg);
3299 case Iop_CmpNEZ64x2: {
3300 /* We can use SSE2 instructions for this. */
3301 /* Ideally, we want to do a 64Ix2 comparison against zero of
3302 the operand. Problem is no such insn exists. Solution
3303 therefore is to do a 32Ix4 comparison instead, and bitwise-
3304 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3305 let the not'd result of this initial comparison be a:b:c:d.
3306 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3307 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3308 giving the required result.
3310 The required selection sequence is 2,3,0,1, which
3311 according to Intel's documentation means the pshufd
3312 literal value is 0xB1, that is,
3313 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */
3315 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3316 HReg tmp = generate_zeroes_V128(env);
3317 HReg dst = newVRegV(env);
3318 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3319 tmp = do_sse_NotV128(env, tmp);
3320 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3321 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3322 return dst;
3325 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3326 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3327 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3328 do_CmpNEZ_vector:
3330 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3331 HReg tmp = newVRegV(env);
3332 HReg zero = generate_zeroes_V128(env);
3333 HReg dst;
3334 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3335 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3336 dst = do_sse_NotV128(env, tmp);
3337 return dst;
3340 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3341 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3342 do_32Fx4_unary:
3344 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3345 HReg dst = newVRegV(env);
3346 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3347 return dst;
3350 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3351 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3352 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3353 do_32F0x4_unary:
3355 /* A bit subtle. We have to copy the arg to the result
3356 register first, because actually doing the SSE scalar insn
3357 leaves the upper 3/4 of the destination register
3358 unchanged. Whereas the required semantics of these
3359 primops is that the upper 3/4 is simply copied in from the
3360 argument. */
3361 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3362 HReg dst = newVRegV(env);
3363 addInstr(env, mk_vMOVsd_RR(arg, dst));
3364 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3365 return dst;
3368 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3369 do_64F0x2_unary:
3371 /* A bit subtle. We have to copy the arg to the result
3372 register first, because actually doing the SSE scalar insn
3373 leaves the upper half of the destination register
3374 unchanged. Whereas the required semantics of these
3375 primops is that the upper half is simply copied in from the
3376 argument. */
3377 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3378 HReg dst = newVRegV(env);
3379 addInstr(env, mk_vMOVsd_RR(arg, dst));
3380 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3381 return dst;
3384 case Iop_32UtoV128: {
3385 // FIXME maybe just use MOVQ here?
3386 HReg dst = newVRegV(env);
3387 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3388 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3389 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3390 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3391 return dst;
3394 case Iop_64UtoV128: {
3395 // FIXME maybe just use MOVQ here?
3396 HReg dst = newVRegV(env);
3397 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3398 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3399 addInstr(env, AMD64Instr_Push(rmi));
3400 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3401 add_to_rsp(env, 8);
3402 return dst;
3405 case Iop_V256toV128_0:
3406 case Iop_V256toV128_1: {
3407 HReg vHi, vLo;
3408 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3409 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3412 case Iop_F16toF32x4: {
3413 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3414 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3415 HReg dst = newVRegV(env);
3416 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3417 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3418 return dst;
3420 break;
3423 default:
3424 break;
3425 } /* switch (e->Iex.Unop.op) */
3426 } /* if (e->tag == Iex_Unop) */
3428 if (e->tag == Iex_Binop) {
3429 switch (e->Iex.Binop.op) {
3431 case Iop_Sqrt64Fx2:
3432 case Iop_Sqrt32Fx4: {
3433 /* :: (rmode, vec) -> vec */
3434 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3435 HReg dst = newVRegV(env);
3436 /* XXXROUNDINGFIXME */
3437 /* set roundingmode here */
3438 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3439 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3440 (Asse_SQRTF, arg, dst));
3441 return dst;
3444 /* FIXME: could we generate MOVQ here? */
3445 case Iop_SetV128lo64: {
3446 HReg dst = newVRegV(env);
3447 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3448 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3449 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3450 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3451 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3452 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3453 return dst;
3456 /* FIXME: could we generate MOVD here? */
3457 case Iop_SetV128lo32: {
3458 HReg dst = newVRegV(env);
3459 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3460 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3461 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3462 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3463 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3464 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3465 return dst;
3468 case Iop_64HLtoV128: {
3469 const IRExpr* arg1 = e->Iex.Binop.arg1;
3470 const IRExpr* arg2 = e->Iex.Binop.arg2;
3471 HReg dst = newVRegV(env);
3472 HReg tmp = newVRegV(env);
3473 HReg qHi = iselIntExpr_R(env, arg1);
3474 // If the args are trivially the same (tmp or const), use the same
3475 // source register for both, and only one movq since those are
3476 // (relatively) expensive.
3477 if (areAtomsAndEqual(arg1, arg2)) {
3478 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3479 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3480 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3481 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3482 } else {
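               /* General case: move qHi into an XMM register, shift it into
                  the upper 64 bits, then OR in qLo (staged through tmp) to
                  form the value qHi:qLo. */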
3483 HReg qLo = iselIntExpr_R(env, arg2);
3484 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3485 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3486 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3487 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3489 return dst;
3492 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3493 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3494 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3495 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3496 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3497 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3498 do_32Fx4:
3500 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3501 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3502 HReg dst = newVRegV(env);
3503 addInstr(env, mk_vMOVsd_RR(argL, dst));
3504 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3505 return dst;
3508 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3509 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3510 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3511 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3512 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3513 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3514 do_64Fx2:
3516 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3517 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3518 HReg dst = newVRegV(env);
3519 addInstr(env, mk_vMOVsd_RR(argL, dst));
3520 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3521 return dst;
3524 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3525 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3526 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3527 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3528 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3529 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3530 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3531 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3532 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3533 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3534 do_32F0x4: {
3535 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3536 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3537 HReg dst = newVRegV(env);
3538 addInstr(env, mk_vMOVsd_RR(argL, dst));
3539 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3540 return dst;
3543 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3544 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3545 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3546 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3547 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3548 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3549 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3550 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3551 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3552 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3553 do_64F0x2: {
3554 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3555 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3556 HReg dst = newVRegV(env);
3557 addInstr(env, mk_vMOVsd_RR(argL, dst));
3558 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3559 return dst;
3562 case Iop_PermOrZero8x16:
3563 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3564 op = Asse_PSHUFB;
3565 goto do_SseReRg;
3567 // Otherwise we'll have to generate a call to
3568 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3569 // host which doesn't have SSSE3, in which case we don't expect this
3570 // IROp to enter the compilation pipeline in the first place.
3571 break;
3573 case Iop_PwExtUSMulQAdd8x16:
3574 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3575 op = Asse_PMADDUBSW;
3576 goto do_SseReRg;
3578 break;
3580 case Iop_QNarrowBin32Sto16Sx8:
3581 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3582 case Iop_QNarrowBin16Sto8Sx16:
3583 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3584 case Iop_QNarrowBin16Sto8Ux16:
3585 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3587 case Iop_InterleaveHI8x16:
3588 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3589 case Iop_InterleaveHI16x8:
3590 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3591 case Iop_InterleaveHI32x4:
3592 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3593 case Iop_InterleaveHI64x2:
3594 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3596 case Iop_InterleaveLO8x16:
3597 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3598 case Iop_InterleaveLO16x8:
3599 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3600 case Iop_InterleaveLO32x4:
3601 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3602 case Iop_InterleaveLO64x2:
3603 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3605 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3606 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3607 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3608 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3609 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3610 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3611 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3612 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3613 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3614 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3615 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3616 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3617 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3618 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3619 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3620 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3621 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3622 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3623 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3624 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3625 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3626 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3627 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3628 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3629 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3630 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3631 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3632 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3633 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3634 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3635 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3636 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3637 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3638 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3639 do_SseReRg: {
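         /* arg1isEReg is set for the non-commutative ops above (packs and
            interleaves): for those, arg1 must end up as the instruction's
            source (E) operand, so arg2 is the one copied into dst. */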
3640 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3641 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3642 HReg dst = newVRegV(env);
3643 if (arg1isEReg) {
3644 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3645 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3646 } else {
3647 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3648 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3650 return dst;
3653 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3654 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3655 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3656 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3657 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3658 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3659 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3660 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3661 do_SseShift: {
3662 HReg dst = newVRegV(env);
3663 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3664 /* If it's a shift by an in-range immediate, generate a single
3665 instruction. */
3666 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3667 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3668 vassert(c->tag == Ico_U8);
3669 UInt shift = c->Ico.U8;
3670 if (shift < laneBits) {
3671 addInstr(env, mk_vMOVsd_RR(greg, dst));
3672 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3673 return dst;
3676 /* Otherwise we have to do it the longwinded way. */
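            /* The SSE shift-by-register forms take their count from the low
               64 bits of an XMM register, so materialise the count as a
               128-bit value on the stack and load it into ereg. */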
3677 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3678 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3679 HReg ereg = newVRegV(env);
3680 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3681 addInstr(env, AMD64Instr_Push(rmi));
3682 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3683 addInstr(env, mk_vMOVsd_RR(greg, dst));
3684 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3685 add_to_rsp(env, 16);
3686 return dst;
3689 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3690 goto do_SseAssistedBinary;
3691 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3692 goto do_SseAssistedBinary;
3693 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3694 goto do_SseAssistedBinary;
3695 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3696 goto do_SseAssistedBinary;
3697 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3698 goto do_SseAssistedBinary;
3699 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3700 goto do_SseAssistedBinary;
3701 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3702 goto do_SseAssistedBinary;
3703 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3704 goto do_SseAssistedBinary;
3705 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3706 goto do_SseAssistedBinary;
3707 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3708 goto do_SseAssistedBinary;
3709 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3710 goto do_SseAssistedBinary;
3711 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3712 goto do_SseAssistedBinary;
3713 case Iop_QNarrowBin32Sto16Ux8:
3714 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3715 goto do_SseAssistedBinary;
3716 case Iop_NarrowBin16to8x16:
3717 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3718 goto do_SseAssistedBinary;
3719 case Iop_NarrowBin32to16x8:
3720 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3721 goto do_SseAssistedBinary;
3722 do_SseAssistedBinary: {
3723 /* RRRufff! RRRufff code is what we're generating here. Oh
3724 well. */
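         /* Scheme: carve out a 16-aligned scratch area on the stack holding
            (result, argL, argR), pass pointers to the three slots to the
            helper in %rdi/%rsi/%rdx, and read the result back from the
            first slot after the call. */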
3725 vassert(fn != 0);
3726 HReg dst = newVRegV(env);
3727 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3728 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3729 HReg argp = newVRegI(env);
3730         /* subq $112, %rsp -- make a space */
3731 sub_from_rsp(env, 112);
3732 /* leaq 48(%rsp), %r_argp -- point into it */
3733 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3734 argp));
3735 /* andq $-16, %r_argp -- 16-align the pointer */
3736 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3737 AMD64RMI_Imm( ~(UInt)15 ),
3738 argp));
3739 /* Prepare 3 arg regs:
3740 leaq 0(%r_argp), %rdi
3741 leaq 16(%r_argp), %rsi
3742            leaq 32(%r_argp), %rdx
3743         */
3744 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3745 hregAMD64_RDI()));
3746 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3747 hregAMD64_RSI()));
3748 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3749 hregAMD64_RDX()));
3750 /* Store the two args, at (%rsi) and (%rdx):
3751 movupd %argL, 0(%rsi)
3752            movupd %argR, 0(%rdx)
3753         */
3754 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3755 AMD64AMode_IR(0, hregAMD64_RSI())));
3756 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3757 AMD64AMode_IR(0, hregAMD64_RDX())));
3758 /* call the helper */
3759 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3760 3, mk_RetLoc_simple(RLPri_None) ));
3761 /* fetch the result from memory, using %r_argp, which the
3762 register allocator will keep alive across the call. */
3763 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3764 AMD64AMode_IR(0, argp)));
3765 /* and finally, clear the space */
3766 add_to_rsp(env, 112);
3767 return dst;
3770 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3771 goto do_SseAssistedVectorAndScalar;
3772 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3773 goto do_SseAssistedVectorAndScalar;
3774 do_SseAssistedVectorAndScalar: {
3775 /* RRRufff! RRRufff code is what we're generating here. Oh
3776 well. */
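         /* Same scheme as do_SseAssistedBinary above, except the second
            operand is a scalar passed by value in %rdx rather than through
            memory. */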
3777 vassert(fn != 0);
3778 HReg dst = newVRegV(env);
3779 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3780 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3781 HReg argp = newVRegI(env);
3782         /* subq $112, %rsp -- make a space */
3783 sub_from_rsp(env, 112);
3784 /* leaq 48(%rsp), %r_argp -- point into it */
3785 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3786 argp));
3787 /* andq $-16, %r_argp -- 16-align the pointer */
3788 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3789 AMD64RMI_Imm( ~(UInt)15 ),
3790 argp));
3791 /* Prepare 2 vector arg regs:
3792 leaq 0(%r_argp), %rdi
3793            leaq 16(%r_argp), %rsi
3794         */
3795 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3796 hregAMD64_RDI()));
3797 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3798 hregAMD64_RSI()));
3799 /* Store the vector arg, at (%rsi):
3800            movupd %argL, 0(%rsi)
3801         */
3802 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3803 AMD64AMode_IR(0, hregAMD64_RSI())));
3804 /* And get the scalar value into rdx */
3805 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3807 /* call the helper */
3808 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3809 3, mk_RetLoc_simple(RLPri_None) ));
3810 /* fetch the result from memory, using %r_argp, which the
3811 register allocator will keep alive across the call. */
3812 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3813 AMD64AMode_IR(0, argp)));
3814 /* and finally, clear the space */
3815 add_to_rsp(env, 112);
3816 return dst;
3819 case Iop_I32StoF32x4:
3820 case Iop_F32toI32Sx4: {
3821 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3822 HReg dst = newVRegV(env);
3823 AMD64SseOp mop
3824 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3825 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3826 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3827 set_SSE_rounding_default(env);
3828 return dst;
3831 // Half-float vector conversion
3832 case Iop_F32toF16x8: {
3833 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3834 HReg srcHi, srcLo;
3835 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3836 HReg dstHi = newVRegV(env);
3837 HReg dstLo = newVRegV(env);
3838 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3839 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3840 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3841 set_SSE_rounding_default(env);
3842 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3843 // need to compact all that into one register. There's probably a
3844 // more elegant way to do this, but ..
3845 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3846 // dstHi is now 127:64 = useful data, 63:0 = zero
3847 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3848 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3849 // dstLo is now 127:64 = zero, 63:0 = useful data
3850 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3851 return dstLo;
3853 break;
3856 default:
3857 break;
3858 } /* switch (e->Iex.Binop.op) */
3859 } /* if (e->tag == Iex_Binop) */
3861 if (e->tag == Iex_Triop) {
3862 IRTriop *triop = e->Iex.Triop.details;
3863 switch (triop->op) {
3865 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3866 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3867 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3868 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3869 do_64Fx2_w_rm:
3871 HReg argL = iselVecExpr(env, triop->arg2);
3872 HReg argR = iselVecExpr(env, triop->arg3);
3873 HReg dst = newVRegV(env);
3874 addInstr(env, mk_vMOVsd_RR(argL, dst));
3875 /* XXXROUNDINGFIXME */
3876 /* set roundingmode here */
3877 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3878 return dst;
3881 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3882 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3883 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3884 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3885 do_32Fx4_w_rm:
3887 HReg argL = iselVecExpr(env, triop->arg2);
3888 HReg argR = iselVecExpr(env, triop->arg3);
3889 HReg dst = newVRegV(env);
3890 addInstr(env, mk_vMOVsd_RR(argL, dst));
3891 /* XXXROUNDINGFIXME */
3892 /* set roundingmode here */
3893 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3894 return dst;
3897 default:
3898 break;
3899 } /* switch (triop->op) */
3900 } /* if (e->tag == Iex_Triop) */
3902 if (e->tag == Iex_ITE) { // VFD
3903 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3904 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3905 HReg dst = newVRegV(env);
3906 addInstr(env, mk_vMOVsd_RR(r1,dst));
3907 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
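      /* cc^1 negates the condition, so dst (preloaded with the 'iftrue'
         value) is overwritten with r0 only when the condition is false. */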
3908 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3909 return dst;
3912 //vec_fail:
3913 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3914 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3915 ppIRExpr(e);
3916 vpanic("iselVecExpr_wrk");
3920 /*---------------------------------------------------------*/
3921 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3922 /*---------------------------------------------------------*/
3924 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3925 ISelEnv* env, const IRExpr* e )
3927 iselDVecExpr_wrk( rHi, rLo, env, e );
3928 # if 0
3929 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3930 # endif
3931 vassert(hregClass(*rHi) == HRcVec128);
3932 vassert(hregClass(*rLo) == HRcVec128);
3933 vassert(hregIsVirtual(*rHi));
3934 vassert(hregIsVirtual(*rLo));
3938 /* DO NOT CALL THIS DIRECTLY */
3939 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3940 ISelEnv* env, const IRExpr* e )
3942 HWord fn = 0; /* address of helper fn, if required */
3943 vassert(e);
3944 IRType ty = typeOfIRExpr(env->type_env, e);
3945 vassert(ty == Ity_V256);
3946 UInt laneBits = 0;
3948 AMD64SseOp op = Asse_INVALID;
3950 /* read 256-bit IRTemp */
3951 if (e->tag == Iex_RdTmp) {
3952 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3953 return;
3956 if (e->tag == Iex_Get) {
3957 HReg vHi = newVRegV(env);
3958 HReg vLo = newVRegV(env);
3959 HReg rbp = hregAMD64_RBP();
3960 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3961 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3962 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3963 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3964 *rHi = vHi;
3965 *rLo = vLo;
3966 return;
3969 if (e->tag == Iex_Load) {
3970 HReg vHi = newVRegV(env);
3971 HReg vLo = newVRegV(env);
3972 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3973 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3974 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3975 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3976 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3977 *rHi = vHi;
3978 *rLo = vLo;
3979 return;
3982 if (e->tag == Iex_Const) {
3983 vassert(e->Iex.Const.con->tag == Ico_V256);
3984 switch (e->Iex.Const.con->Ico.V256) {
3985 case 0x00000000: {
3986 HReg vHi = generate_zeroes_V128(env);
3987 HReg vLo = newVRegV(env);
3988 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3989 *rHi = vHi;
3990 *rLo = vLo;
3991 return;
3993 default:
3994 break; /* give up. Until such time as is necessary. */
3998 if (e->tag == Iex_Unop) {
3999 switch (e->Iex.Unop.op) {
4001 case Iop_NotV256: {
4002 HReg argHi, argLo;
4003 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4004 *rHi = do_sse_NotV128(env, argHi);
4005 *rLo = do_sse_NotV128(env, argLo);
4006 return;
4009 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
4010 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
4011 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
4012 do_32Fx8_unary:
4014 HReg argHi, argLo;
4015 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4016 HReg dstHi = newVRegV(env);
4017 HReg dstLo = newVRegV(env);
4018 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4019 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4020 *rHi = dstHi;
4021 *rLo = dstLo;
4022 return;
4025 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4026 do_64Fx4_unary:
4028 HReg argHi, argLo;
4029 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4030 HReg dstHi = newVRegV(env);
4031 HReg dstLo = newVRegV(env);
4032 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4033 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4034 *rHi = dstHi;
4035 *rLo = dstLo;
4036 return;
4039 case Iop_CmpNEZ64x4: {
4040 /* We can use SSE2 instructions for this. */
4041 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4042 (obviously). See comment on Iop_CmpNEZ64x2 for
4043 explanation of what's going on here. */
4044 HReg argHi, argLo;
4045 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4046 HReg tmpHi = generate_zeroes_V128(env);
4047 HReg tmpLo = newVRegV(env);
4048 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4049 HReg dstHi = newVRegV(env);
4050 HReg dstLo = newVRegV(env);
4051 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4052 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4053 tmpHi = do_sse_NotV128(env, tmpHi);
4054 tmpLo = do_sse_NotV128(env, tmpLo);
4055 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4056 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4057 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4058 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4059 *rHi = dstHi;
4060 *rLo = dstLo;
4061 return;
4064 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4065 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4066 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4067 do_CmpNEZ_vector:
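         /* CmpNEZ is computed as not(CmpEQ(x, 0)): compare each lane
            against zero and then invert the resulting mask. */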
4069 HReg argHi, argLo;
4070 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4071 HReg tmpHi = newVRegV(env);
4072 HReg tmpLo = newVRegV(env);
4073 HReg zero = generate_zeroes_V128(env);
4074 HReg dstHi, dstLo;
4075 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4076 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4077 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4078 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4079 dstHi = do_sse_NotV128(env, tmpHi);
4080 dstLo = do_sse_NotV128(env, tmpLo);
4081 *rHi = dstHi;
4082 *rLo = dstLo;
4083 return;
4086 case Iop_F16toF32x8: {
4087 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4088 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4089 HReg srcCopy = newVRegV(env);
4090 HReg dstHi = newVRegV(env);
4091 HReg dstLo = newVRegV(env);
4092 // Copy src, since we'll need to modify it.
4093 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
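            // Convert the low four F16 lanes into dstLo, then shift the
            // high four lanes down into the low half and convert those
            // into dstHi.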
4094 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4095 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4096 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4097 *rHi = dstHi;
4098 *rLo = dstLo;
4099 return;
4101 break;
4104 default:
4105 break;
4106 } /* switch (e->Iex.Unop.op) */
4107 } /* if (e->tag == Iex_Unop) */
4109 if (e->tag == Iex_Binop) {
4110 switch (e->Iex.Binop.op) {
4112 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4113 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4114 do_64Fx4:
4116 HReg argLhi, argLlo, argRhi, argRlo;
4117 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4118 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4119 HReg dstHi = newVRegV(env);
4120 HReg dstLo = newVRegV(env);
4121 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4122 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4123 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4124 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4125 *rHi = dstHi;
4126 *rLo = dstLo;
4127 return;
4130 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4131 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4132 do_32Fx8:
4134 HReg argLhi, argLlo, argRhi, argRlo;
4135 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4136 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4137 HReg dstHi = newVRegV(env);
4138 HReg dstLo = newVRegV(env);
4139 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4140 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4141 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4142 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4143 *rHi = dstHi;
4144 *rLo = dstLo;
4145 return;
4148 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4149 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4150 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4151 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4152 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4153 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4154 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4155 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4156 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4157 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4158 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4159 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4160 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4161 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4162 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4163 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4164 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4165 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4166 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4167 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4168 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4169 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4170 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4171 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4172 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4173 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4174 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4175 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4176 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4177 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4178 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4179 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4180 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4181 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4182 do_SseReRg:
4184 HReg argLhi, argLlo, argRhi, argRlo;
4185 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4186 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4187 HReg dstHi = newVRegV(env);
4188 HReg dstLo = newVRegV(env);
4189 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4190 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4191 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4192 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4193 *rHi = dstHi;
4194 *rLo = dstLo;
4195 return;
4198 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4199 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4200 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4201 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4202 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4203 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4204 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4205 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4206 do_SseShift: {
4207 HReg dstHi = newVRegV(env);
4208 HReg dstLo = newVRegV(env);
4209 HReg gregHi, gregLo;
4210 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4211 /* If it's a shift by an in-range immediate, generate two single
4212 instructions. */
4213 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4214 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4215 vassert(c->tag == Ico_U8);
4216 UInt shift = c->Ico.U8;
4217 if (shift < laneBits) {
4218 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4219 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4220 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4221 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4222 *rHi = dstHi;
4223 *rLo = dstLo;
4224 return;
4227 /* Otherwise we have to do it the longwinded way. */
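            /* As in the V128 do_SseShift case: the count must sit in the
               low 64 bits of an XMM register, so stage it through the
               stack into ereg and apply it to both halves. */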
4228 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4229 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4230 HReg ereg = newVRegV(env);
4231 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4232 addInstr(env, AMD64Instr_Push(rmi));
4233 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4234 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4235 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4236 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4237 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4238 add_to_rsp(env, 16);
4239 *rHi = dstHi;
4240 *rLo = dstLo;
4241 return;
4244 case Iop_V128HLtoV256: {
4245 // Curiously, there doesn't seem to be any benefit to be had here by
4246 // checking whether arg1 and arg2 are the same, in the style of how
4247 // (eg) 64HLtoV128 is handled elsewhere in this file.
4248 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4249 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4250 return;
4253 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4254 goto do_SseAssistedBinary;
4255 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4256 goto do_SseAssistedBinary;
4257 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4258 goto do_SseAssistedBinary;
4259 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4260 goto do_SseAssistedBinary;
4261 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4262 goto do_SseAssistedBinary;
4263 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4264 goto do_SseAssistedBinary;
4265 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4266 goto do_SseAssistedBinary;
4267 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4268 goto do_SseAssistedBinary;
4269 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4270 goto do_SseAssistedBinary;
4271 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4272 goto do_SseAssistedBinary;
4273 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4274 goto do_SseAssistedBinary;
4275 do_SseAssistedBinary: {
4276 /* RRRufff! RRRufff code is what we're generating here. Oh
4277 well. */
4278 vassert(fn != 0);
4279 HReg dstHi = newVRegV(env);
4280 HReg dstLo = newVRegV(env);
4281 HReg argLhi, argLlo, argRhi, argRlo;
4282 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4283 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4284 HReg argp = newVRegI(env);
4285         /* subq $160, %rsp -- make a space */
4286 sub_from_rsp(env, 160);
4287 /* leaq 48(%rsp), %r_argp -- point into it */
4288 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4289 argp));
4290 /* andq $-16, %r_argp -- 16-align the pointer */
4291 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4292 AMD64RMI_Imm( ~(UInt)15 ),
4293 argp));
4294 /* Prepare 3 arg regs:
4295 leaq 0(%r_argp), %rdi
4296 leaq 16(%r_argp), %rsi
4297            leaq 32(%r_argp), %rdx
4298         */
4299 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4300 hregAMD64_RDI()));
4301 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4302 hregAMD64_RSI()));
4303 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4304 hregAMD64_RDX()));
4305 /* Store the two high args, at (%rsi) and (%rdx):
4306 movupd %argLhi, 0(%rsi)
4307            movupd %argRhi, 0(%rdx)
4308         */
4309 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4310 AMD64AMode_IR(0, hregAMD64_RSI())));
4311 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4312 AMD64AMode_IR(0, hregAMD64_RDX())));
4313 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4314 movupd %argLlo, 48(%rsi)
4315            movupd %argRlo, 48(%rdx)
4316         */
4317 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4318 AMD64AMode_IR(48, hregAMD64_RSI())));
4319 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4320 AMD64AMode_IR(48, hregAMD64_RDX())));
4321 /* call the helper */
4322 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4323 mk_RetLoc_simple(RLPri_None) ));
4324 /* Prepare 3 arg regs:
4325 leaq 48(%r_argp), %rdi
4326 leaq 64(%r_argp), %rsi
4327            leaq 80(%r_argp), %rdx
4328         */
4329 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4330 hregAMD64_RDI()));
4331 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4332 hregAMD64_RSI()));
4333 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4334 hregAMD64_RDX()));
4335 /* call the helper */
4336 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4337 mk_RetLoc_simple(RLPri_None) ));
4338 /* fetch the result from memory, using %r_argp, which the
4339 register allocator will keep alive across the call. */
4340 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4341 AMD64AMode_IR(0, argp)));
4342 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4343 AMD64AMode_IR(48, argp)));
4344 /* and finally, clear the space */
4345 add_to_rsp(env, 160);
4346 *rHi = dstHi;
4347 *rLo = dstLo;
4348 return;
4351 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4352 goto do_SseAssistedBinary256;
4353 do_SseAssistedBinary256: {
4354 /* RRRufff! RRRufff code is what we're generating here. Oh
4355 well. */
4356 vassert(fn != 0);
4357 HReg dstHi = newVRegV(env);
4358 HReg dstLo = newVRegV(env);
4359 HReg argLhi, argLlo, argRhi, argRlo;
4360 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4361 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4362 HReg argp = newVRegI(env);
4363         /* subq $160, %rsp -- make a space */
4364 sub_from_rsp(env, 160);
4365 /* leaq 48(%rsp), %r_argp -- point into it */
4366 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4367 argp));
4368 /* andq $-16, %r_argp -- 16-align the pointer */
4369 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4370 AMD64RMI_Imm( ~(UInt)15 ),
4371 argp));
4372 /* Prepare 3 arg regs:
4373 leaq 0(%r_argp), %rdi
4374 leaq 32(%r_argp), %rsi
4375            leaq 64(%r_argp), %rdx
4376         */
4377 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4378 hregAMD64_RDI()));
4379 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4380 hregAMD64_RSI()));
4381 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4382 hregAMD64_RDX()));
4383 /* Store the two args, at (%rsi) and (%rdx):
4384 movupd %argLlo, 0(%rsi)
4385 movupd %argLhi, 16(%rsi)
4386 movupd %argRlo, 0(%rdx)
4387            movupd %argRhi, 16(%rdx)
4388         */
4389 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4390 AMD64AMode_IR(0, hregAMD64_RSI())));
4391 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4392 AMD64AMode_IR(16, hregAMD64_RSI())));
4393 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4394 AMD64AMode_IR(0, hregAMD64_RDX())));
4395 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4396 AMD64AMode_IR(16, hregAMD64_RDX())));
4397 /* call the helper */
4398 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4399 mk_RetLoc_simple(RLPri_None) ));
4400 /* fetch the result from memory, using %r_argp, which the
4401 register allocator will keep alive across the call. */
4402 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4403 AMD64AMode_IR(0, argp)));
4404 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4405 AMD64AMode_IR(16, argp)));
4406 /* and finally, clear the space */
4407 add_to_rsp(env, 160);
4408 *rHi = dstHi;
4409 *rLo = dstLo;
4410 return;
4413 case Iop_I32StoF32x8:
4414 case Iop_F32toI32Sx8: {
4415 HReg argHi, argLo;
4416 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4417 HReg dstHi = newVRegV(env);
4418 HReg dstLo = newVRegV(env);
4419 AMD64SseOp mop
4420 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4421 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4422 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4423 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4424 set_SSE_rounding_default(env);
4425 *rHi = dstHi;
4426 *rLo = dstLo;
4427 return;
4430 default:
4431 break;
4432 } /* switch (e->Iex.Binop.op) */
4433 } /* if (e->tag == Iex_Binop) */
4435 if (e->tag == Iex_Triop) {
4436 IRTriop *triop = e->Iex.Triop.details;
4437 switch (triop->op) {
4439 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4440 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4441 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4442 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4443 do_64Fx4_w_rm:
4445 HReg argLhi, argLlo, argRhi, argRlo;
4446 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4447 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4448 HReg dstHi = newVRegV(env);
4449 HReg dstLo = newVRegV(env);
4450 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4451 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4452 /* XXXROUNDINGFIXME */
4453 /* set roundingmode here */
4454 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4455 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4456 *rHi = dstHi;
4457 *rLo = dstLo;
4458 return;
4461 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4462 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4463 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4464 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4465 do_32Fx8_w_rm:
4467 HReg argLhi, argLlo, argRhi, argRlo;
4468 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4469 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4470 HReg dstHi = newVRegV(env);
4471 HReg dstLo = newVRegV(env);
4472 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4473 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4474 /* XXXROUNDINGFIXME */
4475 /* set roundingmode here */
4476 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4477 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4478 *rHi = dstHi;
4479 *rLo = dstLo;
4480 return;
4483 default:
4484 break;
4485 } /* switch (triop->op) */
4486 } /* if (e->tag == Iex_Triop) */
4489 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4490 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4491 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4492 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4493 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4494 // If the args are trivially the same (tmp or const), use the same
4495 // source register for all four, and only one movq since those are
4496 // (relatively) expensive.
4497 if (areAtomsAndEqual(arg1, arg2)
4498 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4499 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4500 HReg tmp = newVRegV(env);
4501 HReg dst = newVRegV(env);
4502 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4503 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4504 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4505 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4506 *rHi = dst;
4507 *rLo = dst;
4508 } else {
4509 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
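            /* Pack Q3:Q2 into dstHi and Q1:Q0 into dstLo, using the same
               movq / shift-left-by-64 / OR sequence as Iop_64HLtoV128. */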
4510 HReg q3 = iselIntExpr_R(env, arg1);
4511 HReg q2 = iselIntExpr_R(env, arg2);
4512 HReg q1 = iselIntExpr_R(env, arg3);
4513 HReg q0 = iselIntExpr_R(env, arg4);
4514 HReg tmp = newVRegV(env);
4515 HReg dstHi = newVRegV(env);
4516 HReg dstLo = newVRegV(env);
4517 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4518 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4519 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4520 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4521 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4522 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4523 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4524 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4525 *rHi = dstHi;
4526 *rLo = dstLo;
4528 return;
4531 if (e->tag == Iex_ITE) {
4532 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4533 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4534 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4535 HReg dstHi = newVRegV(env);
4536 HReg dstLo = newVRegV(env);
4537 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4538 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4539 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4540 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4541 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4542 *rHi = dstHi;
4543 *rLo = dstLo;
4544 return;
4547 //avx_fail:
4548 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4549 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4550 ppIRExpr(e);
4551 vpanic("iselDVecExpr_wrk");
4555 /*---------------------------------------------------------*/
4556 /*--- ISEL: Statements ---*/
4557 /*---------------------------------------------------------*/
4559 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4561 if (vex_traceflags & VEX_TRACE_VCODE) {
4562 vex_printf("\n-- ");
4563 ppIRStmt(stmt);
4564 vex_printf("\n");
4567 switch (stmt->tag) {
4569 /* --------- LOADG (guarded load) --------- */
4570 case Ist_LoadG: {
4571 IRLoadG* lg = stmt->Ist.LoadG.details;
4572 if (lg->end != Iend_LE)
4573 goto stmt_fail;
4575 UChar szB = 0; /* invalid */
4576 switch (lg->cvt) {
4577 case ILGop_Ident32: szB = 4; break;
4578 case ILGop_Ident64: szB = 8; break;
4579 case ILGop_IdentV128: szB = 16; break;
4580 default: break;
4582 if (szB == 0)
4583 goto stmt_fail;
4585 AMD64AMode* amAddr
4586 = iselIntExpr_AMode(env, lg->addr);
4587 HReg rAlt
4588 = szB == 16 ? iselVecExpr(env, lg->alt)
4589 : iselIntExpr_R(env, lg->alt);
4590 HReg rDst
4591 = lookupIRTemp(env, lg->dst);
4593 /* Get the alt value into the dst. We'll do a conditional load
4594 which overwrites it -- or not -- with loaded data. */
4595 if (szB == 16) {
4596 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4597 } else {
4598 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4600 AMD64CondCode cc = iselCondCode(env, lg->guard);
4601 if (szB == 16) {
4602 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4603 } else {
4604 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4606 return;
4609 /* --------- STOREG (guarded store) --------- */
4610 case Ist_StoreG: {
4611 IRStoreG* sg = stmt->Ist.StoreG.details;
4612 if (sg->end != Iend_LE)
4613 goto stmt_fail;
4615 UChar szB = 0; /* invalid */
4616 switch (typeOfIRExpr(env->type_env, sg->data)) {
4617 case Ity_I32: szB = 4; break;
4618 case Ity_I64: szB = 8; break;
4619 case Ity_V128: szB = 16; break;
4620 default: break;
4622 if (szB == 0)
4623 goto stmt_fail;
4625 AMD64AMode* amAddr
4626 = iselIntExpr_AMode(env, sg->addr);
4627 HReg rSrc
4628 = szB == 16 ? iselVecExpr(env, sg->data)
4629 : iselIntExpr_R(env, sg->data);
4630 AMD64CondCode cc
4631 = iselCondCode(env, sg->guard);
4632 if (szB == 16) {
4633 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4634 } else {
4635 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4637 return;
4640 /* --------- STORE --------- */
4641 case Ist_Store: {
4642 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4643 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4644 IREndness end = stmt->Ist.Store.end;
4646 if (tya != Ity_I64 || end != Iend_LE)
4647 goto stmt_fail;
4649 if (tyd == Ity_I64) {
4650 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4651 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4652 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4653 return;
4655 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4656 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4657 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4658 addInstr(env, AMD64Instr_Store(
4659 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4660 r,am));
4661 return;
4663 if (tyd == Ity_F64) {
4664 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4665 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4666 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4667 return;
4669 if (tyd == Ity_F32) {
4670 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4671 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4672 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4673 return;
4675 if (tyd == Ity_V128) {
4676 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4677 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4678 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4679 return;
4681 if (tyd == Ity_V256) {
4682 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4683 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4684 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4685 HReg vHi, vLo;
4686 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4687 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4688 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4689 return;
4691 break;
4694 /* --------- PUT --------- */
4695 case Ist_Put: {
4696 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4697 if (ty == Ity_I64) {
4698 /* We're going to write to memory, so compute the RHS into an
4699 AMD64RI. */
4700 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4701 addInstr(env,
4702 AMD64Instr_Alu64M(
4703 Aalu_MOV,
4705 AMD64AMode_IR(stmt->Ist.Put.offset,
4706 hregAMD64_RBP())
4708 return;
4710 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4711 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4712 addInstr(env, AMD64Instr_Store(
4713 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4715 AMD64AMode_IR(stmt->Ist.Put.offset,
4716 hregAMD64_RBP())));
4717 return;
4719 if (ty == Ity_F32) {
4720 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4721 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4722 set_SSE_rounding_default(env); /* paranoia */
4723 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4724 return;
4726 if (ty == Ity_F64) {
4727 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4728 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4729 hregAMD64_RBP() );
4730 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4731 return;
4733 if (ty == Ity_V128) {
4734 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4735 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4736 hregAMD64_RBP());
4737 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4738 return;
4740 if (ty == Ity_V256) {
4741 HReg vHi, vLo;
4742 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4743 HReg rbp = hregAMD64_RBP();
4744 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4745 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4746 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4747 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4748 return;
4750 break;
4753 /* --------- Indexed PUT --------- */
4754 case Ist_PutI: {
4755 IRPutI *puti = stmt->Ist.PutI.details;
4757 AMD64AMode* am
4758 = genGuestArrayOffset(
4759 env, puti->descr,
4760 puti->ix, puti->bias );
4762 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4763 if (ty == Ity_F64) {
4764 HReg val = iselDblExpr(env, puti->data);
4765 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4766 return;
4768 if (ty == Ity_I8) {
4769 HReg r = iselIntExpr_R(env, puti->data);
4770 addInstr(env, AMD64Instr_Store( 1, r, am ));
4771 return;
4773 if (ty == Ity_I64) {
4774 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4775 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4776 return;
4778 break;
4781 /* --------- TMP --------- */
4782 case Ist_WrTmp: {
4783 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4784 IRType ty = typeOfIRTemp(env->type_env, tmp);
4786      /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4787         compute it into an AMode and then use LEA.  This usually
4788         produces fewer instructions, often because (for memcheck-
4789         created IR) we get t = address-expression, with t later used
4790         twice, and doing this naturally turns the address-expression
4791         back into an AMD64 amode. */
4792 if (ty == Ity_I64
4793 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4794 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4795 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4796 HReg dst = lookupIRTemp(env, tmp);
4797 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4798 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4799 value into a register. Just emit a normal reg-reg move
4800 so reg-alloc can coalesce it away in the usual way. */
4801 HReg src = am->Aam.IR.reg;
4802 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4803 } else {
4804 addInstr(env, AMD64Instr_Lea64(am,dst));
4806 return;
4809 if (ty == Ity_I64 || ty == Ity_I32
4810 || ty == Ity_I16 || ty == Ity_I8) {
4811 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4812 HReg dst = lookupIRTemp(env, tmp);
4813 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4814 return;
4816 if (ty == Ity_I128) {
4817 HReg rHi, rLo, dstHi, dstLo;
4818 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4819 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4820 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4821 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4822 return;
4824 if (ty == Ity_I1) {
4825 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4826 HReg dst = lookupIRTemp(env, tmp);
4827 addInstr(env, AMD64Instr_Set64(cond, dst));
4828 return;
4830 if (ty == Ity_F64) {
4831 HReg dst = lookupIRTemp(env, tmp);
4832 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4833 addInstr(env, mk_vMOVsd_RR(src, dst));
4834 return;
4836 if (ty == Ity_F32) {
4837 HReg dst = lookupIRTemp(env, tmp);
4838 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4839 addInstr(env, mk_vMOVsd_RR(src, dst));
4840 return;
4842 if (ty == Ity_V128) {
4843 HReg dst = lookupIRTemp(env, tmp);
4844 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4845 addInstr(env, mk_vMOVsd_RR(src, dst));
4846 return;
4848 if (ty == Ity_V256) {
4849 HReg rHi, rLo, dstHi, dstLo;
4850 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4851 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4852 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4853 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4854 return;
4856 break;
4859 /* --------- Call to DIRTY helper --------- */
4860 case Ist_Dirty: {
4861 IRDirty* d = stmt->Ist.Dirty.details;
4863 /* Figure out the return type, if any. */
4864 IRType retty = Ity_INVALID;
4865 if (d->tmp != IRTemp_INVALID)
4866 retty = typeOfIRTemp(env->type_env, d->tmp);
4868 /* Throw out any return types we don't know about. */
4869 Bool retty_ok = False;
4870 switch (retty) {
4871 case Ity_INVALID: /* function doesn't return anything */
4872 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4873 case Ity_V128: case Ity_V256:
4874 retty_ok = True; break;
4875 default:
4876 break;
4878 if (!retty_ok)
4879 break; /* will go to stmt_fail: */
4881 /* Marshal args, do the call, and set the return value to
4882 0x555..555 if this is a conditional call that returns a value
4883 and the call is skipped. */
4884 UInt addToSp = 0;
4885 RetLoc rloc = mk_RetLoc_INVALID();
4886 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4887 vassert(is_sane_RetLoc(rloc));
4889 /* Now figure out what to do with the returned value, if any. */
4890 switch (retty) {
4891 case Ity_INVALID: {
4892 /* No return value. Nothing to do. */
4893 vassert(d->tmp == IRTemp_INVALID);
4894 vassert(rloc.pri == RLPri_None);
4895 vassert(addToSp == 0);
4896 return;
4898 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4899 /* The returned value is in %rax. Park it in the register
4900 associated with tmp. */
4901 vassert(rloc.pri == RLPri_Int);
4902 vassert(addToSp == 0);
4903 HReg dst = lookupIRTemp(env, d->tmp);
4904 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4905 return;
4907 case Ity_V128: {
4908 /* The returned value is on the stack, and rloc.spOff
4909 tells us where. Fish it off the stack and then move
4910 the stack pointer upwards to clear it, as directed by
4911 doHelperCall. */
4912 vassert(rloc.pri == RLPri_V128SpRel);
4913 vassert(addToSp >= 16);
4914 HReg dst = lookupIRTemp(env, d->tmp);
4915 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4916 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4917 add_to_rsp(env, addToSp);
4918 return;
4920 case Ity_V256: {
4921 /* See comments for Ity_V128. */
4922 vassert(rloc.pri == RLPri_V256SpRel);
4923 vassert(addToSp >= 32);
4924 HReg dstLo, dstHi;
4925 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4926 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4927 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4928 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4929 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4930 add_to_rsp(env, addToSp);
4931 return;
4933 default:
4934 /*NOTREACHED*/
4935 vassert(0);
4937 break;
4940 /* --------- MEM FENCE --------- */
4941 case Ist_MBE:
4942 switch (stmt->Ist.MBE.event) {
4943 case Imbe_Fence:
4944 addInstr(env, AMD64Instr_MFence());
4945 return;
4946 default:
4947 break;
4949 break;
4951 /* --------- ACAS --------- */
4952 case Ist_CAS:
4953 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4954 /* "normal" singleton CAS */
4955 UChar sz;
4956 IRCAS* cas = stmt->Ist.CAS.details;
4957 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4958 /* get: cas->expd into %rax, and cas->data into %rbx */
4959 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4960 HReg rData = iselIntExpr_R(env, cas->dataLo);
4961 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4962 HReg rOld = lookupIRTemp(env, cas->oldLo);
4963 vassert(cas->expdHi == NULL);
4964 vassert(cas->dataHi == NULL);
4965 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4966 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4967 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4968 switch (ty) {
4969 case Ity_I64: sz = 8; break;
4970 case Ity_I32: sz = 4; break;
4971 case Ity_I16: sz = 2; break;
4972 case Ity_I8: sz = 1; break;
4973 default: goto unhandled_cas;
4975 addInstr(env, AMD64Instr_ACAS(am, sz));
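            /* cmpxchg sets Z on success and otherwise leaves the actual old
               memory value in %rax.  rOld was preloaded with the expected
               value, so on failure (NZ) overwrite it with %rax. */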
4976 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4977 return;
4978 } else {
4979 /* double CAS */
4980 UChar sz;
4981 IRCAS* cas = stmt->Ist.CAS.details;
4982 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4983 /* only 32-bit and 64-bit allowed in this case */
4984 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4985 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4986 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4987 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4988 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4989 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4990 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4991 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4992 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4993 switch (ty) {
4994 case Ity_I64:
4995 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4996 goto unhandled_cas; /* we'd have to generate
4997 cmpxchg16b, but the host
4998 doesn't support that */
4999 sz = 8;
5000 break;
5001 case Ity_I32:
5002 sz = 4;
5003 break;
5004 default:
5005 goto unhandled_cas;
5007 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
5008 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
5009 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
5010 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
5011 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
5012 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
5013 addInstr(env, AMD64Instr_DACAS(am, sz));
5014 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5015 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5016 return;
5018 unhandled_cas:
5019 break;
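   /* Editorial sketch of the sequences the two CAS cases above aim to
      produce.  The vreg names are illustrative; %rax/%rbx/%rcx/%rdx
      are fixed by the cmpxchg family of instructions:

         singleton CAS, 64-bit case:
            movq   rExpd, rOld
            movq   rExpd, %rax
            movq   rData, %rbx
            lock; cmpxchgq %rbx, (am)       -- AMD64Instr_ACAS(am, 8)
            cmovnz %rax, rOld               -- failure: keep observed value

         double CAS, 2 x 64-bit case (needs VEX_HWCAPS_AMD64_CX16):
            movq   rExpdHi, rOldHi ; movq rExpdLo, rOldLo
            movq   rExpdHi, %rdx   ; movq rExpdLo, %rax
            movq   rDataHi, %rcx   ; movq rDataLo, %rbx
            lock; cmpxchg16b (am)           -- AMD64Instr_DACAS(am, 8)
            cmovnz %rdx, rOldHi    ; cmovnz %rax, rOldLo

      On success rOld(Lo/Hi) already holds the expected value, which
      equals the value that was in memory; the conditional moves only
      overwrite it when the compare-exchange fails. */
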
   /* --------- INSTR MARK --------- */
   /* Doesn't generate any executable code ... */
   case Ist_IMark:
       return;

   /* --------- ABI HINT --------- */
   /* These have no meaning (denotation in the IR) and so we ignore
      them ... if any actually made it this far. */
   case Ist_AbiHint:
       return;

   /* --------- NO-OP --------- */
   case Ist_NoOp:
       return;

   /* --------- EXIT --------- */
   case Ist_Exit: {
      if (stmt->Ist.Exit.dst->tag != Ico_U64)
         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");

      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
                                          hregAMD64_RBP());

      /* Case: boring transfer to known address */
      if (stmt->Ist.Exit.jk == Ijk_Boring) {
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
                                             amRIP, cc, toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
         }
         return;
      }

      /* Case: assisted transfer to arbitrary address */
      switch (stmt->Ist.Exit.jk) {
         /* Keep this list in sync with that in iselNext below */
         case Ijk_ClientReq:
         case Ijk_EmWarn:
         case Ijk_NoDecode:
         case Ijk_NoRedir:
         case Ijk_SigSEGV:
         case Ijk_SigTRAP:
         case Ijk_Sys_syscall:
         case Ijk_Sys_int210:
         case Ijk_InvalICache:
         case Ijk_Yield:
         {
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
            return;
         }
         default:
            break;
      }

      /* Do we ever expect to see any other kind? */
      goto stmt_fail;
   }

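   /* Editorial sketch of the two shapes the Ist_Exit case above can
      take (both guarded by cc; the authoritative encodings live in
      host_amd64_defs.c):

         XDirect(dstGA, amRIP, cc, toFastEP)
            -- the guest RIP (dstGA) is written to offsIP(%rbp) and a
               patchable jump slot is emitted, which LibVEX_Chain can
               later rewrite into a direct jump to the target
               translation.  toFastEP requests the target's fast
               entry point, skipping its event check; it is set only
               for forward edges (dst > env->max_ga).

         XAssisted(r, amRIP, cc, jk)
            -- the guest RIP in r is written to offsIP(%rbp) and
               control returns to the dispatcher along with the jump
               kind, so the run-time system can service syscalls,
               client requests, translation discards and so on. */
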
   default: break;
   }

  stmt_fail:
   ppIRStmt(stmt);
   vpanic("iselStmt(amd64)");
}

/*---------------------------------------------------------*/
/*--- ISEL: Basic block terminators (Nexts)             ---*/
/*---------------------------------------------------------*/

static void iselNext ( ISelEnv* env,
                       IRExpr* next, IRJumpKind jk, Int offsIP )
{
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf( "\n-- PUT(%d) = ", offsIP);
      ppIRExpr( next );
      vex_printf( "; exit-");
      ppIRJumpKind(jk);
      vex_printf( "\n");
   }

   /* Case: boring transfer to known address */
   if (next->tag == Iex_Const) {
      IRConst* cdst = next->Iex.Const.con;
      vassert(cdst->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         /* Boring transfer to known address */
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)cdst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "X" : ".");
            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
                                             amRIP, Acc_ALWAYS,
                                             toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
   }

   /* Case: call/return (==boring) transfer to any address */
   switch (jk) {
      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
         } else {
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
      default:
         break;
   }

   /* Case: assisted transfer to arbitrary address */
   switch (jk) {
      /* Keep this list in sync with that for Ist_Exit above */
      case Ijk_ClientReq:
      case Ijk_EmWarn:
      case Ijk_NoDecode:
      case Ijk_NoRedir:
      case Ijk_SigSEGV:
      case Ijk_SigTRAP:
      case Ijk_Sys_syscall:
      case Ijk_Sys_int210:
      case Ijk_InvalICache:
      case Ijk_Yield: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
         return;
      }
      default:
         break;
   }

   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0); // are we expecting any other kind?
}

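/* Editorial note on iselNext: the same XDirect/XAssisted choices as in
   Ist_Exit apply, plus one further flavour for computed targets:

      XIndir(r, amRIP, Acc_ALWAYS)
         -- used for Ijk_Boring/Ijk_Call/Ijk_Ret when chaining is
            allowed: the guest RIP in r is written to offsIP(%rbp) and
            control jumps to the dispatcher's fast indirect-transfer
            path, which looks the target up in the translation cache.

   This is a sketch; see the XDirect/XIndir/XAssisted handling in
   host_amd64_defs.c for the real emitted code. */
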
/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to amd64 code. */

HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                            VexArch      arch_host,
                            const VexArchInfo* archinfo_host,
                            const VexAbiInfo*  vbi/*UNUSED*/,
                            Int offs_Host_EvC_Counter,
                            Int offs_Host_EvC_FailAddr,
                            Bool chainingAllowed,
                            Bool addProfInc,
                            Addr max_ga )
{
   Int        i, j;
   HReg       hreg, hregHI;
   ISelEnv*   env;
   UInt       hwcaps_host = archinfo_host->hwcaps;
   AMD64AMode *amCounter, *amFailAddr;

   /* sanity ... */
   vassert(arch_host == VexArchAMD64);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_AMD64_SSE3
                     | VEX_HWCAPS_AMD64_SSSE3
                     | VEX_HWCAPS_AMD64_CX16
                     | VEX_HWCAPS_AMD64_LZCNT
                     | VEX_HWCAPS_AMD64_AVX
                     | VEX_HWCAPS_AMD64_RDTSCP
                     | VEX_HWCAPS_AMD64_BMI
                     | VEX_HWCAPS_AMD64_AVX2
                     | VEX_HWCAPS_AMD64_F16C
                     | VEX_HWCAPS_AMD64_RDRAND)));

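   /* Editorial note: the mask above is the whitelist of amd64 hwcaps
      this selector knows how to exploit; anything outside it is a
      usage error.  Individual bits are consulted where they matter,
      e.g. VEX_HWCAPS_AMD64_CX16 gates the cmpxchg16b-based double CAS
      in iselStmt above. */
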
   /* Check that the host's endianness is as expected. */
   vassert(archinfo_host->endness == VexEndnessLE);

   /* Make up an initial environment to use. */
   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
   env->vreg_ctr = 0;

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));

   /* and finally ... */
   env->chainingAllowed = chainingAllowed;
   env->hwcaps          = hwcaps_host;
   env->max_ga          = max_ga;

   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_I128:
            hreg   = mkHReg(True, HRcInt64, 0, j++);
            hregHI = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_F32:
         case Ity_F64:
         case Ity_V128:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            break;
         case Ity_V256:
            hreg   = mkHReg(True, HRcVec128, 0, j++);
            hregHI = mkHReg(True, HRcVec128, 0, j++);
            break;
         default:
            ppIRType(bb->tyenv->types[i]);
            vpanic("iselBB(amd64): IRTemp type");
      }
      env->vregmap[i]   = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;

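   /* Editorial example of the mapping just built: an Ity_I64 temp gets
      one Int64-class vreg; an Ity_I128 temp gets two (low half in
      vregmap, high half in vregmapHI); Ity_F32/Ity_F64/Ity_V128 temps
      each get one Vec128-class vreg, since this backend keeps scalar
      FP values in XMM registers; and an Ity_V256 temp gets a pair of
      Vec128-class vregs. */
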
   /* The very first instruction must be an event check. */
   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));

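   /* Roughly speaking (a sketch; see the EvCheck handling in
      host_amd64_defs.c for the real encoding), the event check
      decrements the counter held at offs_Host_EvC_Counter(%rbp) and,
      if it goes negative, jumps to the failure address stored at
      offs_Host_EvC_FailAddr(%rbp), handing control back to the
      scheduler.  Translations entered through their fast entry point
      skip this check. */
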
   /* Possibly a block counter increment (for profiling).  At this
      point we don't know the address of the counter, so just pretend
      it is zero.  It will have to be patched later, but before this
      translation is used, by a call to LibVEX_PatchProfInc. */
   if (addProfInc) {
      addInstr(env, AMD64Instr_ProfInc());
   }

   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      if (bb->stmts[i])
         iselStmt(env, bb->stmts[i]);

   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}

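/* Editorial note: the HInstrArray returned here still refers to
   virtual registers (n_vregs of them).  It is the caller's job, in the
   generic VEX translation pipeline, to run register allocation over it
   and then emit the final amd64 machine code; that is why this
   function only records env->vreg_ctr and does no allocation itself. */
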
/*---------------------------------------------------------------*/
/*--- end                                   host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/