amd64: Implement RDRAND, VCVTPH2PS and VCVTPS2PH.
2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
28 The GNU General Public License is contained in the file COPYING.
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
36 #include "libvex_basictypes.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
40 #include "ir_match.h"
41 #include "main_util.h"
42 #include "main_globals.h"
43 #include "host_generic_regs.h"
44 #include "host_generic_simd64.h"
45 #include "host_generic_simd128.h"
46 #include "host_generic_simd256.h"
47 #include "host_generic_maddf.h"
48 #include "host_amd64_defs.h"
51 /*---------------------------------------------------------*/
52 /*--- x87/SSE control word stuff ---*/
53 /*---------------------------------------------------------*/
55 /* Vex-generated code expects to run with the FPU set as follows: all
56 exceptions masked, round-to-nearest, precision = 53 bits. This
57 corresponds to an FPU control word value of 0x027F.
59 Similarly the SSE control word (%mxcsr) should be 0x1F80.
61 %fpucw and %mxcsr should have these values on entry to
62 Vex-generated code, and those values should be
63 unchanged at exit.
66 #define DEFAULT_FPUCW 0x027F
68 #define DEFAULT_MXCSR 0x1F80
70 /* debugging only, do not use */
71 /* define DEFAULT_FPUCW 0x037F */
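/* For reference, a rough decomposition of the two defaults above,
   assuming the standard x87 FPCW and SSE MXCSR bit layouts:

      DEFAULT_FPUCW = 0x027F
         bits 0..5    all x87 exceptions masked
         bit  6       reserved
         bits 8..9    = 10b: precision control = 53 bits (double)
         bits 10..11  = 00b: rounding = to-nearest

      DEFAULT_MXCSR = 0x1F80
         bits 7..12   all SSE exceptions masked
         bits 13..14  = 00b: rounding = to-nearest
         FTZ (bit 15) and DAZ (bit 6) both clear
*/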
74 /*---------------------------------------------------------*/
75 /*--- misc helpers ---*/
76 /*---------------------------------------------------------*/
78 /* These are duplicated in guest-amd64/toIR.c */
79 static IRExpr* unop ( IROp op, IRExpr* a )
81 return IRExpr_Unop(op, a);
84 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
86 return IRExpr_Binop(op, a1, a2);
89 static IRExpr* bind ( Int binder )
91 return IRExpr_Binder(binder);
94 static Bool isZeroU8 ( const IRExpr* e )
96 return e->tag == Iex_Const
97 && e->Iex.Const.con->tag == Ico_U8
98 && e->Iex.Const.con->Ico.U8 == 0;
102 /*---------------------------------------------------------*/
103 /*--- ISelEnv ---*/
104 /*---------------------------------------------------------*/
106 /* This carries around:
108 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109 might encounter. This is computed before insn selection starts,
110 and does not change.
112 - A mapping from IRTemp to HReg. This tells the insn selector
113 which virtual register is associated with each IRTemp
114 temporary. This is computed before insn selection starts, and
115 does not change. We expect this mapping to map precisely the
116 same set of IRTemps as the type mapping does.
118 - vregmap holds the primary register for the IRTemp.
119 - vregmapHI is only used for 128-bit integer-typed
120 IRTemps. It holds the identity of a second
121 64-bit virtual HReg, which holds the high half
122 of the value.
124 - The host subarchitecture we are selecting insns for.
125 This is set at the start and does not change.
127 - The code array, that is, the insns selected so far.
129 - A counter, for generating new virtual registers.
131 - A Bool for indicating whether we may generate chain-me
132 instructions for control flow transfers, or whether we must use
133 XAssisted.
135 - The maximum guest address of any guest insn in this block.
136 Actually, the address of the highest-addressed byte from any insn
137 in this block. Is set at the start and does not change. This is
138 used for detecting jumps which are definitely forward-edges from
139 this block, and therefore can be made (chained) to the fast entry
140 point of the destination, thereby avoiding the destination's
141 event check.
143 Note, this is all host-independent. (JRS 20050201: well, kinda
144 ... not completely. Compare with ISelEnv for X86.)
147 typedef
148 struct {
149 /* Constant fields -- set at the start and do not change. */
150 IRTypeEnv* type_env;
152 HReg* vregmap;
153 HReg* vregmapHI;
154 Int n_vregmap;
156 UInt hwcaps;
158 Bool chainingAllowed;
159 Addr64 max_ga;
161 /* These are modified as we go along. */
162 HInstrArray* code;
163 Int vreg_ctr;
165 ISelEnv;
168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
170 vassert(tmp >= 0);
171 vassert(tmp < env->n_vregmap);
172 return env->vregmap[tmp];
175 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
176 ISelEnv* env, IRTemp tmp )
178 vassert(tmp >= 0);
179 vassert(tmp < env->n_vregmap);
180 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
181 *vrLO = env->vregmap[tmp];
182 *vrHI = env->vregmapHI[tmp];
185 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
187 addHInstr(env->code, instr);
188 if (vex_traceflags & VEX_TRACE_VCODE) {
189 ppAMD64Instr(instr, True);
190 vex_printf("\n");
194 static HReg newVRegI ( ISelEnv* env )
196 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
197 env->vreg_ctr++;
198 return reg;
201 static HReg newVRegV ( ISelEnv* env )
203 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
204 env->vreg_ctr++;
205 return reg;
209 /*---------------------------------------------------------*/
210 /*--- ISEL: Forward declarations ---*/
211 /*---------------------------------------------------------*/
213 /* These are organised as iselXXX and iselXXX_wrk pairs. The
214 iselXXX_wrk functions do the real work, but are not to be called directly.
215 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
216 checks that all returned registers are virtual. You should not
217 call the _wrk version directly.
219 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
220 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
222 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
223 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
225 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
226 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
228 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
229 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
231 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
232 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
234 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, const IRExpr* e );
236 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
237 ISelEnv* env, const IRExpr* e );
239 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
240 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
242 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
243 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
245 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
246 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
248 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
249 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
251 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, const IRExpr* e );
253 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
254 ISelEnv* env, const IRExpr* e );
257 /*---------------------------------------------------------*/
258 /*--- ISEL: Misc helpers ---*/
259 /*---------------------------------------------------------*/
261 static Bool sane_AMode ( AMD64AMode* am )
263 switch (am->tag) {
264 case Aam_IR:
265 return
266 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267 && (hregIsVirtual(am->Aam.IR.reg)
268 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269 case Aam_IRRS:
270 return
271 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.base)
273 && hregClass(am->Aam.IRRS.index) == HRcInt64
274 && hregIsVirtual(am->Aam.IRRS.index) );
275 default:
276 vpanic("sane_AMode: unknown amd64 amode tag");
281 /* Can the lower 32 bits be signedly widened to produce the whole
282 64-bit value? In other words, are the top 33 bits either all 0 or
283 all 1 ? */
284 static Bool fitsIn32Bits ( ULong x )
286 Long y1;
287 y1 = x << 32;
288 y1 >>=/*s*/ 32;
289 return toBool(x == y1);
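/* For example:
      fitsIn32Bits(0x000000007FFFFFFFULL) == True    (top 33 bits all 0)
      fitsIn32Bits(0xFFFFFFFF80000000ULL) == True    (top 33 bits all 1)
      fitsIn32Bits(0x0000000080000000ULL) == False   (bit 31 set, bit 32 clear)
*/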
292 /* Is this a 64-bit zero expression? */
294 static Bool isZeroU64 ( const IRExpr* e )
296 return e->tag == Iex_Const
297 && e->Iex.Const.con->tag == Ico_U64
298 && e->Iex.Const.con->Ico.U64 == 0ULL;
301 static Bool isZeroU32 ( const IRExpr* e )
303 return e->tag == Iex_Const
304 && e->Iex.Const.con->tag == Ico_U32
305 && e->Iex.Const.con->Ico.U32 == 0;
308 /* Are both args atoms and the same? This is a copy of eqIRAtom
309 that omits the assertions that the args are indeed atoms. */
311 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
313 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
314 return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
315 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
316 return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
317 return False;
320 /* Make an int reg-reg move. */
322 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
324 vassert(hregClass(src) == HRcInt64);
325 vassert(hregClass(dst) == HRcInt64);
326 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
329 /* Make a vector (128 bit) reg-reg move. */
331 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
333 vassert(hregClass(src) == HRcVec128);
334 vassert(hregClass(dst) == HRcVec128);
335 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
338 /* Advance/retreat %rsp by n. */
340 static void add_to_rsp ( ISelEnv* env, Int n )
342 vassert(n > 0 && n < 256 && (n%8) == 0);
343 addInstr(env,
344 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
345 hregAMD64_RSP()));
348 static void sub_from_rsp ( ISelEnv* env, Int n )
350 vassert(n > 0 && n < 256 && (n%8) == 0);
351 addInstr(env,
352 AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
353 hregAMD64_RSP()));
356 /* Push a 64-bit constant on the stack. */
357 static void push_uimm64( ISelEnv* env, ULong uimm64 )
359 /* If uimm64 can be expressed as the sign extension of its
360 lower 32 bits, we can do it the easy way. */
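   /* E.g. 0xFFFFFFFF80000000 sign-extends from its low 32 bits, so a
      single push of the 32-bit immediate 0x80000000 suffices, whereas
      0x0000000080000000 does not, and takes the Imm64-into-a-temp
      route below. */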
361 Long simm64 = (Long)uimm64;
362 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
363 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
364 } else {
365 HReg tmp = newVRegI(env);
366 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
367 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
372 /* Used only in doHelperCall. If possible, produce a single
373 instruction which computes 'e' into 'dst'. If not possible, return
374 NULL. */
376 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
377 HReg dst,
378 IRExpr* e )
380 /* Per comments in doHelperCall below, appearance of
381 Iex_VECRET implies ill-formed IR. */
382 vassert(e->tag != Iex_VECRET);
384 /* In this case we give out a copy of the BaseBlock pointer. */
385 if (UNLIKELY(e->tag == Iex_GSPTR)) {
386 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
389 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
391 if (e->tag == Iex_Const) {
392 vassert(e->Iex.Const.con->tag == Ico_U64);
393 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
394 return AMD64Instr_Alu64R(
395 Aalu_MOV,
396 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
399 } else {
400 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
404 if (e->tag == Iex_RdTmp) {
405 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
406 return mk_iMOVsd_RR(src, dst);
409 if (e->tag == Iex_Get) {
410 vassert(e->Iex.Get.ty == Ity_I64);
411 return AMD64Instr_Alu64R(
412 Aalu_MOV,
413 AMD64RMI_Mem(
414 AMD64AMode_IR(e->Iex.Get.offset,
415 hregAMD64_RBP())),
416 dst);
419 if (e->tag == Iex_Unop
420 && e->Iex.Unop.op == Iop_32Uto64
421 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
422 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
423 return AMD64Instr_MovxLQ(False, src, dst);
426 if (0) { ppIRExpr(e); vex_printf("\n"); }
428 return NULL;
432 /* Do a complete function call. |guard| is a Ity_Bit expression
433 indicating whether or not the call happens. If guard==NULL, the
434 call is unconditional. |retloc| is set to indicate where the
435 return value is after the call. The caller (of this fn) must
436 generate code to add |stackAdjustAfterCall| to the stack pointer
437 after the call is done. */
439 static
440 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
441 /*OUT*/RetLoc* retloc,
442 ISelEnv* env,
443 IRExpr* guard,
444 IRCallee* cee, IRType retTy, IRExpr** args )
446 AMD64CondCode cc;
447 HReg argregs[6];
448 HReg tmpregs[6];
449 AMD64Instr* fastinstrs[6];
450 UInt n_args, i;
452 /* Set default returns. We'll update them later if needed. */
453 *stackAdjustAfterCall = 0;
454 *retloc = mk_RetLoc_INVALID();
456 /* These are used for cross-checking that IR-level constraints on
457 the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
458 UInt nVECRETs = 0;
459 UInt nGSPTRs = 0;
461 /* Marshal args for a call and do the call.
463 This function only deals with a tiny set of possibilities, which
464 cover all helpers in practice. The restrictions are that only
465 arguments in registers are supported, hence only 6x64 integer
466 bits in total can be passed. In fact the only supported arg
467 type is I64.
469 The return type can be I{64,32,16,8} or V{128,256}. In the
470 latter two cases, it is expected that |args| will contain the
471 special node IRExpr_VECRET(), in which case this routine
472 generates code to allocate space on the stack for the vector
473 return value. Since we are not passing any scalars on the
474 stack, it is enough to preallocate the return space before
475 marshalling any arguments, in this case.
477 |args| may also contain IRExpr_GSPTR(), in which case the
478 value in %rbp is passed as the corresponding argument.
480 Generating code which is both efficient and correct when
481 parameters are to be passed in registers is difficult, for the
482 reasons elaborated in detail in comments attached to
483 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
484 of the method described in those comments.
486 The problem is split into two cases: the fast scheme and the
487 slow scheme. In the fast scheme, arguments are computed
488 directly into the target (real) registers. This is only safe
489 when we can be sure that computation of each argument will not
490 trash any real registers set by computation of any other
491 argument.
493 In the slow scheme, all args are first computed into vregs, and
494 once they are all done, they are moved to the relevant real
495 regs. This always gives correct code, but it also gives a bunch
496 of vreg-to-rreg moves which are usually redundant but are hard
497 for the register allocator to get rid of.
499 To decide which scheme to use, all argument expressions are
500 first examined. If they are all so simple that it is clear they
501 will be evaluated without use of any fixed registers, use the
502 fast scheme, else use the slow scheme. Note also that only
503 unconditional calls may use the fast scheme, since having to
504 compute a condition expression could itself trash real
505 registers. Note that for simplicity, in the case where
506 IRExpr_VECRET() is present, we use the slow scheme. This is
507 motivated by the desire to avoid any possible complexity
508 w.r.t. nested calls.
510 Note this requires being able to examine an expression and
511 determine whether or not evaluation of it might use a fixed
512 register. That requires knowledge of how the rest of this insn
513 selector works. Currently just the following 3 are regarded as
514 safe -- hopefully they cover the majority of arguments in
515 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
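      E.g. (a made-up case): for a call h(t1, 0x42:I64, GET:I64(16)),
      every argument can be computed with a single instruction, so the
      fast scheme applies and each one is evaluated directly into
      %rdi/%rsi/%rdx.  If any argument were instead, say, Add64(t1,t2),
      the whole call would fall back to the slow scheme.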
518 /* Note that the cee->regparms field is meaningless on AMD64 host
519 (since there is only one calling convention) and so we always
520 ignore it. */
521 n_args = 0;
522 for (i = 0; args[i]; i++)
523 n_args++;
525 if (n_args > 6)
526 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
528 argregs[0] = hregAMD64_RDI();
529 argregs[1] = hregAMD64_RSI();
530 argregs[2] = hregAMD64_RDX();
531 argregs[3] = hregAMD64_RCX();
532 argregs[4] = hregAMD64_R8();
533 argregs[5] = hregAMD64_R9();
535 tmpregs[0] = tmpregs[1] = tmpregs[2] =
536 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
538 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
539 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
541 /* First decide which scheme (slow or fast) is to be used. Start by
542 assuming the fast scheme, and select the slow one if any
543 contraindications (wow) appear.
545 /* We'll need space on the stack for the return value. Avoid
546 possible complications with nested calls by using the slow
547 scheme. */
548 if (retTy == Ity_V128 || retTy == Ity_V256)
549 goto slowscheme;
551 if (guard) {
552 if (guard->tag == Iex_Const
553 && guard->Iex.Const.con->tag == Ico_U1
554 && guard->Iex.Const.con->Ico.U1 == True) {
555 /* unconditional */
556 } else {
557 /* Not manifestly unconditional -- be conservative. */
558 goto slowscheme;
562 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
563 use the slow scheme. Because this is tentative, we can't call
564 addInstr (that is, commit to) any instructions until we've
565 handled all the arguments. So park the resulting instructions
566 in a buffer and emit that if we're successful. */
568 /* FAST SCHEME */
569 /* In this loop, we process args that can be computed into the
570 destination (real) register with a single instruction, without
571 using any fixed regs. That also includes IRExpr_GSPTR(), but
572 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
573 never see IRExpr_VECRET() at this point, since the return-type
574 check above should ensure all those cases use the slow scheme
575 instead. */
576 vassert(n_args >= 0 && n_args <= 6);
577 for (i = 0; i < n_args; i++) {
578 IRExpr* arg = args[i];
579 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
580 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
582 fastinstrs[i]
583 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
584 if (fastinstrs[i] == NULL)
585 goto slowscheme;
588 /* Looks like we're in luck. Emit the accumulated instructions and
589 move on to doing the call itself. */
590 for (i = 0; i < n_args; i++)
591 addInstr(env, fastinstrs[i]);
593 /* Fast scheme only applies for unconditional calls. Hence: */
594 cc = Acc_ALWAYS;
596 goto handle_call;
599 /* SLOW SCHEME; move via temporaries */
600 slowscheme:
602 # if 0 /* debug only */
603 if (n_args > 0) {for (i = 0; args[i]; i++) {
604 ppIRExpr(args[i]); vex_printf(" "); }
605 vex_printf("\n");}
606 # endif
608 /* If we have a vector return type, allocate a place for it on the
609 stack and record its address. */
610 HReg r_vecRetAddr = INVALID_HREG;
611 if (retTy == Ity_V128) {
612 r_vecRetAddr = newVRegI(env);
613 sub_from_rsp(env, 16);
614 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
616 else if (retTy == Ity_V256) {
617 r_vecRetAddr = newVRegI(env);
618 sub_from_rsp(env, 32);
619 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
622 vassert(n_args >= 0 && n_args <= 6);
623 for (i = 0; i < n_args; i++) {
624 IRExpr* arg = args[i];
625 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
626 tmpregs[i] = newVRegI(env);
627 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
628 nGSPTRs++;
630 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
631 /* We stashed the address of the return slot earlier, so just
632 retrieve it now. */
633 vassert(!hregIsInvalid(r_vecRetAddr));
634 tmpregs[i] = r_vecRetAddr;
635 nVECRETs++;
637 else {
638 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
639 tmpregs[i] = iselIntExpr_R(env, args[i]);
643 /* Now we can compute the condition. We can't do it earlier
644 because the argument computations could trash the condition
645 codes. Be a bit clever to handle the common case where the
646 guard is 1:Bit. */
647 cc = Acc_ALWAYS;
648 if (guard) {
649 if (guard->tag == Iex_Const
650 && guard->Iex.Const.con->tag == Ico_U1
651 && guard->Iex.Const.con->Ico.U1 == True) {
652 /* unconditional -- do nothing */
653 } else {
654 cc = iselCondCode( env, guard );
658 /* Move the args to their final destinations. */
659 for (i = 0; i < n_args; i++) {
660 /* None of these insns, including any spill code that might
661 be generated, may alter the condition codes. */
662 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
666 /* Do final checks, set the return values, and generate the call
667 instruction proper. */
668 handle_call:
670 if (retTy == Ity_V128 || retTy == Ity_V256) {
671 vassert(nVECRETs == 1);
672 } else {
673 vassert(nVECRETs == 0);
676 vassert(nGSPTRs == 0 || nGSPTRs == 1);
678 vassert(*stackAdjustAfterCall == 0);
679 vassert(is_RetLoc_INVALID(*retloc));
680 switch (retTy) {
681 case Ity_INVALID:
682 /* Function doesn't return a value. */
683 *retloc = mk_RetLoc_simple(RLPri_None);
684 break;
685 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
686 *retloc = mk_RetLoc_simple(RLPri_Int);
687 break;
688 case Ity_V128:
689 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
690 *stackAdjustAfterCall = 16;
691 break;
692 case Ity_V256:
693 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
694 *stackAdjustAfterCall = 32;
695 break;
696 default:
697 /* IR can denote other possible return types, but we don't
698 handle those here. */
699 vassert(0);
702 /* Finally, generate the call itself. This needs the *retloc value
703 set in the switch above, which is why it's at the end. */
704 addInstr(env,
705 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
709 /* Given a guest-state array descriptor, an index expression and a
710 bias, generate an AMD64AMode holding the relevant guest state
711 offset. */
713 static
714 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
715 IRExpr* off, Int bias )
717 HReg tmp, roff;
718 Int elemSz = sizeofIRType(descr->elemTy);
719 Int nElems = descr->nElems;
721 /* Throw out any cases not generated by an amd64 front end. In
722 theory there might be a day where we need to handle them -- if
723 we ever run non-amd64-guest on amd64 host. */
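   /* In practice the only such arrays the amd64 front end is believed
      to generate accesses to are the guest's 8-entry x87 register and
      tag files (8-byte and 1-byte elements respectively), which is
      what the check below allows. */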
725 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
726 vpanic("genGuestArrayOffset(amd64 host)");
728 /* Compute off into a reg, %off. Then return:
730 movq %off, %tmp
731 addq $bias, %tmp (if bias != 0)
732 andq $7, %tmp
733 ... base(%rbp, %tmp, shift) ...
735 tmp = newVRegI(env);
736 roff = iselIntExpr_R(env, off);
737 addInstr(env, mk_iMOVsd_RR(roff, tmp));
738 if (bias != 0) {
739 /* Make sure the bias is sane, in the sense that there are
740 no significant bits above bit 30 in it. */
741 vassert(-10000 < bias && bias < 10000);
742 addInstr(env,
743 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
745 addInstr(env,
746 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
747 vassert(elemSz == 1 || elemSz == 8);
748 return
749 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
750 elemSz==8 ? 3 : 0);
754 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
755 static
756 void set_SSE_rounding_default ( ISelEnv* env )
758 /* pushq $DEFAULT_MXCSR
759 ldmxcsr 0(%rsp)
760 addq $8, %rsp
762 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
763 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
764 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
765 add_to_rsp(env, 8);
768 /* Mess with the FPU's rounding mode: set to the default rounding mode
769 (DEFAULT_FPUCW). */
770 static
771 void set_FPU_rounding_default ( ISelEnv* env )
773 /* movq $DEFAULT_FPUCW, -8(%rsp)
774 fldcw -8(%rsp)
776 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
777 addInstr(env, AMD64Instr_Alu64M(
778 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
779 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
783 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
784 expression denoting a value in the range 0 .. 3, indicating a round
785 mode encoded as per type IRRoundingMode. Set the SSE machinery to
786 have the same rounding.
788 static
789 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
791 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
792 both rounding bits == 0. If that wasn't the case, we couldn't
793 create a new rounding field simply by ORing the new value into
794 place. */
796 /* movq $3, %reg
797 andq [[mode]], %reg -- shouldn't be needed; paranoia
798 shlq $13, %reg
799 orq $DEFAULT_MXCSR, %reg
800 pushq %reg
801 ldmxcsr 0(%rsp)
802 addq $8, %rsp
804 HReg reg = newVRegI(env);
805 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
806 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
807 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
808 iselIntExpr_RMI(env, mode), reg));
809 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
810 addInstr(env, AMD64Instr_Alu64R(
811 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
812 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
813 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
814 add_to_rsp(env, 8);
818 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
819 expression denoting a value in the range 0 .. 3, indicating a round
820 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
821 the same rounding.
823 static
824 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
826 HReg rrm = iselIntExpr_R(env, mode);
827 HReg rrm2 = newVRegI(env);
828 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
830 /* movq %rrm, %rrm2
831 andq $3, %rrm2 -- shouldn't be needed; paranoia
832 shlq $10, %rrm2
833 orq $DEFAULT_FPUCW, %rrm2
834 movq %rrm2, -8(%rsp)
835 fldcw -8(%rsp)
837 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
838 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
839 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
840 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
841 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
842 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
843 AMD64RI_Reg(rrm2), m8_rsp));
844 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
848 /* Generate all-zeroes into a new vector register.
850 static HReg generate_zeroes_V128 ( ISelEnv* env )
852 HReg dst = newVRegV(env);
853 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
854 return dst;
857 /* Generate all-ones into a new vector register.
859 static HReg generate_ones_V128 ( ISelEnv* env )
861 HReg dst = newVRegV(env);
862 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
863 return dst;
867 /* Generate !src into a new vector register. Amazing that there isn't
868 a less crappy way to do this.
870 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
872 HReg dst = generate_ones_V128(env);
873 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
874 return dst;
878 /* Expand the given byte into a 64-bit word, by cloning each bit
879 8 times. */
880 static ULong bitmask8_to_bytemask64 ( UShort w8 )
882 vassert(w8 == (w8 & 0xFF));
883 ULong w64 = 0;
884 Int i;
885 for (i = 0; i < 8; i++) {
886 if (w8 & (1<<i))
887 w64 |= (0xFFULL << (8 * i));
889 return w64;
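/* E.g. bitmask8_to_bytemask64(0x05) == 0x0000000000FF00FFULL
   and bitmask8_to_bytemask64(0x80) == 0xFF00000000000000ULL. */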
893 /*---------------------------------------------------------*/
894 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
895 /*---------------------------------------------------------*/
897 /* Select insns for an integer-typed expression, and add them to the
898 code list. Return a reg holding the result. This reg will be a
899 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
900 want to modify it, ask for a new vreg, copy it in there, and modify
901 the copy. The register allocator will do its best to map both
902 vregs to the same real register, so the copies will often disappear
903 later in the game.
905 This should handle expressions of 64, 32, 16 and 8-bit type. All
906 results are returned in a 64-bit register. For 32-, 16- and 8-bit
907 expressions, the upper 32/48/56 bits are arbitrary, so you should
908 mask or sign extend partial values if necessary.
911 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
913 HReg r = iselIntExpr_R_wrk(env, e);
914 /* sanity checks ... */
915 # if 0
916 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
917 # endif
918 vassert(hregClass(r) == HRcInt64);
919 vassert(hregIsVirtual(r));
920 return r;
923 /* DO NOT CALL THIS DIRECTLY ! */
924 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
926 MatchInfo mi;
927 DECLARE_PATTERN(p_1Uto8_64to1);
928 DECLARE_PATTERN(p_LDle8_then_8Uto64);
929 DECLARE_PATTERN(p_LDle16_then_16Uto64);
931 IRType ty = typeOfIRExpr(env->type_env,e);
932 switch (ty) {
933 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
934 default: vassert(0);
937 switch (e->tag) {
939 /* --------- TEMP --------- */
940 case Iex_RdTmp: {
941 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
944 /* --------- LOAD --------- */
945 case Iex_Load: {
946 HReg dst = newVRegI(env);
947 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
949 /* We can't handle big-endian loads, nor load-linked. */
950 if (e->Iex.Load.end != Iend_LE)
951 goto irreducible;
953 if (ty == Ity_I64) {
954 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
955 AMD64RMI_Mem(amode), dst) );
956 return dst;
958 if (ty == Ity_I32) {
959 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
960 return dst;
962 if (ty == Ity_I16) {
963 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
964 return dst;
966 if (ty == Ity_I8) {
967 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
968 return dst;
970 break;
973 /* --------- BINARY OP --------- */
974 case Iex_Binop: {
975 AMD64AluOp aluOp;
976 AMD64ShiftOp shOp;
978 /* Pattern: Sub64(0,x) */
979 /* and: Sub32(0,x) */
980 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
981 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
982 HReg dst = newVRegI(env);
983 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
984 addInstr(env, mk_iMOVsd_RR(reg,dst));
985 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
986 return dst;
989 /* Is it an addition or logical style op? */
990 switch (e->Iex.Binop.op) {
991 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
992 aluOp = Aalu_ADD; break;
993 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
994 aluOp = Aalu_SUB; break;
995 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
996 aluOp = Aalu_AND; break;
997 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
998 aluOp = Aalu_OR; break;
999 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
1000 aluOp = Aalu_XOR; break;
1001 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1002 aluOp = Aalu_MUL; break;
1003 default:
1004 aluOp = Aalu_INVALID; break;
1006 /* For commutative ops we assume any literal
1007 values are on the second operand. */
1008 if (aluOp != Aalu_INVALID) {
1009 HReg dst = newVRegI(env);
1010 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1011 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1012 addInstr(env, mk_iMOVsd_RR(reg,dst));
1013 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1014 return dst;
1017 /* Perhaps a shift op? */
1018 switch (e->Iex.Binop.op) {
1019 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1020 shOp = Ash_SHL; break;
1021 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1022 shOp = Ash_SHR; break;
1023 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1024 shOp = Ash_SAR; break;
1025 default:
1026 shOp = Ash_INVALID; break;
1028 if (shOp != Ash_INVALID) {
1029 HReg dst = newVRegI(env);
1031 /* regL = the value to be shifted */
1032 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1033 addInstr(env, mk_iMOVsd_RR(regL,dst));
1035 /* Do any necessary widening for 32/16/8 bit operands */
1036 switch (e->Iex.Binop.op) {
1037 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1038 break;
1039 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1040 break;
1041 case Iop_Shr8:
1042 addInstr(env, AMD64Instr_Alu64R(
1043 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1044 break;
1045 case Iop_Shr16:
1046 addInstr(env, AMD64Instr_Alu64R(
1047 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1048 break;
1049 case Iop_Shr32:
1050 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1051 break;
1052 case Iop_Sar8:
1053 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1054 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1055 break;
1056 case Iop_Sar16:
1057 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1058 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1059 break;
1060 case Iop_Sar32:
1061 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1062 break;
1063 default:
1064 ppIROp(e->Iex.Binop.op);
1065 vassert(0);
1068 /* Now consider the shift amount. If it's a literal, we
1069 can do a much better job than the general case. */
1070 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1071 /* assert that the IR is well-typed */
1072 Int nshift;
1073 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1074 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1075 vassert(nshift >= 0);
1076 if (nshift > 0)
1077 /* Can't allow nshift==0 since that means %cl */
1078 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1079 } else {
1080 /* General case; we have to force the amount into %cl. */
1081 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1082 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1083 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1085 return dst;
1088 /* Handle misc other scalar ops. */
1089 if (e->Iex.Binop.op == Iop_Max32U) {
1090 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1091 HReg dst = newVRegI(env);
1092 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1093 addInstr(env, mk_iMOVsd_RR(src1, dst));
1094 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1095 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1096 return dst;
1099 if (e->Iex.Binop.op == Iop_DivModS64to32
1100 || e->Iex.Binop.op == Iop_DivModU64to32) {
1101 /* 64 x 32 -> (32(rem),32(div)) division */
1102 /* Get the 64-bit operand into edx:eax, and the other into
1103 any old R/M. */
1104 HReg rax = hregAMD64_RAX();
1105 HReg rdx = hregAMD64_RDX();
1106 HReg dst = newVRegI(env);
1107 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1108 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1109 /* Compute the left operand into a reg, and then
1110 put the top half in edx and the bottom in eax. */
1111 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1112 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1113 addInstr(env, mk_iMOVsd_RR(left64, rax));
1114 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1115 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1116 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1117 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1118 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1119 addInstr(env, mk_iMOVsd_RR(rax, dst));
1120 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1121 return dst;
1124 if (e->Iex.Binop.op == Iop_32HLto64) {
1125 HReg hi32 = newVRegI(env);
1126 HReg lo32 = newVRegI(env);
1127 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1128 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1129 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1130 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1131 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1132 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1133 addInstr(env, AMD64Instr_Alu64R(
1134 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1135 return hi32;
1138 if (e->Iex.Binop.op == Iop_16HLto32) {
1139 HReg hi16 = newVRegI(env);
1140 HReg lo16 = newVRegI(env);
1141 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1142 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1143 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1144 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1145 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1148 addInstr(env, AMD64Instr_Alu64R(
1149 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1150 return hi16;
1153 if (e->Iex.Binop.op == Iop_8HLto16) {
1154 HReg hi8 = newVRegI(env);
1155 HReg lo8 = newVRegI(env);
1156 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1157 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1158 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1159 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1160 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1163 addInstr(env, AMD64Instr_Alu64R(
1164 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1165 return hi8;
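      /* Widening multiplies: a sketch of the scheme used below.  Each
         32/16/8-bit argument is first widened to 64 bits in a general
         register -- sign-extended via shl/sar for the signed variants,
         zero-extended via shl/shr for the unsigned ones -- after which
         a single 64-bit multiply leaves the full double-width product
         in the low bits of the destination register. */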
1168 if (e->Iex.Binop.op == Iop_MullS32
1169 || e->Iex.Binop.op == Iop_MullS16
1170 || e->Iex.Binop.op == Iop_MullS8
1171 || e->Iex.Binop.op == Iop_MullU32
1172 || e->Iex.Binop.op == Iop_MullU16
1173 || e->Iex.Binop.op == Iop_MullU8) {
1174 HReg a32 = newVRegI(env);
1175 HReg b32 = newVRegI(env);
1176 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1177 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1178 Int shift = 0;
1179 AMD64ShiftOp shr_op = Ash_SHR;
1180 switch (e->Iex.Binop.op) {
1181 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1182 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1183 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1184 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1185 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1186 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1187 default: vassert(0);
1190 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1191 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1192 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1194 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1195 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1196 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1197 return b32;
1200 if (e->Iex.Binop.op == Iop_CmpF64) {
1201 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1202 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1203 HReg dst = newVRegI(env);
1204 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1205 /* Mask out irrelevant parts of the result so as to conform
1206 to the CmpF64 definition. */
1207 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1208 return dst;
1211 if (e->Iex.Binop.op == Iop_F64toI32S
1212 || e->Iex.Binop.op == Iop_F64toI64S) {
1213 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1214 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1215 HReg dst = newVRegI(env);
1216 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1217 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1218 set_SSE_rounding_default(env);
1219 return dst;
1222 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1223 by using the equivalent 128-bit operation and ignoring the upper half
1224 of the result. */
1225 AMD64SseOp op = Asse_INVALID;
1226 Bool arg1isEReg = False;
1227 Bool preShift32R = False;
1228 switch (e->Iex.Binop.op) {
1229 // The following 3 could be done with 128 bit insns too, but
1230 // first require the inputs to be reformatted.
1231 //case Iop_QNarrowBin32Sto16Sx4:
1232 //op = Asse_PACKSSD; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Sx8:
1234 //op = Asse_PACKSSW; arg1isEReg = True; break;
1235 //case Iop_QNarrowBin16Sto8Ux8:
1236 //op = Asse_PACKUSW; arg1isEReg = True; break;
1238 case Iop_InterleaveHI8x8:
1239 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1240 break;
1241 case Iop_InterleaveHI16x4:
1242 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1243 break;
1244 case Iop_InterleaveHI32x2:
1245 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1246 break;
1247 case Iop_InterleaveLO8x8:
1248 op = Asse_UNPCKLB; arg1isEReg = True;
1249 break;
1250 case Iop_InterleaveLO16x4:
1251 op = Asse_UNPCKLW; arg1isEReg = True;
1252 break;
1253 case Iop_InterleaveLO32x2:
1254 op = Asse_UNPCKLD; arg1isEReg = True;
1255 break;
1257 case Iop_Add8x8: op = Asse_ADD8; break;
1258 case Iop_Add16x4: op = Asse_ADD16; break;
1259 case Iop_Add32x2: op = Asse_ADD32; break;
1260 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1261 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1262 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1263 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1264 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1265 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1266 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1267 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1268 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1269 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1270 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1271 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1272 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1273 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1274 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1275 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1276 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1277 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1278 case Iop_Mul16x4: op = Asse_MUL16; break;
1279 case Iop_Sub8x8: op = Asse_SUB8; break;
1280 case Iop_Sub16x4: op = Asse_SUB16; break;
1281 case Iop_Sub32x2: op = Asse_SUB32; break;
1282 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1283 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1284 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1285 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1286 default: break;
1288 if (op != Asse_INVALID) {
1289 /* This isn't pretty, but ... move each arg to the low half of an XMM
1290 register, do the operation on the whole register, and move the
1291 result back to an integer register. */
1292 const IRExpr* arg1 = e->Iex.Binop.arg1;
1293 const IRExpr* arg2 = e->Iex.Binop.arg2;
1294 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1295 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1296 HReg iarg1 = iselIntExpr_R(env, arg1);
1297 HReg iarg2 = iselIntExpr_R(env, arg2);
1298 HReg varg1 = newVRegV(env);
1299 HReg varg2 = newVRegV(env);
1300 HReg idst = newVRegI(env);
1301 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1302 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1303 if (arg1isEReg) {
1304 if (preShift32R) {
1305 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1306 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1308 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1309 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1310 } else {
1311 vassert(!preShift32R);
1312 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1313 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1315 return idst;
1318 UInt laneBits = 0;
1319 op = Asse_INVALID;
1320 switch (e->Iex.Binop.op) {
1321 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1322 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1323 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1324 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1325 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1326 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1327 default: break;
1329 if (op != Asse_INVALID) {
1330 const IRExpr* arg1 = e->Iex.Binop.arg1;
1331 const IRExpr* arg2 = e->Iex.Binop.arg2;
1332 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1333 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1334 HReg igreg = iselIntExpr_R(env, arg1);
1335 HReg vgreg = newVRegV(env);
1336 HReg idst = newVRegI(env);
1337 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1338 /* If it's a shift by an in-range immediate, generate a single
1339 instruction. */
1340 if (arg2->tag == Iex_Const) {
1341 IRConst* c = arg2->Iex.Const.con;
1342 vassert(c->tag == Ico_U8);
1343 UInt shift = c->Ico.U8;
1344 if (shift < laneBits) {
1345 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1346 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1347 return idst;
1350 /* Otherwise we have to do it the longwinded way. */
1351 HReg ishift = iselIntExpr_R(env, arg2);
1352 HReg vshift = newVRegV(env);
1353 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1354 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1355 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1356 return idst;
1359 if (e->Iex.Binop.op == Iop_Mul32x2) {
1360 const IRExpr* arg1 = e->Iex.Binop.arg1;
1361 const IRExpr* arg2 = e->Iex.Binop.arg2;
1362 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1363 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1364 HReg s1 = iselIntExpr_R(env, arg1);
1365 HReg s2 = iselIntExpr_R(env, arg2);
1366 HReg resLo = newVRegI(env);
1367 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1368 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1369 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1370 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1372 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1373 HReg resHi = newVRegI(env);
1374 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1375 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1376 HReg tmp = newVRegI(env);
1377 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1379 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1380 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1382 // final result = resHi | resLo
1383 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1384 return resLo;
1387 // A few remaining SIMD64 ops require helper functions, at least for
1388 // now.
1389 Bool second_is_UInt = False;
1390 HWord fn = 0;
1391 switch (e->Iex.Binop.op) {
1392 case Iop_CatOddLanes16x4:
1393 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1394 case Iop_CatEvenLanes16x4:
1395 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1396 case Iop_PermOrZero8x8:
1397 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1399 case Iop_QNarrowBin32Sto16Sx4:
1400 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1401 case Iop_QNarrowBin16Sto8Sx8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1403 case Iop_QNarrowBin16Sto8Ux8:
1404 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1406 case Iop_NarrowBin16to8x8:
1407 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1408 case Iop_NarrowBin32to16x4:
1409 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1411 case Iop_SarN8x8:
1412 fn = (HWord)h_generic_calc_SarN8x8;
1413 second_is_UInt = True;
1414 break;
1416 default:
1417 fn = (HWord)0; break;
1419 if (fn != (HWord)0) {
1420 /* Note: the following assumes all helpers are of signature
1421 ULong fn ( ULong, ULong ), and they are
1422 not marked as regparm functions.
1424 HReg dst = newVRegI(env);
1425 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1426 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1427 if (second_is_UInt)
1428 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1429 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1430 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1431 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1432 mk_RetLoc_simple(RLPri_Int) ));
1433 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1434 return dst;
1437 // Half-float vector conversion
1438 if (e->Iex.Binop.op == Iop_F32toF16x4
1439 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1440 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1441 HReg dstV = newVRegV(env);
1442 HReg dstI = newVRegI(env);
1443 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1444 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1445 set_SSE_rounding_default(env);
1446 addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1447 return dstI;
1450 break;
1453 /* --------- UNARY OP --------- */
1454 case Iex_Unop: {
1456 /* 1Uto8(64to1(expr64)) */
1458 DEFINE_PATTERN( p_1Uto8_64to1,
1459 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1460 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1461 const IRExpr* expr64 = mi.bindee[0];
1462 HReg dst = newVRegI(env);
1463 HReg src = iselIntExpr_R(env, expr64);
1464 addInstr(env, mk_iMOVsd_RR(src,dst) );
1465 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1466 AMD64RMI_Imm(1), dst));
1467 return dst;
1471 /* 8Uto64(LDle(expr64)) */
1473 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1474 unop(Iop_8Uto64,
1475 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1476 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1477 HReg dst = newVRegI(env);
1478 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1479 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1480 return dst;
1484 /* 16Uto64(LDle(expr64)) */
1486 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1487 unop(Iop_16Uto64,
1488 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1489 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1490 HReg dst = newVRegI(env);
1491 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1492 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1493 return dst;
1497 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1498 Use 32 bit arithmetic and let the default zero-extend rule
1499 do the 32Uto64 for free. */
1500 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1501 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1502 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1503 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1504 AMD64AluOp aluOp = Aalu_INVALID;
1505 switch (opi) {
1506 case Iop_Add32: aluOp = Aalu_ADD; break;
1507 case Iop_Sub32: aluOp = Aalu_SUB; break;
1508 case Iop_And32: aluOp = Aalu_AND; break;
1509 case Iop_Or32: aluOp = Aalu_OR; break;
1510 case Iop_Xor32: aluOp = Aalu_XOR; break;
1511 default: break;
1513 if (aluOp != Aalu_INVALID) {
1514 /* For commutative ops we assume any literal values are on
1515 the second operand. */
1516 HReg dst = newVRegI(env);
1517 HReg reg = iselIntExpr_R(env, argL);
1518 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1519 addInstr(env, mk_iMOVsd_RR(reg,dst));
1520 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1521 return dst;
1523 /* just fall through to normal handling for Iop_32Uto64 */
1526 /* Fallback cases */
1527 switch (e->Iex.Unop.op) {
1528 case Iop_32Uto64:
1529 case Iop_32Sto64: {
1530 HReg dst = newVRegI(env);
1531 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1532 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1533 src, dst) );
1534 return dst;
1536 case Iop_128HIto64: {
1537 HReg rHi, rLo;
1538 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1539 return rHi; /* and abandon rLo */
1541 case Iop_128to64: {
1542 HReg rHi, rLo;
1543 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1544 return rLo; /* and abandon rHi */
1546 case Iop_8Uto16:
1547 case Iop_8Uto32:
1548 case Iop_8Uto64:
1549 case Iop_16Uto64:
1550 case Iop_16Uto32: {
1551 HReg dst = newVRegI(env);
1552 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1553 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1554 || e->Iex.Unop.op==Iop_16Uto64 );
1555 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1556 addInstr(env, mk_iMOVsd_RR(src,dst) );
1557 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1558 AMD64RMI_Imm(mask), dst));
1559 return dst;
1561 case Iop_8Sto16:
1562 case Iop_8Sto64:
1563 case Iop_8Sto32:
1564 case Iop_16Sto32:
1565 case Iop_16Sto64: {
1566 HReg dst = newVRegI(env);
1567 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1568 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1569 || e->Iex.Unop.op==Iop_16Sto64 );
1570 UInt amt = srcIs16 ? 48 : 56;
1571 addInstr(env, mk_iMOVsd_RR(src,dst) );
1572 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1573 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1574 return dst;
1576 case Iop_Not8:
1577 case Iop_Not16:
1578 case Iop_Not32:
1579 case Iop_Not64: {
1580 HReg dst = newVRegI(env);
1581 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1582 addInstr(env, mk_iMOVsd_RR(src,dst) );
1583 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1584 return dst;
1586 case Iop_16HIto8:
1587 case Iop_32HIto16:
1588 case Iop_64HIto32: {
1589 HReg dst = newVRegI(env);
1590 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1591 Int shift = 0;
1592 switch (e->Iex.Unop.op) {
1593 case Iop_16HIto8: shift = 8; break;
1594 case Iop_32HIto16: shift = 16; break;
1595 case Iop_64HIto32: shift = 32; break;
1596 default: vassert(0);
1598 addInstr(env, mk_iMOVsd_RR(src,dst) );
1599 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1600 return dst;
1602 case Iop_1Uto64:
1603 case Iop_1Uto32:
1604 case Iop_1Uto8: {
1605 HReg dst = newVRegI(env);
1606 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1607 addInstr(env, AMD64Instr_Set64(cond,dst));
1608 return dst;
1610 case Iop_1Sto8:
1611 case Iop_1Sto16:
1612 case Iop_1Sto32:
1613 case Iop_1Sto64: {
1614 /* could do better than this, but for now ... */
1615 HReg dst = newVRegI(env);
1616 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1617 addInstr(env, AMD64Instr_Set64(cond,dst));
1618 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1619 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1620 return dst;
1622 case Iop_Ctz64: {
1623 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1624 HReg dst = newVRegI(env);
1625 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1626 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1627 return dst;
1629 case Iop_Clz64: {
1630 /* Count leading zeroes. Do 'bsrq' to establish the index
1631 of the highest set bit, and subtract that value from
1632 63. */
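         /* E.g. for src == 1, 'bsrq' gives index 0, so the result is
            63; for src == 0x8000000000000000 it gives index 63, so
            the result is 0. */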
1633 HReg tmp = newVRegI(env);
1634 HReg dst = newVRegI(env);
1635 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1636 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1637 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1638 AMD64RMI_Imm(63), dst));
1639 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1640 AMD64RMI_Reg(tmp), dst));
1641 return dst;
1644 case Iop_CmpwNEZ64: {
1645 HReg dst = newVRegI(env);
1646 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1647 addInstr(env, mk_iMOVsd_RR(src,dst));
1648 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1649 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1650 AMD64RMI_Reg(src), dst));
1651 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1652 return dst;
1655 case Iop_CmpwNEZ32: {
1656 HReg src = newVRegI(env);
1657 HReg dst = newVRegI(env);
1658 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1659 addInstr(env, mk_iMOVsd_RR(pre,src));
1660 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1661 addInstr(env, mk_iMOVsd_RR(src,dst));
1662 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1663 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1664 AMD64RMI_Reg(src), dst));
1665 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1666 return dst;
1669 case Iop_Left8:
1670 case Iop_Left16:
1671 case Iop_Left32:
1672 case Iop_Left64: {
1673 HReg dst = newVRegI(env);
1674 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1675 addInstr(env, mk_iMOVsd_RR(src, dst));
1676 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1677 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1678 return dst;
1681 case Iop_V128to32: {
1682 HReg dst = newVRegI(env);
1683 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1684 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1685 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1686 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1687 return dst;
1690 /* V128{HI}to64 */
1691 case Iop_V128to64: {
1692 HReg dst = newVRegI(env);
1693 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1694 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1695 return dst;
1697 case Iop_V128HIto64: {
1698 HReg dst = newVRegI(env);
1699 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1700 HReg vec2 = newVRegV(env);
1701 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1702 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1703 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1704 return dst;
1707 /* V256to64_{3,2,1,0} */
1708 case Iop_V256to64_0: case Iop_V256to64_1:
1709 case Iop_V256to64_2: case Iop_V256to64_3: {
1710 HReg vHi, vLo, vec;
1711 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1712 /* Do the first part of the selection by deciding which of
1713 the 128 bit registers to look at, and second part using
1714 the same scheme as for V128{HI}to64 above. */
1715 Bool low64of128 = True;
1716 switch (e->Iex.Unop.op) {
1717 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1718 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1719 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1720 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1721 default: vassert(0);
1723 HReg dst = newVRegI(env);
1724 if (low64of128) {
1725 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1726 } else {
1727 HReg vec2 = newVRegV(env);
1728 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1729 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1730 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1732 return dst;
1735 /* ReinterpF64asI64(e) */
1736 /* Given an IEEE754 double, produce an I64 with the same bit
1737 pattern. */
1738 case Iop_ReinterpF64asI64: {
1739 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1740 HReg dst = newVRegI(env);
1741 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1742 /* paranoia */
1743 set_SSE_rounding_default(env);
1744 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1745 addInstr(env, AMD64Instr_Alu64R(
1746 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1747 return dst;
1750 /* ReinterpF32asI32(e) */
1751 /* Given an IEEE754 single, produce an I64 with the same bit
1752 pattern in the lower half. */
1753 case Iop_ReinterpF32asI32: {
1754 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1755 HReg dst = newVRegI(env);
1756 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1757 /* paranoia */
1758 set_SSE_rounding_default(env);
1759 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1760 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1761 return dst;
1764 case Iop_16to8:
1765 case Iop_32to8:
1766 case Iop_64to8:
1767 case Iop_32to16:
1768 case Iop_64to16:
1769 case Iop_64to32:
1770 /* These are no-ops. */
1771 return iselIntExpr_R(env, e->Iex.Unop.arg);
1773 case Iop_GetMSBs8x8: {
1774 /* Note: the following assumes the helper is of
1775 signature
1776 UInt fn ( ULong ), and is not a regparm fn. */
1778 HReg dst = newVRegI(env);
1779 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1780 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1781 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1782 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1783 1, mk_RetLoc_simple(RLPri_Int) ));
1784 /* MovxLQ is not exactly the right thing here. We just
1785 need to get the bottom 8 bits of RAX into dst, and zero
1786 out everything else. Assuming that the helper returns
1787 a UInt with the top 24 bits zeroed out, it'll do,
1788 though. */
1789 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1790 return dst;
1793 case Iop_GetMSBs8x16: {
1794 /* Note: the following assumes the helper is of signature
1795 UInt fn ( ULong w64hi, ULong w64Lo ),
1796 and is not a regparm fn. */
1797 HReg dst = newVRegI(env);
1798 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1799 HReg rsp = hregAMD64_RSP();
1800 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1801 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1802 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1803 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1804 16, vec, m16_rsp));
1805 /* hi 64 bits into RDI -- the first arg */
1806 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1807 AMD64RMI_Mem(m8_rsp),
1808 hregAMD64_RDI() )); /* 1st arg */
1809 /* lo 64 bits into RSI -- the 2nd arg */
1810 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1811 AMD64RMI_Mem(m16_rsp),
1812 hregAMD64_RSI() )); /* 2nd arg */
1813 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1814 2, mk_RetLoc_simple(RLPri_Int) ));
1815 /* MovxLQ is not exactly the right thing here. We just
1816 need to get the bottom 16 bits of RAX into dst, and zero
1817 out everything else. Assuming that the helper returns
1818 a UInt with the top 16 bits zeroed out, it'll do,
1819 though. */
1820 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1821 return dst;
1824 default:
1825 break;
1828 /* Deal with unary 64-bit SIMD ops. */
1829 HWord fn = 0;
1830 switch (e->Iex.Unop.op) {
1831 case Iop_CmpNEZ32x2:
1832 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1833 case Iop_CmpNEZ16x4:
1834 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1835 case Iop_CmpNEZ8x8:
1836 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1837 default:
1838 fn = (HWord)0; break;
1840 if (fn != (HWord)0) {
1841 /* Note: the following assumes all helpers are of
1842 signature
1843 ULong fn ( ULong ), and they are
1844 not marked as regparm functions. */
1846 HReg dst = newVRegI(env);
1847 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1848 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1849 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1850 mk_RetLoc_simple(RLPri_Int) ));
1851 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1852 return dst;
1855 break;
1858 /* --------- GET --------- */
1859 case Iex_Get: {
1860 if (ty == Ity_I64) {
1861 HReg dst = newVRegI(env);
1862 addInstr(env, AMD64Instr_Alu64R(
1863 Aalu_MOV,
1864 AMD64RMI_Mem(
1865 AMD64AMode_IR(e->Iex.Get.offset,
1866 hregAMD64_RBP())),
1867 dst));
1868 return dst;
1870 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1871 HReg dst = newVRegI(env);
1872 addInstr(env, AMD64Instr_LoadEX(
1873 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1874 False,
1875 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1876 dst));
1877 return dst;
1879 break;
1882 case Iex_GetI: {
1883 AMD64AMode* am
1884 = genGuestArrayOffset(
1885 env, e->Iex.GetI.descr,
1886 e->Iex.GetI.ix, e->Iex.GetI.bias );
1887 HReg dst = newVRegI(env);
1888 if (ty == Ity_I8) {
1889 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1890 return dst;
1892 if (ty == Ity_I64) {
1893 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1894 return dst;
1896 break;
1899 /* --------- CCALL --------- */
1900 case Iex_CCall: {
1901 HReg dst = newVRegI(env);
1902 vassert(ty == e->Iex.CCall.retty);
1904 /* be very restrictive for now. Only 64-bit ints allowed for
1905 args, and 64 or 32 bits for return type. */
1906 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1907 goto irreducible;
1909 /* Marshal args, do the call. */
1910 UInt addToSp = 0;
1911 RetLoc rloc = mk_RetLoc_INVALID();
1912 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1913 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1914 vassert(is_sane_RetLoc(rloc));
1915 vassert(rloc.pri == RLPri_Int);
1916 vassert(addToSp == 0);
1918 /* Move to dst, and zero out the top 32 bits if the result type is
1919 Ity_I32. Probably overkill, but still .. */
1920 if (e->Iex.CCall.retty == Ity_I64)
1921 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1922 else
1923 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1925 return dst;
1928 /* --------- LITERAL --------- */
1929 /* 64/32/16/8-bit literals */
1930 case Iex_Const:
1931 if (ty == Ity_I64) {
1932 HReg r = newVRegI(env);
1933 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1934 return r;
1935 } else {
1936 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1937 HReg r = newVRegI(env);
1938 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1939 return r;
1942 /* --------- MULTIPLEX --------- */
1943 case Iex_ITE: { // VFD
1944 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1945 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1946 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1947 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1948 HReg dst = newVRegI(env);
1949 addInstr(env, mk_iMOVsd_RR(r1,dst));
1950 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1951 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1952 return dst;
1954 break;
1957 /* --------- TERNARY OP --------- */
1958 case Iex_Triop: {
1959 IRTriop *triop = e->Iex.Triop.details;
1960 /* C3210 flags following FPU partial remainder (fprem), both
1961 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1962 if (triop->op == Iop_PRemC3210F64
1963 || triop->op == Iop_PRem1C3210F64) {
1964 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1965 HReg arg1 = iselDblExpr(env, triop->arg2);
1966 HReg arg2 = iselDblExpr(env, triop->arg3);
1967 HReg dst = newVRegI(env);
1968 addInstr(env, AMD64Instr_A87Free(2));
1970 /* one arg -> top of x87 stack */
1971 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1972 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1974 /* other arg -> top of x87 stack */
1975 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1976 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1978 switch (triop->op) {
1979 case Iop_PRemC3210F64:
1980 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1981 break;
1982 case Iop_PRem1C3210F64:
1983 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1984 break;
1985 default:
1986 vassert(0);
1988 /* Ignore the result, and instead make off with the FPU's
1989 C3210 flags (in the status word). */
1990 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1991 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1992 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1993 return dst;
1995 break;
1998 default:
1999 break;
2000 } /* switch (e->tag) */
2002 /* We get here if no pattern matched. */
2003 irreducible:
2004 ppIRExpr(e);
2005 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2009 /*---------------------------------------------------------*/
2010 /*--- ISEL: Integer expression auxiliaries ---*/
2011 /*---------------------------------------------------------*/
2013 /* --------------------- AMODEs --------------------- */
2015 /* Return an AMode which computes the value of the specified
2016 expression, possibly also adding insns to the code list as a
2017 result. The expression may only be a 64-bit one. */
2020 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2022 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2023 vassert(sane_AMode(am));
2024 return am;
2027 /* DO NOT CALL THIS DIRECTLY ! */
2028 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2030 MatchInfo mi;
2031 DECLARE_PATTERN(p_complex);
2032 IRType ty = typeOfIRExpr(env->type_env,e);
2033 vassert(ty == Ity_I64);
2035 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2036 /* bind0 bind1 bind2 bind3 */
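   /* For example, Add64(Add64(t1, Shl64(t2, 3:I8)), 0x20:I64) matches
      this pattern and becomes the amode 0x20(t1,t2,8). */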
2037 DEFINE_PATTERN(p_complex,
2038 binop( Iop_Add64,
2039 binop( Iop_Add64,
2040 bind(0),
2041 binop(Iop_Shl64, bind(1), bind(2))
2043 bind(3)
2046 if (matchIRExpr(&mi, p_complex, e)) {
2047 const IRExpr* expr1 = mi.bindee[0];
2048 const IRExpr* expr2 = mi.bindee[1];
2049 const IRExpr* imm8 = mi.bindee[2];
2050 const IRExpr* simm32 = mi.bindee[3];
2051 if (imm8->tag == Iex_Const
2052 && imm8->Iex.Const.con->tag == Ico_U8
2053 && imm8->Iex.Const.con->Ico.U8 < 4
2054 /* imm8 is OK, now check simm32 */
2055 && simm32->tag == Iex_Const
2056 && simm32->Iex.Const.con->tag == Ico_U64
2057 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2058 UInt shift = imm8->Iex.Const.con->Ico.U8;
2059 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2060 HReg r1 = iselIntExpr_R(env, expr1);
2061 HReg r2 = iselIntExpr_R(env, expr2);
2062 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2063 return AMD64AMode_IRRS(offset, r1, r2, shift);
2067 /* Add64(expr1, Shl64(expr2, imm)) */
2068 if (e->tag == Iex_Binop
2069 && e->Iex.Binop.op == Iop_Add64
2070 && e->Iex.Binop.arg2->tag == Iex_Binop
2071 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2072 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2073 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2074 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2075 if (shift == 1 || shift == 2 || shift == 3) {
2076 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2077 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2078 return AMD64AMode_IRRS(0, r1, r2, shift);
2082 /* Add64(expr,i) */
2083 if (e->tag == Iex_Binop
2084 && e->Iex.Binop.op == Iop_Add64
2085 && e->Iex.Binop.arg2->tag == Iex_Const
2086 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2087 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2088 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2089 return AMD64AMode_IR(
2090 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2091 r1
2095 /* Doesn't match anything in particular. Generate it into
2096 a register and use that. */
2098 HReg r1 = iselIntExpr_R(env, e);
2099 return AMD64AMode_IR(0, r1);
2104 /* --------------------- RMIs --------------------- */
2106 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2107 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2109 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2111 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2112 /* sanity checks ... */
2113 switch (rmi->tag) {
2114 case Armi_Imm:
2115 return rmi;
2116 case Armi_Reg:
2117 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2118 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2119 return rmi;
2120 case Armi_Mem:
2121 vassert(sane_AMode(rmi->Armi.Mem.am));
2122 return rmi;
2123 default:
2124 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2128 /* DO NOT CALL THIS DIRECTLY ! */
2129 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2131 IRType ty = typeOfIRExpr(env->type_env,e);
2132 vassert(ty == Ity_I64 || ty == Ity_I32
2133 || ty == Ity_I16 || ty == Ity_I8);
2135 /* special case: immediate 64/32/16/8 */
2136 if (e->tag == Iex_Const) {
2137 switch (e->Iex.Const.con->tag) {
2138 case Ico_U64:
2139 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2140 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2142 break;
2143 case Ico_U32:
2144 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2145 case Ico_U16:
2146 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2147 case Ico_U8:
2148 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2149 default:
2150 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2154 /* special case: 64-bit GET */
2155 if (e->tag == Iex_Get && ty == Ity_I64) {
2156 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2157 hregAMD64_RBP()));
2160 /* special case: 64-bit load from memory */
2161 if (e->tag == Iex_Load && ty == Ity_I64
2162 && e->Iex.Load.end == Iend_LE) {
2163 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2164 return AMD64RMI_Mem(am);
2167 /* default case: calculate into a register and return that */
2169 HReg r = iselIntExpr_R ( env, e );
2170 return AMD64RMI_Reg(r);
2175 /* --------------------- RIs --------------------- */
2177 /* Calculate an expression into an AMD64RI operand. As with
2178 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2179 bits. */
2181 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2183 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2184 /* sanity checks ... */
2185 switch (ri->tag) {
2186 case Ari_Imm:
2187 return ri;
2188 case Ari_Reg:
2189 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2190 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2191 return ri;
2192 default:
2193 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2197 /* DO NOT CALL THIS DIRECTLY ! */
2198 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2200 IRType ty = typeOfIRExpr(env->type_env,e);
2201 vassert(ty == Ity_I64 || ty == Ity_I32
2202 || ty == Ity_I16 || ty == Ity_I8);
2204 /* special case: immediate */
2205 if (e->tag == Iex_Const) {
2206 switch (e->Iex.Const.con->tag) {
2207 case Ico_U64:
2208 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2209 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2211 break;
2212 case Ico_U32:
2213 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2214 case Ico_U16:
2215 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2216 case Ico_U8:
2217 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2218 default:
2219 vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2223 /* default case: calculate into a register and return that */
2225 HReg r = iselIntExpr_R ( env, e );
2226 return AMD64RI_Reg(r);
2231 /* --------------------- RMs --------------------- */
2233 /* Similarly, calculate an expression into an AMD64RM operand. As
2234 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2235 bits. */
2237 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2239 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2240 /* sanity checks ... */
2241 switch (rm->tag) {
2242 case Arm_Reg:
2243 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2244 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2245 return rm;
2246 case Arm_Mem:
2247 vassert(sane_AMode(rm->Arm.Mem.am));
2248 return rm;
2249 default:
2250 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2254 /* DO NOT CALL THIS DIRECTLY ! */
2255 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2257 IRType ty = typeOfIRExpr(env->type_env,e);
2258 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2260 /* special case: 64-bit GET */
2261 if (e->tag == Iex_Get && ty == Ity_I64) {
2262 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2263 hregAMD64_RBP()));
2266 /* special case: load from memory */
2268 /* default case: calculate into a register and return that */
2270 HReg r = iselIntExpr_R ( env, e );
2271 return AMD64RM_Reg(r);
2276 /* --------------------- CONDCODE --------------------- */
2278 /* Generate code to evaluate a bit-typed expression, returning the
2279 condition code which would correspond when the expression would
2280 notionally have returned 1. */
2282 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2284 /* Uh, there's nothing we can sanity check here, unfortunately. */
2285 return iselCondCode_wrk(env,e);
2288 /* DO NOT CALL THIS DIRECTLY ! */
2289 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2291 vassert(e);
2292 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2294 /* var */
2295 if (e->tag == Iex_RdTmp) {
2296 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2297 HReg dst = newVRegI(env);
2298 addInstr(env, mk_iMOVsd_RR(r64,dst));
2299 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2300 return Acc_NZ;
2303 /* Constant 1:Bit */
2304 if (e->tag == Iex_Const) {
2305 HReg r;
2306 vassert(e->Iex.Const.con->tag == Ico_U1);
2307 vassert(e->Iex.Const.con->Ico.U1 == True
2308 || e->Iex.Const.con->Ico.U1 == False);
2309 r = newVRegI(env);
2310 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2311 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
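      /* The xor of r with itself is here only to set ZF.  With ZF known
         to be 1, Acc_Z is a condition that always holds and Acc_NZ one
         that never does, so return whichever matches the constant. */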
2312 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2315 /* Not1(...) */
2316 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2317 /* Generate code for the arg, and negate the test condition */
2318 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2321 /* --- patterns rooted at: 64to1 --- */
2323 /* 64to1 */
2324 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2325 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2326 addInstr(env, AMD64Instr_Test64(1,reg));
2327 return Acc_NZ;
2330 /* --- patterns rooted at: 32to1 --- */
2332 /* 32to1 */
2333 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2334 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2335 addInstr(env, AMD64Instr_Test64(1,reg));
2336 return Acc_NZ;
2339 /* --- patterns rooted at: CmpNEZ8 --- */
2341 /* CmpNEZ8(x) */
2342 if (e->tag == Iex_Unop
2343 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2344 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2345 addInstr(env, AMD64Instr_Test64(0xFF,r));
2346 return Acc_NZ;
2349 /* --- patterns rooted at: CmpNEZ16 --- */
2351 /* CmpNEZ16(x) */
2352 if (e->tag == Iex_Unop
2353 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2354 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2355 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2356 return Acc_NZ;
2359 /* --- patterns rooted at: CmpNEZ32 --- */
2361 if (e->tag == Iex_Unop
2362 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2363 IRExpr* arg = e->Iex.Unop.arg;
2364 if (arg->tag == Iex_Binop
2365 && (arg->Iex.Binop.op == Iop_Or32
2366 || arg->Iex.Binop.op == Iop_And32)) {
2367 /* CmpNEZ32(Or32(x,y)) */
2368 /* CmpNEZ32(And32(x,y)) */
2369 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2370 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2371 HReg tmp = newVRegI(env);
2372 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2373 addInstr(env, AMD64Instr_Alu32R(
2374 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2375 rmi1, tmp));
2376 return Acc_NZ;
2378 /* CmpNEZ32(x) */
2379 HReg r1 = iselIntExpr_R(env, arg);
2380 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2381 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2382 return Acc_NZ;
2385 /* --- patterns rooted at: CmpNEZ64 --- */
2387 if (e->tag == Iex_Unop
2388 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2389 IRExpr* arg = e->Iex.Unop.arg;
2390 if (arg->tag == Iex_Binop
2391 && (arg->Iex.Binop.op == Iop_Or64
2392 || arg->Iex.Binop.op == Iop_And64)) {
2393 /* CmpNEZ64(Or64(x,y)) */
2394 /* CmpNEZ64(And64(x,y)) */
2395 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2396 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2397 HReg tmp = newVRegI(env);
2398 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2399 addInstr(env, AMD64Instr_Alu64R(
2400 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2401 rmi1, tmp));
2402 return Acc_NZ;
2404 /* CmpNEZ64(x) */
2405 HReg r1 = iselIntExpr_R(env, arg);
2406 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2407 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2408 return Acc_NZ;
2411 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2413 /* CmpEQ8 / CmpNE8 */
2414 if (e->tag == Iex_Binop
2415 && (e->Iex.Binop.op == Iop_CmpEQ8
2416 || e->Iex.Binop.op == Iop_CmpNE8
2417 || e->Iex.Binop.op == Iop_CasCmpEQ8
2418 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2419 if (isZeroU8(e->Iex.Binop.arg2)) {
2420 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2421 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2422 switch (e->Iex.Binop.op) {
2423 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2424 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2425 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2427 } else {
2428 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2429 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2430 HReg r = newVRegI(env);
2431 addInstr(env, mk_iMOVsd_RR(r1,r));
2432 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2433 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
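            /* After the xor, r is zero exactly when the two operands
               agree in their low 8 bits; the AND with 0xFF throws away
               the irrelevant upper bits and sets the flags that the
               Z/NZ test below inspects. */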
2434 switch (e->Iex.Binop.op) {
2435 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2436 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2437 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2442 /* CmpEQ16 / CmpNE16 */
2443 if (e->tag == Iex_Binop
2444 && (e->Iex.Binop.op == Iop_CmpEQ16
2445 || e->Iex.Binop.op == Iop_CmpNE16
2446 || e->Iex.Binop.op == Iop_CasCmpEQ16
2447 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2448 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2449 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2450 HReg r = newVRegI(env);
2451 addInstr(env, mk_iMOVsd_RR(r1,r));
2452 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2453 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2454 switch (e->Iex.Binop.op) {
2455 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2456 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2457 default: vpanic("iselCondCode(amd64): CmpXX16");
2461 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2462 Saves a "movq %rax, %tmp" compared to the default route. */
2463 if (e->tag == Iex_Binop
2464 && e->Iex.Binop.op == Iop_CmpNE64
2465 && e->Iex.Binop.arg1->tag == Iex_CCall
2466 && e->Iex.Binop.arg2->tag == Iex_Const) {
2467 IRExpr* cal = e->Iex.Binop.arg1;
2468 IRExpr* con = e->Iex.Binop.arg2;
2469 HReg tmp = newVRegI(env);
2470 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2471 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2472 vassert(con->Iex.Const.con->tag == Ico_U64);
2473 /* Marshal args, do the call. */
2474 UInt addToSp = 0;
2475 RetLoc rloc = mk_RetLoc_INVALID();
2476 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2477 cal->Iex.CCall.cee,
2478 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2479 vassert(is_sane_RetLoc(rloc));
2480 vassert(rloc.pri == RLPri_Int);
2481 vassert(addToSp == 0);
2482 /* */
2483 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2484 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2485 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2486 return Acc_NZ;
2489 /* Cmp*64*(x,y) */
2490 if (e->tag == Iex_Binop
2491 && (e->Iex.Binop.op == Iop_CmpEQ64
2492 || e->Iex.Binop.op == Iop_CmpNE64
2493 || e->Iex.Binop.op == Iop_CmpLT64S
2494 || e->Iex.Binop.op == Iop_CmpLT64U
2495 || e->Iex.Binop.op == Iop_CmpLE64S
2496 || e->Iex.Binop.op == Iop_CmpLE64U
2497 || e->Iex.Binop.op == Iop_CasCmpEQ64
2498 || e->Iex.Binop.op == Iop_CasCmpNE64
2499 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2500 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2501 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2502 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2503 switch (e->Iex.Binop.op) {
2504 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2505 case Iop_CmpNE64:
2506 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2507 case Iop_CmpLT64S: return Acc_L;
2508 case Iop_CmpLT64U: return Acc_B;
2509 case Iop_CmpLE64S: return Acc_LE;
2510 case Iop_CmpLE64U: return Acc_BE;
2511 default: vpanic("iselCondCode(amd64): CmpXX64");
2515 /* Cmp*32*(x,y) */
2516 if (e->tag == Iex_Binop
2517 && (e->Iex.Binop.op == Iop_CmpEQ32
2518 || e->Iex.Binop.op == Iop_CmpNE32
2519 || e->Iex.Binop.op == Iop_CmpLT32S
2520 || e->Iex.Binop.op == Iop_CmpLT32U
2521 || e->Iex.Binop.op == Iop_CmpLE32S
2522 || e->Iex.Binop.op == Iop_CmpLE32U
2523 || e->Iex.Binop.op == Iop_CasCmpEQ32
2524 || e->Iex.Binop.op == Iop_CasCmpNE32
2525 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2526 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2527 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2528 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2529 switch (e->Iex.Binop.op) {
2530 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2531 case Iop_CmpNE32:
2532 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2533 case Iop_CmpLT32S: return Acc_L;
2534 case Iop_CmpLT32U: return Acc_B;
2535 case Iop_CmpLE32S: return Acc_LE;
2536 case Iop_CmpLE32U: return Acc_BE;
2537 default: vpanic("iselCondCode(amd64): CmpXX32");
2541 ppIRExpr(e);
2542 vpanic("iselCondCode(amd64)");
2546 /*---------------------------------------------------------*/
2547 /*--- ISEL: Integer expressions (128 bit) ---*/
2548 /*---------------------------------------------------------*/
2550 /* Compute a 128-bit value into a register pair, which is returned as
2551 the first two parameters. As with iselIntExpr_R, these may be
2552 either real or virtual regs; in any case they must not be changed
2553 by subsequent code emitted by the caller. */
2555 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2556 ISelEnv* env, const IRExpr* e )
2558 iselInt128Expr_wrk(rHi, rLo, env, e);
2559 # if 0
2560 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2561 # endif
2562 vassert(hregClass(*rHi) == HRcInt64);
2563 vassert(hregIsVirtual(*rHi));
2564 vassert(hregClass(*rLo) == HRcInt64);
2565 vassert(hregIsVirtual(*rLo));
2568 /* DO NOT CALL THIS DIRECTLY ! */
2569 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2570 ISelEnv* env, const IRExpr* e )
2572 vassert(e);
2573 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2575 /* read 128-bit IRTemp */
2576 if (e->tag == Iex_RdTmp) {
2577 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2578 return;
2581 /* --------- BINARY ops --------- */
2582 if (e->tag == Iex_Binop) {
2583 switch (e->Iex.Binop.op) {
2584 /* 64 x 64 -> 128 multiply */
2585 case Iop_MullU64:
2586 case Iop_MullS64: {
2587 /* get one operand into %rax, and the other into a R/M.
2588 Need to make an educated guess about which is better in
2589 which. */
2590 HReg tLo = newVRegI(env);
2591 HReg tHi = newVRegI(env);
2592 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2593 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2594 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2595 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2596 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2597 /* Result is now in RDX:RAX. Tell the caller. */
2598 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2599 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2600 *rHi = tHi;
2601 *rLo = tLo;
2602 return;
2605 /* 128 x 64 -> (64(rem),64(div)) division */
2606 case Iop_DivModU128to64:
2607 case Iop_DivModS128to64: {
2608 /* Get the 128-bit operand into rdx:rax, and the other into
2609 any old R/M. */
2610 HReg sHi, sLo;
2611 HReg tLo = newVRegI(env);
2612 HReg tHi = newVRegI(env);
2613 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2614 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2615 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2616 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2617 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2618 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2619 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2620 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2621 *rHi = tHi;
2622 *rLo = tLo;
2623 return;
2626 /* 64HLto128(e1,e2) */
2627 case Iop_64HLto128:
2628 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2629 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2630 return;
2632 default:
2633 break;
2635 } /* if (e->tag == Iex_Binop) */
2637 ppIRExpr(e);
2638 vpanic("iselInt128Expr");
2642 /*---------------------------------------------------------*/
2643 /*--- ISEL: Floating point expressions (32 bit) ---*/
2644 /*---------------------------------------------------------*/
2646 /* Nothing interesting here; really just wrappers for
2647 64-bit stuff. */
2649 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2651 HReg r = iselFltExpr_wrk( env, e );
2652 # if 0
2653 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2654 # endif
2655 vassert(hregClass(r) == HRcVec128);
2656 vassert(hregIsVirtual(r));
2657 return r;
2660 /* DO NOT CALL THIS DIRECTLY */
2661 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2663 IRType ty = typeOfIRExpr(env->type_env,e);
2664 vassert(ty == Ity_F32);
2666 if (e->tag == Iex_RdTmp) {
2667 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2670 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2671 AMD64AMode* am;
2672 HReg res = newVRegV(env);
2673 vassert(e->Iex.Load.ty == Ity_F32);
2674 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2675 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2676 return res;
2679 if (e->tag == Iex_Binop
2680 && e->Iex.Binop.op == Iop_F64toF32) {
2681 /* Although the result is still held in a standard SSE register,
2682 we need to round it to reflect the loss of accuracy/range
2683 entailed in casting it to a 32-bit float. */
2684 HReg dst = newVRegV(env);
2685 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2686 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2687 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2688 set_SSE_rounding_default( env );
2689 return dst;
2692 if (e->tag == Iex_Get) {
2693 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2694 hregAMD64_RBP() );
2695 HReg res = newVRegV(env);
2696 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2697 return res;
2700 if (e->tag == Iex_Unop
2701 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2702 /* Given an I32, produce an IEEE754 float with the same bit
2703 pattern. */
2704 HReg dst = newVRegV(env);
2705 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2706 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2707 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2708 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2709 return dst;
2712 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2713 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2714 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2715 HReg dst = newVRegV(env);
2717 /* arg now holds the value to be rounded. The first thing to do
2718 is set the FPU's rounding mode accordingly. */
2720 /* Set host x87 rounding mode */
2721 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2723 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2724 addInstr(env, AMD64Instr_A87Free(1));
2725 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2726 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2727 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2728 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2730 /* Restore default x87 rounding. */
2731 set_FPU_rounding_default( env );
2733 return dst;
2736 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2737 /* Sigh ... very rough code. Could do much better. */
2738 /* Get the 128-bit literal 00---0 10---0 into a register
2739 and xor it with the value to be negated. */
2740 HReg r1 = newVRegI(env);
2741 HReg dst = newVRegV(env);
2742 HReg tmp = newVRegV(env);
2743 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2744 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2745 addInstr(env, mk_vMOVsd_RR(src,tmp));
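      /* The two pushes below build a 16-byte literal at (%rsp) with only
         bit 31 set; xor-ing that into the value flips just the sign bit
         of the low F32 lane. */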
2746 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2747 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2748 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2749 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2750 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2751 add_to_rsp(env, 16);
2752 return dst;
2755 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2756 IRQop *qop = e->Iex.Qop.details;
2757 HReg dst = newVRegV(env);
2758 HReg argX = iselFltExpr(env, qop->arg2);
2759 HReg argY = iselFltExpr(env, qop->arg3);
2760 HReg argZ = iselFltExpr(env, qop->arg4);
2761 /* XXXROUNDINGFIXME */
2762 /* set roundingmode here */
2763 /* subq $16, %rsp -- make a space*/
2764 sub_from_rsp(env, 16);
2765 /* Prepare 4 arg regs:
2766 leaq 0(%rsp), %rdi
2767 leaq 4(%rsp), %rsi
2768 leaq 8(%rsp), %rdx
2769 leaq 12(%rsp), %rcx */
2771 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2772 hregAMD64_RDI()));
2773 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2774 hregAMD64_RSI()));
2775 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2776 hregAMD64_RDX()));
2777 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2778 hregAMD64_RCX()));
2779 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2780 movss %argX, 0(%rsi)
2781 movss %argY, 0(%rdx)
2782 movss %argZ, 0(%rcx) */
2784 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2785 AMD64AMode_IR(0, hregAMD64_RSI())));
2786 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2787 AMD64AMode_IR(0, hregAMD64_RDX())));
2788 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2789 AMD64AMode_IR(0, hregAMD64_RCX())));
2790 /* call the helper */
2791 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2792 (ULong)(HWord)h_generic_calc_MAddF32,
2793 4, mk_RetLoc_simple(RLPri_None) ));
2794 /* fetch the result back from 0(%rsp), the slot that %rdi was
2795 pointed at before the call. */
2796 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2797 AMD64AMode_IR(0, hregAMD64_RSP())));
2798 /* and finally, clear the space */
2799 add_to_rsp(env, 16);
2800 return dst;
2803 ppIRExpr(e);
2804 vpanic("iselFltExpr_wrk");
2808 /*---------------------------------------------------------*/
2809 /*--- ISEL: Floating point expressions (64 bit) ---*/
2810 /*---------------------------------------------------------*/
2812 /* Compute a 64-bit floating point value into the lower half of an xmm
2813 register, the identity of which is returned. As with
2814 iselIntExpr_R, the returned reg will be virtual, and it must not be
2815 changed by subsequent code emitted by the caller. */
2818 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2820 Type S (1 bit) E (11 bits) F (52 bits)
2821 ---- --------- ----------- -----------
2822 signalling NaN u 2047 (max) .0uuuuu---u
2823 (with at least
2824 one 1 bit)
2825 quiet NaN u 2047 (max) .1uuuuu---u
2827 negative infinity 1 2047 (max) .000000---0
2829 positive infinity 0 2047 (max) .000000---0
2831 negative zero 1 0 .000000---0
2833 positive zero 0 0 .000000---0 */
2836 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2838 HReg r = iselDblExpr_wrk( env, e );
2839 # if 0
2840 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2841 # endif
2842 vassert(hregClass(r) == HRcVec128);
2843 vassert(hregIsVirtual(r));
2844 return r;
2847 /* DO NOT CALL THIS DIRECTLY */
2848 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2850 IRType ty = typeOfIRExpr(env->type_env,e);
2851 vassert(e);
2852 vassert(ty == Ity_F64);
2854 if (e->tag == Iex_RdTmp) {
2855 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2858 if (e->tag == Iex_Const) {
2859 union { ULong u64; Double f64; } u;
2860 HReg res = newVRegV(env);
2861 HReg tmp = newVRegI(env);
2862 vassert(sizeof(u) == 8);
2863 vassert(sizeof(u.u64) == 8);
2864 vassert(sizeof(u.f64) == 8);
2866 if (e->Iex.Const.con->tag == Ico_F64) {
2867 u.f64 = e->Iex.Const.con->Ico.F64;
2869 else if (e->Iex.Const.con->tag == Ico_F64i) {
2870 u.u64 = e->Iex.Const.con->Ico.F64i;
2872 else
2873 vpanic("iselDblExpr(amd64): const");
2875 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2876 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2877 addInstr(env, AMD64Instr_SseLdSt(
2878 True/*load*/, 8, res,
2879 AMD64AMode_IR(0, hregAMD64_RSP())
2881 add_to_rsp(env, 8);
2882 return res;
2885 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2886 AMD64AMode* am;
2887 HReg res = newVRegV(env);
2888 vassert(e->Iex.Load.ty == Ity_F64);
2889 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2890 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2891 return res;
2894 if (e->tag == Iex_Get) {
2895 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2896 hregAMD64_RBP() );
2897 HReg res = newVRegV(env);
2898 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2899 return res;
2902 if (e->tag == Iex_GetI) {
2903 AMD64AMode* am
2904 = genGuestArrayOffset(
2905 env, e->Iex.GetI.descr,
2906 e->Iex.GetI.ix, e->Iex.GetI.bias );
2907 HReg res = newVRegV(env);
2908 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2909 return res;
2912 if (e->tag == Iex_Triop) {
2913 IRTriop *triop = e->Iex.Triop.details;
2914 AMD64SseOp op = Asse_INVALID;
2915 switch (triop->op) {
2916 case Iop_AddF64: op = Asse_ADDF; break;
2917 case Iop_SubF64: op = Asse_SUBF; break;
2918 case Iop_MulF64: op = Asse_MULF; break;
2919 case Iop_DivF64: op = Asse_DIVF; break;
2920 default: break;
2922 if (op != Asse_INVALID) {
2923 HReg dst = newVRegV(env);
2924 HReg argL = iselDblExpr(env, triop->arg2);
2925 HReg argR = iselDblExpr(env, triop->arg3);
2926 addInstr(env, mk_vMOVsd_RR(argL, dst));
2927 /* XXXROUNDINGFIXME */
2928 /* set roundingmode here */
2929 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2930 return dst;
2934 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2935 IRQop *qop = e->Iex.Qop.details;
2936 HReg dst = newVRegV(env);
2937 HReg argX = iselDblExpr(env, qop->arg2);
2938 HReg argY = iselDblExpr(env, qop->arg3);
2939 HReg argZ = iselDblExpr(env, qop->arg4);
2940 /* XXXROUNDINGFIXME */
2941 /* set roundingmode here */
2942 /* subq $32, %rsp -- make a space*/
2943 sub_from_rsp(env, 32);
2944 /* Prepare 4 arg regs:
2945 leaq 0(%rsp), %rdi
2946 leaq 8(%rsp), %rsi
2947 leaq 16(%rsp), %rdx
2948 leaq 24(%rsp), %rcx */
2950 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2951 hregAMD64_RDI()));
2952 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2953 hregAMD64_RSI()));
2954 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2955 hregAMD64_RDX()));
2956 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2957 hregAMD64_RCX()));
2958 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2959 movsd %argX, 0(%rsi)
2960 movsd %argY, 0(%rdx)
2961 movsd %argZ, 0(%rcx) */
2963 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2964 AMD64AMode_IR(0, hregAMD64_RSI())));
2965 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2966 AMD64AMode_IR(0, hregAMD64_RDX())));
2967 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2968 AMD64AMode_IR(0, hregAMD64_RCX())));
2969 /* call the helper */
2970 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2971 (ULong)(HWord)h_generic_calc_MAddF64,
2972 4, mk_RetLoc_simple(RLPri_None) ));
2973 /* fetch the result back from 0(%rsp), the slot that %rdi was
2974 pointed at before the call. */
2975 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2976 AMD64AMode_IR(0, hregAMD64_RSP())));
2977 /* and finally, clear the space */
2978 add_to_rsp(env, 32);
2979 return dst;
2982 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2983 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2984 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
2985 HReg dst = newVRegV(env);
2987 /* arg now holds the value to be rounded. The first thing to do
2988 is set the FPU's rounding mode accordingly. */
2990 /* Set host x87 rounding mode */
2991 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2993 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2994 addInstr(env, AMD64Instr_A87Free(1));
2995 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2996 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2997 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2998 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3000 /* Restore default x87 rounding. */
3001 set_FPU_rounding_default( env );
3003 return dst;
3006 IRTriop *triop = e->Iex.Triop.details;
3007 if (e->tag == Iex_Triop
3008 && (triop->op == Iop_ScaleF64
3009 || triop->op == Iop_AtanF64
3010 || triop->op == Iop_Yl2xF64
3011 || triop->op == Iop_Yl2xp1F64
3012 || triop->op == Iop_PRemF64
3013 || triop->op == Iop_PRem1F64)
3015 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3016 HReg arg1 = iselDblExpr(env, triop->arg2);
3017 HReg arg2 = iselDblExpr(env, triop->arg3);
3018 HReg dst = newVRegV(env);
3019 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3020 || triop->op == Iop_PRemF64
3021 || triop->op == Iop_PRem1F64);
3022 addInstr(env, AMD64Instr_A87Free(2));
3024 /* one arg -> top of x87 stack */
3025 addInstr(env, AMD64Instr_SseLdSt(
3026 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3027 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3029 /* other arg -> top of x87 stack */
3030 addInstr(env, AMD64Instr_SseLdSt(
3031 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3032 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3034 /* do it */
3035 /* XXXROUNDINGFIXME */
3036 /* set roundingmode here */
3037 switch (triop->op) {
3038 case Iop_ScaleF64:
3039 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3040 break;
3041 case Iop_AtanF64:
3042 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3043 break;
3044 case Iop_Yl2xF64:
3045 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3046 break;
3047 case Iop_Yl2xp1F64:
3048 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3049 break;
3050 case Iop_PRemF64:
3051 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3052 break;
3053 case Iop_PRem1F64:
3054 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3055 break;
3056 default:
3057 vassert(0);
3060 /* save result */
3061 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3062 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3063 return dst;
3066 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3067 HReg dst = newVRegV(env);
3068 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3069 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3070 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3071 set_SSE_rounding_default( env );
3072 return dst;
3075 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3076 HReg dst = newVRegV(env);
3077 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3078 set_SSE_rounding_default( env );
3079 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3080 return dst;
3083 if (e->tag == Iex_Unop
3084 && (e->Iex.Unop.op == Iop_NegF64
3085 || e->Iex.Unop.op == Iop_AbsF64)) {
3086 /* Sigh ... very rough code. Could do much better. */
3087 /* Get the 128-bit literal 00---0 10---0 into a register
3088 and xor / and-not it with the value to be negated. */
3089 HReg r1 = newVRegI(env);
3090 HReg dst = newVRegV(env);
3091 HReg tmp = newVRegV(env);
3092 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3093 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3094 addInstr(env, mk_vMOVsd_RR(src,tmp));
3095 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3096 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3097 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3098 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
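      /* dst now holds a 128-bit literal with only bit 63 set: XOR with
         it flips the sign of the low F64 lane (negation), while ANDN
         clears that bit instead (absolute value). */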
3100 if (e->Iex.Unop.op == Iop_NegF64)
3101 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3102 else
3103 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3105 add_to_rsp(env, 16);
3106 return dst;
3109 if (e->tag == Iex_Binop) {
3110 A87FpOp fpop = Afp_INVALID;
3111 switch (e->Iex.Binop.op) {
3112 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3113 case Iop_SinF64: fpop = Afp_SIN; break;
3114 case Iop_CosF64: fpop = Afp_COS; break;
3115 case Iop_TanF64: fpop = Afp_TAN; break;
3116 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3117 default: break;
3119 if (fpop != Afp_INVALID) {
3120 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3121 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3122 HReg dst = newVRegV(env);
3123 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3124 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3125 addInstr(env, AMD64Instr_A87Free(nNeeded));
3126 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3127 /* XXXROUNDINGFIXME */
3128 /* set roundingmode here */
3129 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3130 codes. I don't think that matters, since this insn
3131 selector never generates such an instruction intervening
3132 between a flag-setting instruction and a flag-using
3133 instruction. */
3134 addInstr(env, AMD64Instr_A87FpOp(fpop));
3135 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3136 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3137 return dst;
3141 if (e->tag == Iex_Unop) {
3142 switch (e->Iex.Unop.op) {
3143 //.. case Iop_I32toF64: {
3144 //.. HReg dst = newVRegF(env);
3145 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3146 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3147 //.. set_FPU_rounding_default(env);
3148 //.. addInstr(env, X86Instr_FpLdStI(
3149 //.. True/*load*/, 4, dst,
3150 //.. X86AMode_IR(0, hregX86_ESP())));
3151 //.. add_to_esp(env, 4);
3152 //.. return dst;
3153 //.. }
3154 case Iop_ReinterpI64asF64: {
3155 /* Given an I64, produce an IEEE754 double with the same
3156 bit pattern. */
3157 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3158 HReg dst = newVRegV(env);
3159 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3160 /* paranoia */
3161 set_SSE_rounding_default(env);
3162 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3163 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3164 return dst;
3166 case Iop_F32toF64: {
3167 HReg f32;
3168 HReg f64 = newVRegV(env);
3169 /* this shouldn't be necessary, but be paranoid ... */
3170 set_SSE_rounding_default(env);
3171 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3172 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3173 return f64;
3175 default:
3176 break;
3180 /* --------- MULTIPLEX --------- */
3181 if (e->tag == Iex_ITE) { // VFD
3182 HReg r1, r0, dst;
3183 vassert(ty == Ity_F64);
3184 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3185 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3186 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3187 dst = newVRegV(env);
3188 addInstr(env, mk_vMOVsd_RR(r1,dst));
3189 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3190 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3191 return dst;
3194 ppIRExpr(e);
3195 vpanic("iselDblExpr_wrk");
3199 /*---------------------------------------------------------*/
3200 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3201 /*---------------------------------------------------------*/
3203 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3205 HReg r = iselVecExpr_wrk( env, e );
3206 # if 0
3207 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3208 # endif
3209 vassert(hregClass(r) == HRcVec128);
3210 vassert(hregIsVirtual(r));
3211 return r;
3215 /* DO NOT CALL THIS DIRECTLY */
3216 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3218 HWord fn = 0; /* address of helper fn, if required */
3219 Bool arg1isEReg = False;
3220 AMD64SseOp op = Asse_INVALID;
3221 vassert(e);
3222 IRType ty = typeOfIRExpr(env->type_env, e);
3223 vassert(ty == Ity_V128);
3224 UInt laneBits = 0;
3226 if (e->tag == Iex_RdTmp) {
3227 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3230 if (e->tag == Iex_Get) {
3231 HReg dst = newVRegV(env);
3232 addInstr(env, AMD64Instr_SseLdSt(
3233 True/*load*/,
3234 16,
3235 dst,
3236 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3239 return dst;
3242 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3243 HReg dst = newVRegV(env);
3244 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3245 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3246 return dst;
3249 if (e->tag == Iex_Const) {
3250 HReg dst = newVRegV(env);
3251 vassert(e->Iex.Const.con->tag == Ico_V128);
3252 switch (e->Iex.Const.con->Ico.V128) {
3253 case 0x0000:
3254 dst = generate_zeroes_V128(env);
3255 break;
3256 case 0xFFFF:
3257 dst = generate_ones_V128(env);
3258 break;
3259 default: {
3260 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3261 /* do push_uimm64 twice, first time for the high-order half. */
3262 push_uimm64(env, bitmask8_to_bytemask64(
3263 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3265 push_uimm64(env, bitmask8_to_bytemask64(
3266 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3268 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3269 add_to_rsp(env, 16);
3270 break;
3273 return dst;
3276 if (e->tag == Iex_Unop) {
3277 switch (e->Iex.Unop.op) {
3279 case Iop_NotV128: {
3280 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3281 return do_sse_NotV128(env, arg);
3284 case Iop_CmpNEZ64x2: {
3285 /* We can use SSE2 instructions for this. */
3286 /* Ideally, we want to do a 64Ix2 comparison against zero of
3287 the operand. Problem is no such insn exists. Solution
3288 therefore is to do a 32Ix4 comparison instead, and bitwise-
3289 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3290 let the not'd result of this initial comparison be a:b:c:d.
3291 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3292 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3293 giving the required result.
3295 The required selection sequence is 2,3,0,1, which
3296 according to Intel's documentation means the pshufd
3297 literal value is 0xB1, that is,
3298 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */
3300 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3301 HReg tmp = generate_zeroes_V128(env);
3302 HReg dst = newVRegV(env);
3303 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3304 tmp = do_sse_NotV128(env, tmp);
3305 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3306 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3307 return dst;
3310 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3311 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3312 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3313 do_CmpNEZ_vector:
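      /* Lane-wise compare-equal against zero makes each zero lane
         all-ones; inverting that result gives the required CmpNEZ
         semantics. */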
3315 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3316 HReg tmp = newVRegV(env);
3317 HReg zero = generate_zeroes_V128(env);
3318 HReg dst;
3319 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3320 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3321 dst = do_sse_NotV128(env, tmp);
3322 return dst;
3325 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3326 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3327 do_32Fx4_unary:
3329 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3330 HReg dst = newVRegV(env);
3331 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3332 return dst;
3335 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3336 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3337 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3338 do_32F0x4_unary:
3340 /* A bit subtle. We have to copy the arg to the result
3341 register first, because actually doing the SSE scalar insn
3342 leaves the upper 3/4 of the destination register
3343 unchanged. Whereas the required semantics of these
3344 primops is that the upper 3/4 is simply copied in from the
3345 argument. */
3346 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3347 HReg dst = newVRegV(env);
3348 addInstr(env, mk_vMOVsd_RR(arg, dst));
3349 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3350 return dst;
3353 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3354 do_64F0x2_unary:
3356 /* A bit subtle. We have to copy the arg to the result
3357 register first, because actually doing the SSE scalar insn
3358 leaves the upper half of the destination register
3359 unchanged. Whereas the required semantics of these
3360 primops is that the upper half is simply copied in from the
3361 argument. */
3362 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3363 HReg dst = newVRegV(env);
3364 addInstr(env, mk_vMOVsd_RR(arg, dst));
3365 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3366 return dst;
3369 case Iop_32UtoV128: {
3370 // FIXME maybe just use MOVQ here?
3371 HReg dst = newVRegV(env);
3372 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3373 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3374 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3375 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3376 return dst;
3379 case Iop_64UtoV128: {
3380 // FIXME maybe just use MOVQ here?
3381 HReg dst = newVRegV(env);
3382 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3383 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3384 addInstr(env, AMD64Instr_Push(rmi));
3385 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3386 add_to_rsp(env, 8);
3387 return dst;
3390 case Iop_V256toV128_0:
3391 case Iop_V256toV128_1: {
3392 HReg vHi, vLo;
3393 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3394 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3397 case Iop_F16toF32x4: {
3398 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3399 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3400 HReg dst = newVRegV(env);
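            /* The four F16 lanes arrive as one 64-bit scalar: MOVQ parks
               them in the low half of an xmm, and the F16C conversion
               (Asse_F16toF32, i.e. vcvtph2ps) widens them to four F32
               lanes. */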
3401 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3402 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3403 return dst;
3405 break;
3408 default:
3409 break;
3410 } /* switch (e->Iex.Unop.op) */
3411 } /* if (e->tag == Iex_Unop) */
3413 if (e->tag == Iex_Binop) {
3414 switch (e->Iex.Binop.op) {
3416 case Iop_Sqrt64Fx2:
3417 case Iop_Sqrt32Fx4: {
3418 /* :: (rmode, vec) -> vec */
3419 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3420 HReg dst = newVRegV(env);
3421 /* XXXROUNDINGFIXME */
3422 /* set roundingmode here */
3423 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3424 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3425 (Asse_SQRTF, arg, dst));
3426 return dst;
3429 /* FIXME: could we generate MOVQ here? */
3430 case Iop_SetV128lo64: {
3431 HReg dst = newVRegV(env);
3432 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3433 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3434 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3435 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3436 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3437 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3438 return dst;
3441 /* FIXME: could we generate MOVD here? */
3442 case Iop_SetV128lo32: {
3443 HReg dst = newVRegV(env);
3444 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3445 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3446 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3447 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3448 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3449 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3450 return dst;
3453 case Iop_64HLtoV128: {
3454 const IRExpr* arg1 = e->Iex.Binop.arg1;
3455 const IRExpr* arg2 = e->Iex.Binop.arg2;
3456 HReg dst = newVRegV(env);
3457 HReg tmp = newVRegV(env);
3458 HReg qHi = iselIntExpr_R(env, arg1);
3459 // If the args are trivially the same (tmp or const), use the same
3460 // source register for both, and only one movq since those are
3461 // (relatively) expensive.
3462 if (areAtomsAndEqual(arg1, arg2)) {
3463 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3464 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3465 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3466 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3467 } else {
3468 HReg qLo = iselIntExpr_R(env, arg2);
3469 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3470 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3471 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3472 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
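/* In this general case dst is assembled entirely in registers,
   roughly:
      movq qHi -> dst; shift dst left 64 bits; movq qLo -> tmp;
      OR tmp into dst
   so dst ends up as arg1 (bits 127:64) : arg2 (bits 63:0). */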
3474 return dst;
3477 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3478 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3479 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3480 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3481 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3482 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3483 do_32Fx4:
3485 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3486 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3487 HReg dst = newVRegV(env);
3488 addInstr(env, mk_vMOVsd_RR(argL, dst));
3489 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3490 return dst;
3493 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3494 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3495 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3496 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3497 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3498 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3499 do_64Fx2:
3501 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3502 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3503 HReg dst = newVRegV(env);
3504 addInstr(env, mk_vMOVsd_RR(argL, dst));
3505 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3506 return dst;
3509 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3510 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3511 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3512 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3513 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3514 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3515 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3516 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3517 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3518 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3519 do_32F0x4: {
3520 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3521 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3522 HReg dst = newVRegV(env);
3523 addInstr(env, mk_vMOVsd_RR(argL, dst));
3524 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3525 return dst;
3528 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3529 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3530 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3531 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3532 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3533 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3534 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3535 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3536 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3537 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3538 do_64F0x2: {
3539 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3540 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3541 HReg dst = newVRegV(env);
3542 addInstr(env, mk_vMOVsd_RR(argL, dst));
3543 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3544 return dst;
3547 case Iop_PermOrZero8x16:
3548 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3549 op = Asse_PSHUFB;
3550 goto do_SseReRg;
3552 // Otherwise we'll have to generate a call to
3553 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3554 // host which doesn't have SSSE3, in which case we don't expect this
3555 // IROp to enter the compilation pipeline in the first place.
3556 break;
3558 case Iop_PwExtUSMulQAdd8x16:
3559 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3560 op = Asse_PMADDUBSW;
3561 goto do_SseReRg;
3563 break;
3565 case Iop_QNarrowBin32Sto16Sx8:
3566 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3567 case Iop_QNarrowBin16Sto8Sx16:
3568 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3569 case Iop_QNarrowBin16Sto8Ux16:
3570 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3572 case Iop_InterleaveHI8x16:
3573 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3574 case Iop_InterleaveHI16x8:
3575 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3576 case Iop_InterleaveHI32x4:
3577 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3578 case Iop_InterleaveHI64x2:
3579 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3581 case Iop_InterleaveLO8x16:
3582 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3583 case Iop_InterleaveLO16x8:
3584 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3585 case Iop_InterleaveLO32x4:
3586 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3587 case Iop_InterleaveLO64x2:
3588 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3590 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3591 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3592 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3593 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3594 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3595 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3596 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3597 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3598 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3599 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3600 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3601 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3602 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3603 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3604 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3605 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3606 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3607 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3608 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3609 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3610 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3611 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3612 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3613 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3614 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3615 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3616 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3617 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3618 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3619 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3620 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3621 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3622 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3623 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3624 do_SseReRg: {
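/* All of these map to a destructive two-operand SSE2 op,
   dst := dst `op` E-reg, so one argument is copied into dst first
   and the other supplied as the E operand.  For the cases tagged
   arg1isEReg the IR operand order is (seemingly) the reverse of
   the instruction's G/E order, so arg2 is copied into dst and
   arg1 presented as E; otherwise it's the other way round. */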
3625 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3626 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3627 HReg dst = newVRegV(env);
3628 if (arg1isEReg) {
3629 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3630 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3631 } else {
3632 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3633 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3635 return dst;
3638 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3639 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3640 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3641 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3642 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3643 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3644 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3645 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3646 do_SseShift: {
3647 HReg dst = newVRegV(env);
3648 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3649 /* If it's a shift by an in-range immediate, generate a single
3650 instruction. */
3651 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3652 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3653 vassert(c->tag == Ico_U8);
3654 UInt shift = c->Ico.U8;
3655 if (shift < laneBits) {
3656 addInstr(env, mk_vMOVsd_RR(greg, dst));
3657 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3658 return dst;
3661 /* Otherwise we have to do it the longwinded way. */
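/* That is: build a 16-byte value on the stack whose low 64 bits are
   the (possibly non-constant) shift amount and whose high 64 bits
   are zero, load it into an XMM register, and use the register form
   of the shift, which takes its count from the low quadword of the
   E operand.  The two pushes below give exactly that layout: count
   at (%rsp), zero at 8(%rsp). */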
3662 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3663 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3664 HReg ereg = newVRegV(env);
3665 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3666 addInstr(env, AMD64Instr_Push(rmi));
3667 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3668 addInstr(env, mk_vMOVsd_RR(greg, dst));
3669 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3670 add_to_rsp(env, 16);
3671 return dst;
3674 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3675 goto do_SseAssistedBinary;
3676 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3677 goto do_SseAssistedBinary;
3678 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3679 goto do_SseAssistedBinary;
3680 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3681 goto do_SseAssistedBinary;
3682 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3683 goto do_SseAssistedBinary;
3684 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3685 goto do_SseAssistedBinary;
3686 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3687 goto do_SseAssistedBinary;
3688 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3689 goto do_SseAssistedBinary;
3690 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3691 goto do_SseAssistedBinary;
3692 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3693 goto do_SseAssistedBinary;
3694 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3695 goto do_SseAssistedBinary;
3696 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3697 goto do_SseAssistedBinary;
3698 case Iop_QNarrowBin32Sto16Ux8:
3699 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3700 goto do_SseAssistedBinary;
3701 case Iop_NarrowBin16to8x16:
3702 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3703 goto do_SseAssistedBinary;
3704 case Iop_NarrowBin32to16x8:
3705 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3706 goto do_SseAssistedBinary;
3707 do_SseAssistedBinary: {
3708 /* RRRufff! RRRufff code is what we're generating here. Oh
3709 well. */
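/* Scheme: carve a 16-aligned scratch block out of the stack and
   call the generic helper with three pointers in the SysV argument
   registers %rdi/%rsi/%rdx, evidently following the convention
   fn(result, argL, argR): result at argp+0, argL at argp+16, argR
   at argp+32.  The operands are spilled to memory, the helper
   writes the result slot, and the result is reloaded into an XMM
   register afterwards. */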
3710 vassert(fn != 0);
3711 HReg dst = newVRegV(env);
3712 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3713 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3714 HReg argp = newVRegI(env);
3715 /* subq $112, %rsp -- make a space */
3716 sub_from_rsp(env, 112);
3717 /* leaq 48(%rsp), %r_argp -- point into it */
3718 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3719 argp));
3720 /* andq $-16, %r_argp -- 16-align the pointer */
3721 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3722 AMD64RMI_Imm( ~(UInt)15 ),
3723 argp));
3724 /* Prepare 3 arg regs:
3725 leaq 0(%r_argp), %rdi
3726 leaq 16(%r_argp), %rsi
3727 leaq 32(%r_argp), %rdx
3728 */
3729 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3730 hregAMD64_RDI()));
3731 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3732 hregAMD64_RSI()));
3733 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3734 hregAMD64_RDX()));
3735 /* Store the two args, at (%rsi) and (%rdx):
3736 movupd %argL, 0(%rsi)
3737 movupd %argR, 0(%rdx)
3738 */
3739 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3740 AMD64AMode_IR(0, hregAMD64_RSI())));
3741 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3742 AMD64AMode_IR(0, hregAMD64_RDX())));
3743 /* call the helper */
3744 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3745 3, mk_RetLoc_simple(RLPri_None) ));
3746 /* fetch the result from memory, using %r_argp, which the
3747 register allocator will keep alive across the call. */
3748 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3749 AMD64AMode_IR(0, argp)));
3750 /* and finally, clear the space */
3751 add_to_rsp(env, 112);
3752 return dst;
3755 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3756 goto do_SseAssistedVectorAndScalar;
3757 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3758 goto do_SseAssistedVectorAndScalar;
3759 do_SseAssistedVectorAndScalar: {
3760 /* RRRufff! RRRufff code is what we're generating here. Oh
3761 well. */
3762 vassert(fn != 0);
3763 HReg dst = newVRegV(env);
3764 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3765 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3766 HReg argp = newVRegI(env);
3767 /* subq $112, %rsp -- make a space */
3768 sub_from_rsp(env, 112);
3769 /* leaq 48(%rsp), %r_argp -- point into it */
3770 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3771 argp));
3772 /* andq $-16, %r_argp -- 16-align the pointer */
3773 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3774 AMD64RMI_Imm( ~(UInt)15 ),
3775 argp));
3776 /* Prepare 2 vector arg regs:
3777 leaq 0(%r_argp), %rdi
3778 leaq 16(%r_argp), %rsi
3779 */
3780 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3781 hregAMD64_RDI()));
3782 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3783 hregAMD64_RSI()));
3784 /* Store the vector arg, at (%rsi):
3785 movupd %argL, 0(%rsi)
3786 */
3787 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3788 AMD64AMode_IR(0, hregAMD64_RSI())));
3789 /* And get the scalar value into rdx */
3790 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3792 /* call the helper */
3793 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3794 3, mk_RetLoc_simple(RLPri_None) ));
3795 /* fetch the result from memory, using %r_argp, which the
3796 register allocator will keep alive across the call. */
3797 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3798 AMD64AMode_IR(0, argp)));
3799 /* and finally, clear the space */
3800 add_to_rsp(env, 112);
3801 return dst;
3804 case Iop_I32StoF32x4:
3805 case Iop_F32toI32Sx4: {
3806 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3807 HReg dst = newVRegV(env);
3808 AMD64SseOp mop
3809 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3810 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3811 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3812 set_SSE_rounding_default(env);
3813 return dst;
3816 // Half-float vector conversion
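// The F32toF16x8 case below converts each 128-bit half of the V256
// source with the F16C op (vcvtps2ph-style), leaving 4 half-precision
// results in bits 63:0 of each destination register.  The shift/OR
// sequence afterwards packs those two 64-bit chunks into one V128:
// the hi chunk is shifted up to bits 127:64 and OR-ed over the lo
// chunk (whose upper half is explicitly cleared first).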
3817 case Iop_F32toF16x8: {
3818 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3819 HReg srcHi, srcLo;
3820 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3821 HReg dstHi = newVRegV(env);
3822 HReg dstLo = newVRegV(env);
3823 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3824 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3825 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3826 set_SSE_rounding_default(env);
3827 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3828 // need to compact all that into one register. There's probably a
3829 // more elegant way to do this, but ..
3830 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3831 // dstHi is now 127:64 = useful data, 63:0 = zero
3832 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3833 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3834 // dstLo is now 127:64 = zero, 63:0 = useful data
3835 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3836 return dstLo;
3838 break;
3841 default:
3842 break;
3843 } /* switch (e->Iex.Binop.op) */
3844 } /* if (e->tag == Iex_Binop) */
3846 if (e->tag == Iex_Triop) {
3847 IRTriop *triop = e->Iex.Triop.details;
3848 switch (triop->op) {
3850 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3851 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3852 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3853 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3854 do_64Fx2_w_rm:
3856 HReg argL = iselVecExpr(env, triop->arg2);
3857 HReg argR = iselVecExpr(env, triop->arg3);
3858 HReg dst = newVRegV(env);
3859 addInstr(env, mk_vMOVsd_RR(argL, dst));
3860 /* XXXROUNDINGFIXME */
3861 /* set roundingmode here */
3862 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3863 return dst;
3866 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3867 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3868 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3869 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3870 do_32Fx4_w_rm:
3872 HReg argL = iselVecExpr(env, triop->arg2);
3873 HReg argR = iselVecExpr(env, triop->arg3);
3874 HReg dst = newVRegV(env);
3875 addInstr(env, mk_vMOVsd_RR(argL, dst));
3876 /* XXXROUNDINGFIXME */
3877 /* set roundingmode here */
3878 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3879 return dst;
3882 default:
3883 break;
3884 } /* switch (triop->op) */
3885 } /* if (e->tag == Iex_Triop) */
3887 if (e->tag == Iex_ITE) { // VFD
3888 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3889 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3890 HReg dst = newVRegV(env);
3891 addInstr(env, mk_vMOVsd_RR(r1,dst));
3892 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3893 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
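/* The condition codes follow the usual x86 encoding, in which
   flipping bit 0 negates a condition; so SseCMov with (cc ^ 1)
   overwrites the preloaded iftrue value with r0 exactly when the
   ITE condition is false. */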
3894 return dst;
3897 //vec_fail:
3898 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3899 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3900 ppIRExpr(e);
3901 vpanic("iselVecExpr_wrk");
3905 /*---------------------------------------------------------*/
3906 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3907 /*---------------------------------------------------------*/
3909 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3910 ISelEnv* env, const IRExpr* e )
3912 iselDVecExpr_wrk( rHi, rLo, env, e );
3913 # if 0
3914 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3915 # endif
3916 vassert(hregClass(*rHi) == HRcVec128);
3917 vassert(hregClass(*rLo) == HRcVec128);
3918 vassert(hregIsVirtual(*rHi));
3919 vassert(hregIsVirtual(*rLo));
3923 /* DO NOT CALL THIS DIRECTLY */
3924 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3925 ISelEnv* env, const IRExpr* e )
3927 HWord fn = 0; /* address of helper fn, if required */
3928 vassert(e);
3929 IRType ty = typeOfIRExpr(env->type_env, e);
3930 vassert(ty == Ity_V256);
3931 UInt laneBits = 0;
3933 AMD64SseOp op = Asse_INVALID;
3935 /* read 256-bit IRTemp */
3936 if (e->tag == Iex_RdTmp) {
3937 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3938 return;
3941 if (e->tag == Iex_Get) {
3942 HReg vHi = newVRegV(env);
3943 HReg vLo = newVRegV(env);
3944 HReg rbp = hregAMD64_RBP();
3945 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3946 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3947 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3948 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3949 *rHi = vHi;
3950 *rLo = vLo;
3951 return;
3954 if (e->tag == Iex_Load) {
3955 HReg vHi = newVRegV(env);
3956 HReg vLo = newVRegV(env);
3957 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3958 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3959 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3960 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3961 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3962 *rHi = vHi;
3963 *rLo = vLo;
3964 return;
3967 if (e->tag == Iex_Const) {
3968 vassert(e->Iex.Const.con->tag == Ico_V256);
3969 switch (e->Iex.Const.con->Ico.V256) {
3970 case 0x00000000: {
3971 HReg vHi = generate_zeroes_V128(env);
3972 HReg vLo = newVRegV(env);
3973 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3974 *rHi = vHi;
3975 *rLo = vLo;
3976 return;
3978 default:
3979 break; /* give up, until such time as handling other constants becomes necessary. */
3983 if (e->tag == Iex_Unop) {
3984 switch (e->Iex.Unop.op) {
3986 case Iop_NotV256: {
3987 HReg argHi, argLo;
3988 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3989 *rHi = do_sse_NotV128(env, argHi);
3990 *rLo = do_sse_NotV128(env, argLo);
3991 return;
3994 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
3995 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
3996 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3997 do_32Fx8_unary:
3999 HReg argHi, argLo;
4000 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4001 HReg dstHi = newVRegV(env);
4002 HReg dstLo = newVRegV(env);
4003 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4004 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4005 *rHi = dstHi;
4006 *rLo = dstLo;
4007 return;
4010 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4011 do_64Fx4_unary:
4013 HReg argHi, argLo;
4014 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4015 HReg dstHi = newVRegV(env);
4016 HReg dstLo = newVRegV(env);
4017 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4018 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4019 *rHi = dstHi;
4020 *rLo = dstLo;
4021 return;
4024 case Iop_CmpNEZ64x4: {
4025 /* We can use SSE2 instructions for this. */
4026 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4027 (obviously). See comment on Iop_CmpNEZ64x2 for
4028 explanation of what's going on here. */
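/* In short: compare every 32-bit lane against zero, invert, and
   then OR each lane with its partner (the 0xB1 shuffle swaps the
   two 32-bit halves of every 64-bit lane), so each 64-bit lane of
   the result is all ones iff either of its halves was nonzero. */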
4029 HReg argHi, argLo;
4030 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4031 HReg tmpHi = generate_zeroes_V128(env);
4032 HReg tmpLo = newVRegV(env);
4033 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4034 HReg dstHi = newVRegV(env);
4035 HReg dstLo = newVRegV(env);
4036 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4037 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4038 tmpHi = do_sse_NotV128(env, tmpHi);
4039 tmpLo = do_sse_NotV128(env, tmpLo);
4040 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4041 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4042 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4043 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4044 *rHi = dstHi;
4045 *rLo = dstLo;
4046 return;
4049 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4050 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4051 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4052 do_CmpNEZ_vector:
4054 HReg argHi, argLo;
4055 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4056 HReg tmpHi = newVRegV(env);
4057 HReg tmpLo = newVRegV(env);
4058 HReg zero = generate_zeroes_V128(env);
4059 HReg dstHi, dstLo;
4060 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4061 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4062 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4063 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4064 dstHi = do_sse_NotV128(env, tmpHi);
4065 dstLo = do_sse_NotV128(env, tmpLo);
4066 *rHi = dstHi;
4067 *rLo = dstLo;
4068 return;
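/* F16toF32x8: with F16C, the 8 half-precision values in the V128
   source are converted in two goes -- the low four (bits 63:0)
   straight into dstLo, then the source copy is shifted right by 64
   bits so the high four can be converted into dstHi the same way. */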
4071 case Iop_F16toF32x8: {
4072 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4073 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4074 HReg srcCopy = newVRegV(env);
4075 HReg dstHi = newVRegV(env);
4076 HReg dstLo = newVRegV(env);
4077 // Copy src, since we'll need to modify it.
4078 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4079 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4080 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4081 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4082 *rHi = dstHi;
4083 *rLo = dstLo;
4084 return;
4086 break;
4089 default:
4090 break;
4091 } /* switch (e->Iex.Unop.op) */
4092 } /* if (e->tag == Iex_Unop) */
4094 if (e->tag == Iex_Binop) {
4095 switch (e->Iex.Binop.op) {
4097 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4098 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4099 do_64Fx4:
4101 HReg argLhi, argLlo, argRhi, argRlo;
4102 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4103 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4104 HReg dstHi = newVRegV(env);
4105 HReg dstLo = newVRegV(env);
4106 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4107 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4108 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4109 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4110 *rHi = dstHi;
4111 *rLo = dstLo;
4112 return;
4115 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4116 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4117 do_32Fx8:
4119 HReg argLhi, argLlo, argRhi, argRlo;
4120 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4121 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4122 HReg dstHi = newVRegV(env);
4123 HReg dstLo = newVRegV(env);
4124 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4125 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4126 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4127 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4128 *rHi = dstHi;
4129 *rLo = dstLo;
4130 return;
4133 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4134 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4135 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4136 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4137 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4138 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4139 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4140 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4141 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4142 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4143 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4144 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4145 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4146 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4147 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4148 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4149 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4150 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4151 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4152 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4153 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4154 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4155 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4156 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4157 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4158 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4159 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4160 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4161 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4162 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4163 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4164 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4165 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4166 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4167 do_SseReRg:
4169 HReg argLhi, argLlo, argRhi, argRlo;
4170 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4171 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4172 HReg dstHi = newVRegV(env);
4173 HReg dstLo = newVRegV(env);
4174 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4175 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4176 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4177 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4178 *rHi = dstHi;
4179 *rLo = dstLo;
4180 return;
4183 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4184 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4185 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4186 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4187 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4188 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4189 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4190 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4191 do_SseShift: {
4192 HReg dstHi = newVRegV(env);
4193 HReg dstLo = newVRegV(env);
4194 HReg gregHi, gregLo;
4195 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4196 /* If it's a shift by an in-range immediate, generate a single
4197    instruction for each half. */
4198 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4199 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4200 vassert(c->tag == Ico_U8);
4201 UInt shift = c->Ico.U8;
4202 if (shift < laneBits) {
4203 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4204 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4205 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4206 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4207 *rHi = dstHi;
4208 *rLo = dstLo;
4209 return;
4212 /* Otherwise we have to do it the longwinded way. */
4213 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4214 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4215 HReg ereg = newVRegV(env);
4216 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4217 addInstr(env, AMD64Instr_Push(rmi));
4218 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4219 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4220 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4221 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4222 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4223 add_to_rsp(env, 16);
4224 *rHi = dstHi;
4225 *rLo = dstLo;
4226 return;
4229 case Iop_V128HLtoV256: {
4230 // Curiously, there doesn't seem to be any benefit to be had here by
4231 // checking whether arg1 and arg2 are the same, in the style of how
4232 // (eg) 64HLtoV128 is handled elsewhere in this file.
4233 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4234 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4235 return;
4238 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4239 goto do_SseAssistedBinary;
4240 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4241 goto do_SseAssistedBinary;
4242 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4243 goto do_SseAssistedBinary;
4244 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4245 goto do_SseAssistedBinary;
4246 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4247 goto do_SseAssistedBinary;
4248 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4249 goto do_SseAssistedBinary;
4250 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4251 goto do_SseAssistedBinary;
4252 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4253 goto do_SseAssistedBinary;
4254 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4255 goto do_SseAssistedBinary;
4256 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4257 goto do_SseAssistedBinary;
4258 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4259 goto do_SseAssistedBinary;
4260 do_SseAssistedBinary: {
4261 /* RRRufff! RRRufff code is what we're generating here. Oh
4262 well. */
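/* Same scheme as the 128-bit do_SseAssistedBinary above, except
   that the 256-bit op is lowered as two independent calls to the
   same 128-bit helper: the first call uses the block at argp+0,
   +16, +32 for the high halves, the second the block at argp+48,
   +64, +80 for the low halves. */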
4263 vassert(fn != 0);
4264 HReg dstHi = newVRegV(env);
4265 HReg dstLo = newVRegV(env);
4266 HReg argLhi, argLlo, argRhi, argRlo;
4267 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4268 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4269 HReg argp = newVRegI(env);
4270 /* subq $160, %rsp -- make a space */
4271 sub_from_rsp(env, 160);
4272 /* leaq 48(%rsp), %r_argp -- point into it */
4273 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4274 argp));
4275 /* andq $-16, %r_argp -- 16-align the pointer */
4276 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4277 AMD64RMI_Imm( ~(UInt)15 ),
4278 argp));
4279 /* Prepare 3 arg regs:
4280 leaq 0(%r_argp), %rdi
4281 leaq 16(%r_argp), %rsi
4282 leaq 32(%r_argp), %rdx
4283 */
4284 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4285 hregAMD64_RDI()));
4286 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4287 hregAMD64_RSI()));
4288 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4289 hregAMD64_RDX()));
4290 /* Store the two high args, at (%rsi) and (%rdx):
4291 movupd %argLhi, 0(%rsi)
4292 movupd %argRhi, 0(%rdx)
4293 */
4294 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4295 AMD64AMode_IR(0, hregAMD64_RSI())));
4296 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4297 AMD64AMode_IR(0, hregAMD64_RDX())));
4298 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4299 movupd %argLlo, 48(%rsi)
4300 movupd %argRlo, 48(%rdx)
4301 */
4302 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4303 AMD64AMode_IR(48, hregAMD64_RSI())));
4304 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4305 AMD64AMode_IR(48, hregAMD64_RDX())));
4306 /* call the helper */
4307 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4308 mk_RetLoc_simple(RLPri_None) ));
4309 /* Prepare 3 arg regs:
4310 leaq 48(%r_argp), %rdi
4311 leaq 64(%r_argp), %rsi
4312 leaq 80(%r_argp), %rdx
4313 */
4314 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4315 hregAMD64_RDI()));
4316 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4317 hregAMD64_RSI()));
4318 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4319 hregAMD64_RDX()));
4320 /* call the helper */
4321 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4322 mk_RetLoc_simple(RLPri_None) ));
4323 /* fetch the result from memory, using %r_argp, which the
4324 register allocator will keep alive across the call. */
4325 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4326 AMD64AMode_IR(0, argp)));
4327 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4328 AMD64AMode_IR(48, argp)));
4329 /* and finally, clear the space */
4330 add_to_rsp(env, 160);
4331 *rHi = dstHi;
4332 *rLo = dstLo;
4333 return;
4336 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4337 goto do_SseAssistedBinary256;
4338 do_SseAssistedBinary256: {
4339 /* RRRufff! RRRufff code is what we're generating here. Oh
4340 well. */
4341 vassert(fn != 0);
4342 HReg dstHi = newVRegV(env);
4343 HReg dstLo = newVRegV(env);
4344 HReg argLhi, argLlo, argRhi, argRlo;
4345 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4346 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4347 HReg argp = newVRegI(env);
4348 /* subq $160, %rsp -- make a space */
4349 sub_from_rsp(env, 160);
4350 /* leaq 48(%rsp), %r_argp -- point into it */
4351 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4352 argp));
4353 /* andq $-16, %r_argp -- 16-align the pointer */
4354 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4355 AMD64RMI_Imm( ~(UInt)15 ),
4356 argp));
4357 /* Prepare 3 arg regs:
4358 leaq 0(%r_argp), %rdi
4359 leaq 32(%r_argp), %rsi
4360 leaq 64(%r_argp), %rdx
4361 */
4362 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4363 hregAMD64_RDI()));
4364 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4365 hregAMD64_RSI()));
4366 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4367 hregAMD64_RDX()));
4368 /* Store the two args, at (%rsi) and (%rdx):
4369 movupd %argLlo, 0(%rsi)
4370 movupd %argLhi, 16(%rsi)
4371 movupd %argRlo, 0(%rdx)
4372 movupd %argRhi, 16(%rdx)
4373 */
4374 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4375 AMD64AMode_IR(0, hregAMD64_RSI())));
4376 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4377 AMD64AMode_IR(16, hregAMD64_RSI())));
4378 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4379 AMD64AMode_IR(0, hregAMD64_RDX())));
4380 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4381 AMD64AMode_IR(16, hregAMD64_RDX())));
4382 /* call the helper */
4383 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4384 mk_RetLoc_simple(RLPri_None) ));
4385 /* fetch the result from memory, using %r_argp, which the
4386 register allocator will keep alive across the call. */
4387 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4388 AMD64AMode_IR(0, argp)));
4389 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4390 AMD64AMode_IR(16, argp)));
4391 /* and finally, clear the space */
4392 add_to_rsp(env, 160);
4393 *rHi = dstHi;
4394 *rLo = dstLo;
4395 return;
4398 case Iop_I32StoF32x8:
4399 case Iop_F32toI32Sx8: {
4400 HReg argHi, argLo;
4401 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4402 HReg dstHi = newVRegV(env);
4403 HReg dstLo = newVRegV(env);
4404 AMD64SseOp mop
4405 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4406 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4407 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4408 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4409 set_SSE_rounding_default(env);
4410 *rHi = dstHi;
4411 *rLo = dstLo;
4412 return;
4415 default:
4416 break;
4417 } /* switch (e->Iex.Binop.op) */
4418 } /* if (e->tag == Iex_Binop) */
4420 if (e->tag == Iex_Triop) {
4421 IRTriop *triop = e->Iex.Triop.details;
4422 switch (triop->op) {
4424 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4425 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4426 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4427 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4428 do_64Fx4_w_rm:
4430 HReg argLhi, argLlo, argRhi, argRlo;
4431 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4432 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4433 HReg dstHi = newVRegV(env);
4434 HReg dstLo = newVRegV(env);
4435 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4436 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4437 /* XXXROUNDINGFIXME */
4438 /* set roundingmode here */
4439 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4440 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4441 *rHi = dstHi;
4442 *rLo = dstLo;
4443 return;
4446 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4447 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4448 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4449 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4450 do_32Fx8_w_rm:
4452 HReg argLhi, argLlo, argRhi, argRlo;
4453 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4454 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4455 HReg dstHi = newVRegV(env);
4456 HReg dstLo = newVRegV(env);
4457 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4458 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4459 /* XXXROUNDINGFIXME */
4460 /* set roundingmode here */
4461 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4462 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4463 *rHi = dstHi;
4464 *rLo = dstLo;
4465 return;
4468 default:
4469 break;
4470 } /* switch (triop->op) */
4471 } /* if (e->tag == Iex_Triop) */
4474 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4475 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4476 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4477 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4478 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4479 // If the args are trivially the same (tmp or const), use the same
4480 // source register for all four, and only one movq since those are
4481 // (relatively) expensive.
4482 if (areAtomsAndEqual(arg1, arg2)
4483 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4484 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4485 HReg tmp = newVRegV(env);
4486 HReg dst = newVRegV(env);
4487 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4488 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4489 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4490 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4491 *rHi = dst;
4492 *rLo = dst;
4493 } else {
4494 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4495 HReg q3 = iselIntExpr_R(env, arg1);
4496 HReg q2 = iselIntExpr_R(env, arg2);
4497 HReg q1 = iselIntExpr_R(env, arg3);
4498 HReg q0 = iselIntExpr_R(env, arg4);
4499 HReg tmp = newVRegV(env);
4500 HReg dstHi = newVRegV(env);
4501 HReg dstLo = newVRegV(env);
4502 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4503 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4504 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4505 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4506 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4507 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4508 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4509 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4510 *rHi = dstHi;
4511 *rLo = dstLo;
4513 return;
4516 if (e->tag == Iex_ITE) {
4517 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4518 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4519 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4520 HReg dstHi = newVRegV(env);
4521 HReg dstLo = newVRegV(env);
4522 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4523 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4524 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4525 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4526 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4527 *rHi = dstHi;
4528 *rLo = dstLo;
4529 return;
4532 //avx_fail:
4533 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4534 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4535 ppIRExpr(e);
4536 vpanic("iselDVecExpr_wrk");
4540 /*---------------------------------------------------------*/
4541 /*--- ISEL: Statements ---*/
4542 /*---------------------------------------------------------*/
4544 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4546 if (vex_traceflags & VEX_TRACE_VCODE) {
4547 vex_printf("\n-- ");
4548 ppIRStmt(stmt);
4549 vex_printf("\n");
4552 switch (stmt->tag) {
4554 /* --------- LOADG (guarded load) --------- */
4555 case Ist_LoadG: {
4556 IRLoadG* lg = stmt->Ist.LoadG.details;
4557 if (lg->end != Iend_LE)
4558 goto stmt_fail;
4560 UChar szB = 0; /* invalid */
4561 switch (lg->cvt) {
4562 case ILGop_Ident32: szB = 4; break;
4563 case ILGop_Ident64: szB = 8; break;
4564 case ILGop_IdentV128: szB = 16; break;
4565 default: break;
4567 if (szB == 0)
4568 goto stmt_fail;
4570 AMD64AMode* amAddr
4571 = iselIntExpr_AMode(env, lg->addr);
4572 HReg rAlt
4573 = szB == 16 ? iselVecExpr(env, lg->alt)
4574 : iselIntExpr_R(env, lg->alt);
4575 HReg rDst
4576 = lookupIRTemp(env, lg->dst);
4578 /* Get the alt value into the dst. We'll do a conditional load
4579 which overwrites it -- or not -- with loaded data. */
4580 if (szB == 16) {
4581 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4582 } else {
4583 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4585 AMD64CondCode cc = iselCondCode(env, lg->guard);
4586 if (szB == 16) {
4587 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4588 } else {
4589 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4591 return;
4594 /* --------- STOREG (guarded store) --------- */
4595 case Ist_StoreG: {
4596 IRStoreG* sg = stmt->Ist.StoreG.details;
4597 if (sg->end != Iend_LE)
4598 goto stmt_fail;
4600 UChar szB = 0; /* invalid */
4601 switch (typeOfIRExpr(env->type_env, sg->data)) {
4602 case Ity_I32: szB = 4; break;
4603 case Ity_I64: szB = 8; break;
4604 case Ity_V128: szB = 16; break;
4605 default: break;
4607 if (szB == 0)
4608 goto stmt_fail;
4610 AMD64AMode* amAddr
4611 = iselIntExpr_AMode(env, sg->addr);
4612 HReg rSrc
4613 = szB == 16 ? iselVecExpr(env, sg->data)
4614 : iselIntExpr_R(env, sg->data);
4615 AMD64CondCode cc
4616 = iselCondCode(env, sg->guard);
4617 if (szB == 16) {
4618 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4619 } else {
4620 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4622 return;
4625 /* --------- STORE --------- */
4626 case Ist_Store: {
4627 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4628 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4629 IREndness end = stmt->Ist.Store.end;
4631 if (tya != Ity_I64 || end != Iend_LE)
4632 goto stmt_fail;
4634 if (tyd == Ity_I64) {
4635 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4636 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4637 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4638 return;
4640 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4641 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4642 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4643 addInstr(env, AMD64Instr_Store(
4644 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4645 r,am));
4646 return;
4648 if (tyd == Ity_F64) {
4649 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4650 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4651 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4652 return;
4654 if (tyd == Ity_F32) {
4655 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4656 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4657 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4658 return;
4660 if (tyd == Ity_V128) {
4661 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4662 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4663 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4664 return;
4666 if (tyd == Ity_V256) {
4667 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4668 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4669 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4670 HReg vHi, vLo;
4671 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4672 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4673 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4674 return;
4676 break;
4679 /* --------- PUT --------- */
4680 case Ist_Put: {
4681 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4682 if (ty == Ity_I64) {
4683 /* We're going to write to memory, so compute the RHS into an
4684 AMD64RI. */
4685 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4686 addInstr(env,
4687 AMD64Instr_Alu64M(
4688 Aalu_MOV,
4689 ri,
4690 AMD64AMode_IR(stmt->Ist.Put.offset,
4691 hregAMD64_RBP())
4692 ));
4693 return;
4695 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4696 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4697 addInstr(env, AMD64Instr_Store(
4698 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4699 r,
4700 AMD64AMode_IR(stmt->Ist.Put.offset,
4701 hregAMD64_RBP())));
4702 return;
4704 if (ty == Ity_F32) {
4705 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4706 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4707 set_SSE_rounding_default(env); /* paranoia */
4708 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4709 return;
4711 if (ty == Ity_F64) {
4712 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4713 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4714 hregAMD64_RBP() );
4715 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4716 return;
4718 if (ty == Ity_V128) {
4719 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4720 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4721 hregAMD64_RBP());
4722 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4723 return;
4725 if (ty == Ity_V256) {
4726 HReg vHi, vLo;
4727 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4728 HReg rbp = hregAMD64_RBP();
4729 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4730 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4731 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4732 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4733 return;
4735 break;
4738 /* --------- Indexed PUT --------- */
4739 case Ist_PutI: {
4740 IRPutI *puti = stmt->Ist.PutI.details;
4742 AMD64AMode* am
4743 = genGuestArrayOffset(
4744 env, puti->descr,
4745 puti->ix, puti->bias );
4747 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4748 if (ty == Ity_F64) {
4749 HReg val = iselDblExpr(env, puti->data);
4750 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4751 return;
4753 if (ty == Ity_I8) {
4754 HReg r = iselIntExpr_R(env, puti->data);
4755 addInstr(env, AMD64Instr_Store( 1, r, am ));
4756 return;
4758 if (ty == Ity_I64) {
4759 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4760 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4761 return;
4763 break;
4766 /* --------- TMP --------- */
4767 case Ist_WrTmp: {
4768 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4769 IRType ty = typeOfIRTemp(env->type_env, tmp);
4771 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4772 compute it into an AMode and then use LEA. This usually
4773 produces fewer instructions, often because (for memcheck-
4774 created IR) we get t = address-expression, with t later used
4775 twice, and so doing this naturally turns the address expression
4776 back into an AMD64 amode. */
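/* For instance an expression like Add64(t1, Shl64(t2, 3)) can
   typically be selected as a single "leaq (%t1,%t2,8), %dst"
   instead of a shift followed by an add. */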
4777 if (ty == Ity_I64
4778 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4779 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4780 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4781 HReg dst = lookupIRTemp(env, tmp);
4782 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4783 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4784 value into a register. Just emit a normal reg-reg move
4785 so reg-alloc can coalesce it away in the usual way. */
4786 HReg src = am->Aam.IR.reg;
4787 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4788 } else {
4789 addInstr(env, AMD64Instr_Lea64(am,dst));
4791 return;
4794 if (ty == Ity_I64 || ty == Ity_I32
4795 || ty == Ity_I16 || ty == Ity_I8) {
4796 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4797 HReg dst = lookupIRTemp(env, tmp);
4798 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4799 return;
4801 if (ty == Ity_I128) {
4802 HReg rHi, rLo, dstHi, dstLo;
4803 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4804 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4805 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4806 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4807 return;
4809 if (ty == Ity_I1) {
4810 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4811 HReg dst = lookupIRTemp(env, tmp);
4812 addInstr(env, AMD64Instr_Set64(cond, dst));
4813 return;
4815 if (ty == Ity_F64) {
4816 HReg dst = lookupIRTemp(env, tmp);
4817 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4818 addInstr(env, mk_vMOVsd_RR(src, dst));
4819 return;
4821 if (ty == Ity_F32) {
4822 HReg dst = lookupIRTemp(env, tmp);
4823 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4824 addInstr(env, mk_vMOVsd_RR(src, dst));
4825 return;
4827 if (ty == Ity_V128) {
4828 HReg dst = lookupIRTemp(env, tmp);
4829 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4830 addInstr(env, mk_vMOVsd_RR(src, dst));
4831 return;
4833 if (ty == Ity_V256) {
4834 HReg rHi, rLo, dstHi, dstLo;
4835 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4836 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4837 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4838 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4839 return;
4841 break;
4844 /* --------- Call to DIRTY helper --------- */
4845 case Ist_Dirty: {
4846 IRDirty* d = stmt->Ist.Dirty.details;
4848 /* Figure out the return type, if any. */
4849 IRType retty = Ity_INVALID;
4850 if (d->tmp != IRTemp_INVALID)
4851 retty = typeOfIRTemp(env->type_env, d->tmp);
4853 /* Throw out any return types we don't know about. */
4854 Bool retty_ok = False;
4855 switch (retty) {
4856 case Ity_INVALID: /* function doesn't return anything */
4857 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4858 case Ity_V128: case Ity_V256:
4859 retty_ok = True; break;
4860 default:
4861 break;
4863 if (!retty_ok)
4864 break; /* will go to stmt_fail: */
4866 /* Marshal args, do the call, and set the return value to
4867 0x555..555 if this is a conditional call that returns a value
4868 and the call is skipped. */
4869 UInt addToSp = 0;
4870 RetLoc rloc = mk_RetLoc_INVALID();
4871 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4872 vassert(is_sane_RetLoc(rloc));
4874 /* Now figure out what to do with the returned value, if any. */
4875 switch (retty) {
4876 case Ity_INVALID: {
4877 /* No return value. Nothing to do. */
4878 vassert(d->tmp == IRTemp_INVALID);
4879 vassert(rloc.pri == RLPri_None);
4880 vassert(addToSp == 0);
4881 return;
4883 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4884 /* The returned value is in %rax. Park it in the register
4885 associated with tmp. */
4886 vassert(rloc.pri == RLPri_Int);
4887 vassert(addToSp == 0);
4888 HReg dst = lookupIRTemp(env, d->tmp);
4889 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4890 return;
4892 case Ity_V128: {
4893 /* The returned value is on the stack, and rloc.spOff
4894 tells us where. Fish it off the stack and then move
4895 the stack pointer upwards to clear it, as directed by
4896 doHelperCall. */
4897 vassert(rloc.pri == RLPri_V128SpRel);
4898 vassert(addToSp >= 16);
4899 HReg dst = lookupIRTemp(env, d->tmp);
4900 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4901 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4902 add_to_rsp(env, addToSp);
4903 return;
4905 case Ity_V256: {
4906 /* See comments for Ity_V128. */
4907 vassert(rloc.pri == RLPri_V256SpRel);
4908 vassert(addToSp >= 32);
4909 HReg dstLo, dstHi;
4910 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4911 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4912 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4913 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4914 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4915 add_to_rsp(env, addToSp);
4916 return;
4918 default:
4919 /*NOTREACHED*/
4920 vassert(0);
4922 break;
4925 /* --------- MEM FENCE --------- */
4926 case Ist_MBE:
4927 switch (stmt->Ist.MBE.event) {
4928 case Imbe_Fence:
4929 addInstr(env, AMD64Instr_MFence());
4930 return;
4931 default:
4932 break;
4934 break;
4936 /* --------- ACAS --------- */
4937 case Ist_CAS:
4938 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4939 /* "normal" singleton CAS */
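/* The singleton CAS is lowered around an x86 compare-and-swap
   (presumably emitted by ACAS as a lock cmpxchg): the expected
   value is placed in %rax (and copied to rOld), the new value in
   %rbx.  If the swap fails (ZF clear) the value actually found in
   memory is left in %rax, and the CMov64 below copies it into
   rOld, so rOld always ends up holding the old memory value, as
   the IR semantics require. */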
4940 UChar sz;
4941 IRCAS* cas = stmt->Ist.CAS.details;
4942 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4943 /* get: cas->expd into %rax, and cas->data into %rbx */
4944 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4945 HReg rData = iselIntExpr_R(env, cas->dataLo);
4946 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4947 HReg rOld = lookupIRTemp(env, cas->oldLo);
4948 vassert(cas->expdHi == NULL);
4949 vassert(cas->dataHi == NULL);
4950 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4951 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4952 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4953 switch (ty) {
4954 case Ity_I64: sz = 8; break;
4955 case Ity_I32: sz = 4; break;
4956 case Ity_I16: sz = 2; break;
4957 case Ity_I8: sz = 1; break;
4958 default: goto unhandled_cas;
4960 addInstr(env, AMD64Instr_ACAS(am, sz));
4961 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4962 return;
4963 } else {
4964 /* double CAS */
4965 UChar sz;
4966 IRCAS* cas = stmt->Ist.CAS.details;
4967 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4968 /* only 32-bit and 64-bit allowed in this case */
4969 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4970 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4971 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4972 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4973 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4974 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4975 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4976 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4977 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4978 switch (ty) {
4979 case Ity_I64:
4980 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4981 goto unhandled_cas; /* we'd have to generate
4982 cmpxchg16b, but the host
4983 doesn't support that */
4984 sz = 8;
4985 break;
4986 case Ity_I32:
4987 sz = 4;
4988 break;
4989 default:
4990 goto unhandled_cas;
4992 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4993 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4994 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4995 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4996 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4997 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4998 addInstr(env, AMD64Instr_DACAS(am, sz));
4999 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
5000 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
5001 return;
5002 }
5003 unhandled_cas:
5004 break;
5006 /* --------- INSTR MARK --------- */
5007 /* Doesn't generate any executable code ... */
5008 case Ist_IMark:
5009 return;
5011 /* --------- ABI HINT --------- */
5012 /* These have no meaning (no denotation) in the IR, and so we ignore
5013 them ... if any actually made it this far. */
5014 case Ist_AbiHint:
5015 return;
5017 /* --------- NO-OP --------- */
5018 case Ist_NoOp:
5019 return;
5021 /* --------- EXIT --------- */
5022 case Ist_Exit: {
5023 if (stmt->Ist.Exit.dst->tag != Ico_U64)
5024 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5026 AMD64CondCode cc = iselCondCode(env, stmt->Ist.Exit.guard);
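/* (Editorial note: the guard is lowered to an amd64 condition code, and
   every exit instruction emitted below, XDirect or XAssisted, is made
   conditional on 'cc', so a false guard simply falls through to the
   next statement of the block.) */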
5027 AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
5028 hregAMD64_RBP());
5030 /* Case: boring transfer to known address */
5031 if (stmt->Ist.Exit.jk == Ijk_Boring) {
5032 if (env->chainingAllowed) {
5033 /* .. almost always true .. */
5034 /* Skip the event check at the dst if this is a forwards
5035 edge. */
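/* (Editorial note on why this is safe, as I understand the chaining
   scheme: event checks exist only to bound how long generated code can
   run before the dispatcher regains control, and any loop in guest
   code must contain at least one backwards edge.  So it suffices to
   perform the check on backwards edges, and a jump to an address above
   max_ga, being a forwards edge, may enter at the fast entry point,
   which skips the check.) */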
5036 Bool toFastEP
5037 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
5038 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
5039 addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
5040 amRIP, cc, toFastEP));
5041 } else {
5042 /* .. very occasionally .. */
5043 /* We can't use chaining, so ask for an assisted transfer,
5044 as that's the only alternative that is allowable. */
5045 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5046 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
5047 }
5048 return;
5049 }
5051 /* Case: assisted transfer to arbitrary address */
5052 switch (stmt->Ist.Exit.jk) {
5053 /* Keep this list in sync with that in iselNext below */
5054 case Ijk_ClientReq:
5055 case Ijk_EmWarn:
5056 case Ijk_NoDecode:
5057 case Ijk_NoRedir:
5058 case Ijk_SigSEGV:
5059 case Ijk_SigTRAP:
5060 case Ijk_Sys_syscall:
5061 case Ijk_Sys_int210:
5062 case Ijk_InvalICache:
5063 case Ijk_Yield:
5064 {
5065 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5066 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
5067 return;
5068 }
5069 default:
5070 break;
5071 }
5073 /* Do we ever expect to see any other kind? */
5074 goto stmt_fail;
5075 }
5077 default: break;
5078 }
5079 stmt_fail:
5080 ppIRStmt(stmt);
5081 vpanic("iselStmt(amd64)");
5082 }
5085 /*---------------------------------------------------------*/
5086 /*--- ISEL: Basic block terminators (Nexts) ---*/
5087 /*---------------------------------------------------------*/
5089 static void iselNext ( ISelEnv* env,
5090 IRExpr* next, IRJumpKind jk, Int offsIP )
5091 {
5092 if (vex_traceflags & VEX_TRACE_VCODE) {
5093 vex_printf( "\n-- PUT(%d) = ", offsIP);
5094 ppIRExpr( next );
5095 vex_printf( "; exit-");
5096 ppIRJumpKind(jk);
5097 vex_printf( "\n");
5098 }
5100 /* Case: boring transfer to known address */
5101 if (next->tag == Iex_Const) {
5102 IRConst* cdst = next->Iex.Const.con;
5103 vassert(cdst->tag == Ico_U64);
5104 if (jk == Ijk_Boring || jk == Ijk_Call) {
5105 /* Boring transfer to known address */
5106 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5107 if (env->chainingAllowed) {
5108 /* .. almost always true .. */
5109 /* Skip the event check at the dst if this is a forwards
5110 edge. */
5111 Bool toFastEP
5112 = ((Addr64)cdst->Ico.U64) > env->max_ga;
5113 if (0) vex_printf("%s", toFastEP ? "X" : ".");
5114 addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5115 amRIP, Acc_ALWAYS,
5116 toFastEP));
5117 } else {
5118 /* .. very occasionally .. */
5119 /* We can't use chaining, so ask for an assisted transfer,
5120 as that's the only alternative that is
5121 allowable. */
5122 HReg r = iselIntExpr_R(env, next);
5123 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5124 Ijk_Boring));
5125 }
5126 return;
5127 }
5128 }
5130 /* Case: call/return (==boring) transfer to any address */
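/* (Editorial note, reflecting my understanding of the two forms used
   below: XIndir is an unassisted indirect transfer that lets the
   dispatcher try its fast translation-cache lookup, whereas XAssisted
   always returns to the scheduler together with a reason, the
   IRJumpKind.  XIndir is therefore preferred whenever chaining is
   allowed.) */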
5131 switch (jk) {
5132 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5133 HReg r = iselIntExpr_R(env, next);
5134 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5135 if (env->chainingAllowed) {
5136 addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5137 } else {
5138 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5139 Ijk_Boring));
5140 }
5141 return;
5142 }
5143 default:
5144 break;
5145 }
5147 /* Case: assisted transfer to arbitrary address */
5148 switch (jk) {
5149 /* Keep this list in sync with that for Ist_Exit above */
5150 case Ijk_ClientReq:
5151 case Ijk_EmWarn:
5152 case Ijk_NoDecode:
5153 case Ijk_NoRedir:
5154 case Ijk_SigSEGV:
5155 case Ijk_SigTRAP:
5156 case Ijk_Sys_syscall:
5157 case Ijk_Sys_int210:
5158 case Ijk_InvalICache:
5159 case Ijk_Yield: {
5160 HReg r = iselIntExpr_R(env, next);
5161 AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5162 addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5163 return;
5164 }
5165 default:
5166 break;
5167 }
5169 vex_printf( "\n-- PUT(%d) = ", offsIP);
5170 ppIRExpr( next );
5171 vex_printf( "; exit-");
5172 ppIRJumpKind(jk);
5173 vex_printf( "\n");
5174 vassert(0); // are we expecting any other kind?
5175 }
5178 /*---------------------------------------------------------*/
5179 /*--- Insn selector top-level ---*/
5180 /*---------------------------------------------------------*/
5182 /* Translate an entire SB to amd64 code. */
5184 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5185 VexArch arch_host,
5186 const VexArchInfo* archinfo_host,
5187 const VexAbiInfo* vbi/*UNUSED*/,
5188 Int offs_Host_EvC_Counter,
5189 Int offs_Host_EvC_FailAddr,
5190 Bool chainingAllowed,
5191 Bool addProfInc,
5192 Addr max_ga )
5193 {
5194 Int i, j;
5195 HReg hreg, hregHI;
5196 ISelEnv* env;
5197 UInt hwcaps_host = archinfo_host->hwcaps;
5198 AMD64AMode *amCounter, *amFailAddr;
5200 /* sanity ... */
5201 vassert(arch_host == VexArchAMD64);
5202 vassert(0 == (hwcaps_host
5203 & ~(VEX_HWCAPS_AMD64_SSE3
5204 | VEX_HWCAPS_AMD64_SSSE3
5205 | VEX_HWCAPS_AMD64_CX16
5206 | VEX_HWCAPS_AMD64_LZCNT
5207 | VEX_HWCAPS_AMD64_AVX
5208 | VEX_HWCAPS_AMD64_RDTSCP
5209 | VEX_HWCAPS_AMD64_BMI
5210 | VEX_HWCAPS_AMD64_AVX2
5211 | VEX_HWCAPS_AMD64_F16C
5212 | VEX_HWCAPS_AMD64_RDRAND)));
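/* (Editorial note: this assertion simply rejects any hwcaps bits that
   this instruction selector does not know about, so a newly introduced
   VEX_HWCAPS_AMD64_* flag must be added to the list above before
   translations can be requested with it set.) */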
5214 /* Check that the host's endianness is as expected. */
5215 vassert(archinfo_host->endness == VexEndnessLE);
5217 /* Make up an initial environment to use. */
5218 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5219 env->vreg_ctr = 0;
5221 /* Set up output code array. */
5222 env->code = newHInstrArray();
5224 /* Copy BB's type env. */
5225 env->type_env = bb->tyenv;
5227 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
5228 change as we go along. */
5229 env->n_vregmap = bb->tyenv->types_used;
5230 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5231 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5233 /* and finally ... */
5234 env->chainingAllowed = chainingAllowed;
5235 env->hwcaps = hwcaps_host;
5236 env->max_ga = max_ga;
5238 /* For each IR temporary, allocate a suitably-kinded virtual
5239 register. */
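/* (Editorial note, inferred from the switch below: I128 and V256
   temporaries do not fit in a single host register, so they get a pair
   of vregs, the low half in vregmap[] and the high half in
   vregmapHI[]; lookupIRTempPair elsewhere in this file depends on that
   layout.  Note also that F32 and F64 temporaries are carried in
   128-bit vector vregs, since this backend does scalar floating point
   in the SSE registers rather than on the x87 stack.) */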
5240 j = 0;
5241 for (i = 0; i < env->n_vregmap; i++) {
5242 hregHI = hreg = INVALID_HREG;
5243 switch (bb->tyenv->types[i]) {
5244 case Ity_I1:
5245 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5246 hreg = mkHReg(True, HRcInt64, 0, j++);
5247 break;
5248 case Ity_I128:
5249 hreg = mkHReg(True, HRcInt64, 0, j++);
5250 hregHI = mkHReg(True, HRcInt64, 0, j++);
5251 break;
5252 case Ity_F32:
5253 case Ity_F64:
5254 case Ity_V128:
5255 hreg = mkHReg(True, HRcVec128, 0, j++);
5256 break;
5257 case Ity_V256:
5258 hreg = mkHReg(True, HRcVec128, 0, j++);
5259 hregHI = mkHReg(True, HRcVec128, 0, j++);
5260 break;
5261 default:
5262 ppIRType(bb->tyenv->types[i]);
5263 vpanic("iselBB(amd64): IRTemp type");
5264 }
5265 env->vregmap[i] = hreg;
5266 env->vregmapHI[i] = hregHI;
5267 }
5268 env->vreg_ctr = j;
5270 /* The very first instruction must be an event check. */
5271 amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
5272 amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5273 addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
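/* (Editorial note, describing my understanding of EvCheck: it
   decrements the counter found at amCounter and, if the result goes
   negative, jumps to the address stored at amFailAddr, handing control
   back to the scheduler.  Making it the first instruction means every
   entry through the slow entry point performs the check, while chained
   jumps to the fast entry point bypass it.) */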
5275 /* Possibly a block counter increment (for profiling). At this
5276 point we don't know the address of the counter, so just pretend
5277 it is zero. It will have to be patched later, but before this
5278 translation is used, by a call to LibVEX_patchProfCtr. */
5279 if (addProfInc) {
5280 addInstr(env, AMD64Instr_ProfInc());
5281 }
5283 /* Ok, finally we can iterate over the statements. */
5284 for (i = 0; i < bb->stmts_used; i++)
5285 if (bb->stmts[i])
5286 iselStmt(env, bb->stmts[i]);
5288 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5290 /* record the number of vregs we used. */
5291 env->code->n_vregs = env->vreg_ctr;
5292 return env->code;
5293 }
5296 /*---------------------------------------------------------------*/
5297 /*--- end host_amd64_isel.c ---*/
5298 /*---------------------------------------------------------------*/