2 /*--------------------------------------------------------------------*/
3 /*--- begin guest_amd64_toIR.c ---*/
4 /*--------------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 /* Translates AMD64 code to IR. */
36 /* TODO:
38 All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
39 to ensure a 64-bit value is being written.
41 x87 FP Limitations:
43 * all arithmetic done at 64 bits
45 * no FP exceptions, except for handling stack over/underflow
47 * FP rounding mode observed only for float->int conversions and
48 int->float conversions which could lose accuracy, and for
49 float-to-float rounding. For all other operations,
50 round-to-nearest is used, regardless.
52 * some of the FCOM cases could do with testing -- not convinced
53 that the args are the right way round.
55 * FSAVE does not re-initialise the FPU; it should do
57 * FINIT not only initialises the FPU environment, it also zeroes
58 all the FP registers. It should leave the registers unchanged.
60 SAHF should cause eflags[1] == 1, and in fact it produces 0. As
61 per Intel docs this bit has no meaning anyway. Since PUSHF is the
62 only way to observe eflags[1], a proper fix would be to make that
63 bit be set by PUSHF.
65 This module uses global variables and so is not MT-safe (if that
66 should ever become relevant).
69 /* Notes re address size overrides (0x67).
71 According to the AMD documentation (24594 Rev 3.09, Sept 2003,
72 "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
73 and System Instructions"), Section 1.2.3 ("Address-Size Override
74 Prefix"):
76 0x67 applies to all explicit memory references, causing the top
77 32 bits of the effective address to become zero.
79 0x67 has no effect on stack references (push/pop); these always
80 use a 64-bit address.
82 0x67 changes the interpretation of instructions which implicitly
83 reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
84 instead. These are:
86 cmp{s,sb,sw,sd,sq}
87 in{s,sb,sw,sd}
88 jcxz, jecxz, jrcxz
89 lod{s,sb,sw,sd,sq}
90 loop{,e,bz,be,z}
91 mov{s,sb,sw,sd,sq}
92 out{s,sb,sw,sd}
93 rep{,e,ne,nz}
94 sca{s,sb,sw,sd,sq}
95 sto{s,sb,sw,sd,sq}
96 xlat{,b} */
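/* A minimal sketch (illustrative only, hypothetical helper name, not
   part of this decoder) of the 0x67 effect described above: the top
   32 bits of the computed effective address are simply forced to
   zero. */
static unsigned long long sketch_apply_addr_size_override
   ( unsigned long long ea )
{
   return ea & 0xFFFFFFFFULL;   /* keep only the low 32 bits */
}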
98 /* "Special" instructions.
100 This instruction decoder can decode three special instructions
101 which mean nothing natively (are no-ops as far as regs/mem are
102 concerned) but have meaning for supporting Valgrind. A special
103 instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
104 48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
105 $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
 106    Following that, one of the following 4 is allowed (standard
107 interpretation in parentheses):
109 4887DB (xchgq %rbx,%rbx) %RDX = client_request ( %RAX )
110 4887C9 (xchgq %rcx,%rcx) %RAX = guest_NRADDR
111 4887D2 (xchgq %rdx,%rdx) call-noredir *%RAX
112 4887F6 (xchgq %rdi,%rdi) IR injection
114 Any other bytes following the 16-byte preamble are illegal and
115 constitute a failure in instruction decoding. This all assumes
116 that the preamble will never occur except in specific code
117 fragments designed for Valgrind to catch.
119 No prefixes may precede a "Special" instruction.
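/* A minimal sketch (illustrative only, hypothetical helper name) of
   how the 16-byte preamble quoted above could be recognised, assuming
   'code' points at the candidate bytes. */
static int sketch_is_special_preamble ( const unsigned char* code )
{
   static const unsigned char preamble[16]
      = { 0x48,0xC1,0xC7,0x03,  0x48,0xC1,0xC7,0x0D,
          0x48,0xC1,0xC7,0x3D,  0x48,0xC1,0xC7,0x33 };
   int i;
   for (i = 0; i < 16; i++)
      if (code[i] != preamble[i]) return 0;
   return 1;
}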
122 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
123 insns: the side-exit back to the start of the insn is done with
124 Ijk_Boring. This is quite wrong, it should be done with
125 Ijk_NoRedir, since otherwise the side exit, which is intended to
126 restart the instruction for whatever reason, could go somewhere
127 entirely else. Doing it right (with Ijk_NoRedir jumps) would make
128 no-redir jumps performance critical, at least for rep-prefixed
129 instructions, since all iterations thereof would involve such a
130 jump. It's not such a big deal with casLE since the side exit is
131 only taken if the CAS fails, that is, the location is contended,
132 which is relatively unlikely.
134 Note also, the test for CAS success vs failure is done using
135 Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
136 Iop_Cmp{EQ,NE} equivalents. This is so as to tell Memcheck that it
137 shouldn't definedness-check these comparisons. See
138 COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
139 background/rationale.
142 /* LOCK prefixed instructions. These are translated using IR-level
143 CAS statements (IRCAS) and are believed to preserve atomicity, even
144 from the point of view of some other process racing against a
145 simulated one (presumably they communicate via a shared memory
146 segment).
148 Handlers which are aware of LOCK prefixes are:
149 dis_op2_G_E (add, or, adc, sbb, and, sub, xor)
150 dis_cmpxchg_G_E (cmpxchg)
151 dis_Grp1 (add, or, adc, sbb, and, sub, xor)
152 dis_Grp3 (not, neg)
153 dis_Grp4 (inc, dec)
154 dis_Grp5 (inc, dec)
155 dis_Grp8_Imm (bts, btc, btr)
156 dis_bt_G_E (bts, btc, btr)
157 dis_xadd_G_E (xadd)
161 #include "libvex_basictypes.h"
162 #include "libvex_ir.h"
163 #include "libvex.h"
164 #include "libvex_guest_amd64.h"
166 #include "main_util.h"
167 #include "main_globals.h"
168 #include "guest_generic_bb_to_IR.h"
169 #include "guest_generic_x87.h"
170 #include "guest_amd64_defs.h"
173 /*------------------------------------------------------------*/
174 /*--- Globals ---*/
175 /*------------------------------------------------------------*/
177 /* These are set at the start of the translation of an insn, right
178 down in disInstr_AMD64, so that we don't have to pass them around
179 endlessly. They are all constant during the translation of any
180 given insn. */
182 /* These are set at the start of the translation of a BB, so
183 that we don't have to pass them around endlessly. */
185 /* We need to know this to do sub-register accesses correctly. */
186 static VexEndness host_endness;
188 /* Pointer to the guest code area (points to start of BB, not to the
189 insn being processed). */
190 static const UChar* guest_code;
192 /* The guest address corresponding to guest_code[0]. */
193 static Addr64 guest_RIP_bbstart;
195 /* The guest address for the instruction currently being
196 translated. */
197 static Addr64 guest_RIP_curr_instr;
199 /* The IRSB* into which we're generating code. */
200 static IRSB* irsb;
202 /* For ensuring that %rip-relative addressing is done right. A read
203 of %rip generates the address of the next instruction. It may be
204 that we don't conveniently know that inside disAMode(). For sanity
205 checking, if the next insn %rip is needed, we make a guess at what
206 it is, record that guess here, and set the accompanying Bool to
207 indicate that -- after this insn's decode is finished -- that guess
208 needs to be checked. */
 210 /* At the start of each insn decode, this pair is set to (0, False).
211 After the decode, if _mustcheck is now True, _assumed is
212 checked. */
214 static Addr64 guest_RIP_next_assumed;
215 static Bool guest_RIP_next_mustcheck;
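/* A minimal sketch (illustrative only, hypothetical helper name) of
   the post-decode check described above: once the length of the insn
   just decoded is known, verify the guessed next %rip against the
   real one. */
static void sketch_check_rip_next_guess ( Addr64 actual_next_rip )
{
   if (guest_RIP_next_mustcheck)
      vassert(guest_RIP_next_assumed == actual_next_rip);
}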
218 /*------------------------------------------------------------*/
219 /*--- Helpers for constructing IR. ---*/
220 /*------------------------------------------------------------*/
222 /* Generate a new temporary of the given type. */
223 static IRTemp newTemp ( IRType ty )
225 vassert(isPlausibleIRType(ty));
226 return newIRTemp( irsb->tyenv, ty );
229 /* Add a statement to the list held by "irsb". */
230 static void stmt ( IRStmt* st )
232 addStmtToIRSB( irsb, st );
235 /* Generate a statement "dst := e". */
236 static void assign ( IRTemp dst, IRExpr* e )
238 stmt( IRStmt_WrTmp(dst, e) );
241 static IRExpr* unop ( IROp op, IRExpr* a )
243 return IRExpr_Unop(op, a);
246 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
248 return IRExpr_Binop(op, a1, a2);
251 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
253 return IRExpr_Triop(op, a1, a2, a3);
256 static IRExpr* mkexpr ( IRTemp tmp )
258 return IRExpr_RdTmp(tmp);
261 static IRExpr* mkU8 ( ULong i )
263 vassert(i < 256);
264 return IRExpr_Const(IRConst_U8( (UChar)i ));
267 static IRExpr* mkU16 ( ULong i )
269 vassert(i < 0x10000ULL);
270 return IRExpr_Const(IRConst_U16( (UShort)i ));
273 static IRExpr* mkU32 ( ULong i )
275 vassert(i < 0x100000000ULL);
276 return IRExpr_Const(IRConst_U32( (UInt)i ));
279 static IRExpr* mkU64 ( ULong i )
281 return IRExpr_Const(IRConst_U64(i));
284 static IRExpr* mkU ( IRType ty, ULong i )
286 switch (ty) {
287 case Ity_I8: return mkU8(i);
288 case Ity_I16: return mkU16(i);
289 case Ity_I32: return mkU32(i);
290 case Ity_I64: return mkU64(i);
291 default: vpanic("mkU(amd64)");
295 static void storeLE ( IRExpr* addr, IRExpr* data )
297 stmt( IRStmt_Store(Iend_LE, addr, data) );
300 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
302 return IRExpr_Load(Iend_LE, ty, addr);
305 static IROp mkSizedOp ( IRType ty, IROp op8 )
307 vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
308 || op8 == Iop_Mul8
309 || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
310 || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
311 || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
312 || op8 == Iop_CasCmpNE8
313 || op8 == Iop_Not8 );
314 switch (ty) {
315 case Ity_I8: return 0 +op8;
316 case Ity_I16: return 1 +op8;
317 case Ity_I32: return 2 +op8;
318 case Ity_I64: return 3 +op8;
319 default: vpanic("mkSizedOp(amd64)");
323 static
324 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
326 if (szSmall == 1 && szBig == 4) {
327 return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
329 if (szSmall == 1 && szBig == 2) {
330 return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
332 if (szSmall == 2 && szBig == 4) {
333 return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
335 if (szSmall == 1 && szBig == 8 && !signd) {
336 return unop(Iop_8Uto64, src);
338 if (szSmall == 1 && szBig == 8 && signd) {
339 return unop(Iop_8Sto64, src);
341 if (szSmall == 2 && szBig == 8 && !signd) {
342 return unop(Iop_16Uto64, src);
344 if (szSmall == 2 && szBig == 8 && signd) {
345 return unop(Iop_16Sto64, src);
347 vpanic("doScalarWidening(amd64)");
350 static
351 void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
353 IRType ty = typeOfIRExpr(irsb->tyenv, value);
354 stmt( IRStmt_Put(gstOffB,
355 IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
359 /*------------------------------------------------------------*/
360 /*--- Debugging output ---*/
361 /*------------------------------------------------------------*/
363 /* Bomb out if we can't handle something. */
364 __attribute__ ((noreturn))
365 static void unimplemented ( const HChar* str )
367 vex_printf("amd64toIR: unimplemented feature\n");
368 vpanic(str);
371 #define DIP(format, args...) \
372 if (vex_traceflags & VEX_TRACE_FE) \
373 vex_printf(format, ## args)
375 #define DIS(buf, format, args...) \
376 if (vex_traceflags & VEX_TRACE_FE) \
377 vex_sprintf(buf, format, ## args)
380 /*------------------------------------------------------------*/
381 /*--- Offsets of various parts of the amd64 guest state. ---*/
382 /*------------------------------------------------------------*/
384 #define OFFB_RAX offsetof(VexGuestAMD64State,guest_RAX)
385 #define OFFB_RBX offsetof(VexGuestAMD64State,guest_RBX)
386 #define OFFB_RCX offsetof(VexGuestAMD64State,guest_RCX)
387 #define OFFB_RDX offsetof(VexGuestAMD64State,guest_RDX)
388 #define OFFB_RSP offsetof(VexGuestAMD64State,guest_RSP)
389 #define OFFB_RBP offsetof(VexGuestAMD64State,guest_RBP)
390 #define OFFB_RSI offsetof(VexGuestAMD64State,guest_RSI)
391 #define OFFB_RDI offsetof(VexGuestAMD64State,guest_RDI)
392 #define OFFB_R8 offsetof(VexGuestAMD64State,guest_R8)
393 #define OFFB_R9 offsetof(VexGuestAMD64State,guest_R9)
394 #define OFFB_R10 offsetof(VexGuestAMD64State,guest_R10)
395 #define OFFB_R11 offsetof(VexGuestAMD64State,guest_R11)
396 #define OFFB_R12 offsetof(VexGuestAMD64State,guest_R12)
397 #define OFFB_R13 offsetof(VexGuestAMD64State,guest_R13)
398 #define OFFB_R14 offsetof(VexGuestAMD64State,guest_R14)
399 #define OFFB_R15 offsetof(VexGuestAMD64State,guest_R15)
401 #define OFFB_RIP offsetof(VexGuestAMD64State,guest_RIP)
403 #define OFFB_FS_CONST offsetof(VexGuestAMD64State,guest_FS_CONST)
404 #define OFFB_GS_CONST offsetof(VexGuestAMD64State,guest_GS_CONST)
406 #define OFFB_CC_OP offsetof(VexGuestAMD64State,guest_CC_OP)
407 #define OFFB_CC_DEP1 offsetof(VexGuestAMD64State,guest_CC_DEP1)
408 #define OFFB_CC_DEP2 offsetof(VexGuestAMD64State,guest_CC_DEP2)
409 #define OFFB_CC_NDEP offsetof(VexGuestAMD64State,guest_CC_NDEP)
411 #define OFFB_FPREGS offsetof(VexGuestAMD64State,guest_FPREG[0])
412 #define OFFB_FPTAGS offsetof(VexGuestAMD64State,guest_FPTAG[0])
413 #define OFFB_DFLAG offsetof(VexGuestAMD64State,guest_DFLAG)
414 #define OFFB_ACFLAG offsetof(VexGuestAMD64State,guest_ACFLAG)
415 #define OFFB_IDFLAG offsetof(VexGuestAMD64State,guest_IDFLAG)
416 #define OFFB_FTOP offsetof(VexGuestAMD64State,guest_FTOP)
417 #define OFFB_FC3210 offsetof(VexGuestAMD64State,guest_FC3210)
418 #define OFFB_FPROUND offsetof(VexGuestAMD64State,guest_FPROUND)
420 #define OFFB_SSEROUND offsetof(VexGuestAMD64State,guest_SSEROUND)
421 #define OFFB_YMM0 offsetof(VexGuestAMD64State,guest_YMM0)
422 #define OFFB_YMM1 offsetof(VexGuestAMD64State,guest_YMM1)
423 #define OFFB_YMM2 offsetof(VexGuestAMD64State,guest_YMM2)
424 #define OFFB_YMM3 offsetof(VexGuestAMD64State,guest_YMM3)
425 #define OFFB_YMM4 offsetof(VexGuestAMD64State,guest_YMM4)
426 #define OFFB_YMM5 offsetof(VexGuestAMD64State,guest_YMM5)
427 #define OFFB_YMM6 offsetof(VexGuestAMD64State,guest_YMM6)
428 #define OFFB_YMM7 offsetof(VexGuestAMD64State,guest_YMM7)
429 #define OFFB_YMM8 offsetof(VexGuestAMD64State,guest_YMM8)
430 #define OFFB_YMM9 offsetof(VexGuestAMD64State,guest_YMM9)
431 #define OFFB_YMM10 offsetof(VexGuestAMD64State,guest_YMM10)
432 #define OFFB_YMM11 offsetof(VexGuestAMD64State,guest_YMM11)
433 #define OFFB_YMM12 offsetof(VexGuestAMD64State,guest_YMM12)
434 #define OFFB_YMM13 offsetof(VexGuestAMD64State,guest_YMM13)
435 #define OFFB_YMM14 offsetof(VexGuestAMD64State,guest_YMM14)
436 #define OFFB_YMM15 offsetof(VexGuestAMD64State,guest_YMM15)
437 #define OFFB_YMM16 offsetof(VexGuestAMD64State,guest_YMM16)
439 #define OFFB_EMNOTE offsetof(VexGuestAMD64State,guest_EMNOTE)
440 #define OFFB_CMSTART offsetof(VexGuestAMD64State,guest_CMSTART)
441 #define OFFB_CMLEN offsetof(VexGuestAMD64State,guest_CMLEN)
443 #define OFFB_NRADDR offsetof(VexGuestAMD64State,guest_NRADDR)
446 /*------------------------------------------------------------*/
447 /*--- Helper bits and pieces for deconstructing the ---*/
448 /*--- amd64 insn stream. ---*/
449 /*------------------------------------------------------------*/
451 /* This is the AMD64 register encoding -- integer regs. */
452 #define R_RAX 0
453 #define R_RCX 1
454 #define R_RDX 2
455 #define R_RBX 3
456 #define R_RSP 4
457 #define R_RBP 5
458 #define R_RSI 6
459 #define R_RDI 7
460 #define R_R8 8
461 #define R_R9 9
462 #define R_R10 10
463 #define R_R11 11
464 #define R_R12 12
465 #define R_R13 13
466 #define R_R14 14
467 #define R_R15 15
469 /* This is the Intel register encoding -- segment regs. */
470 #define R_ES 0
471 #define R_CS 1
472 #define R_SS 2
473 #define R_DS 3
474 #define R_FS 4
475 #define R_GS 5
478 /* Various simple conversions */
480 static ULong extend_s_8to64 ( UChar x )
482 return (ULong)((Long)(((ULong)x) << 56) >> 56);
485 static ULong extend_s_16to64 ( UShort x )
487 return (ULong)((Long)(((ULong)x) << 48) >> 48);
490 static ULong extend_s_32to64 ( UInt x )
492 return (ULong)((Long)(((ULong)x) << 32) >> 32);
495 /* Figure out whether the mod and rm parts of a modRM byte refer to a
496 register or memory. If so, the byte will have the form 11XXXYYY,
497 where YYY is the register number. */
498 inline
499 static Bool epartIsReg ( UChar mod_reg_rm )
501 return toBool(0xC0 == (mod_reg_rm & 0xC0));
504 /* Extract the 'g' field from a modRM byte. This only produces 3
505 bits, which is not a complete register number. You should avoid
506 this function if at all possible. */
507 inline
508 static Int gregLO3ofRM ( UChar mod_reg_rm )
510 return (Int)( (mod_reg_rm >> 3) & 7 );
513 /* Ditto the 'e' field of a modRM byte. */
514 inline
515 static Int eregLO3ofRM ( UChar mod_reg_rm )
517 return (Int)(mod_reg_rm & 0x7);
 520 /* Get an 8/16/32-bit unsigned value out of the insn stream. */
522 static inline UChar getUChar ( Long delta )
524 UChar v = guest_code[delta+0];
525 return v;
528 static UInt getUDisp16 ( Long delta )
530 UInt v = guest_code[delta+1]; v <<= 8;
531 v |= guest_code[delta+0];
532 return v & 0xFFFF;
535 //.. static UInt getUDisp ( Int size, Long delta )
536 //.. {
537 //.. switch (size) {
538 //.. case 4: return getUDisp32(delta);
539 //.. case 2: return getUDisp16(delta);
540 //.. case 1: return getUChar(delta);
541 //.. default: vpanic("getUDisp(x86)");
542 //.. }
543 //.. return 0; /*notreached*/
544 //.. }
547 /* Get a byte value out of the insn stream and sign-extend to 64
548 bits. */
549 static Long getSDisp8 ( Long delta )
551 return extend_s_8to64( guest_code[delta] );
554 /* Get a 16-bit value out of the insn stream and sign-extend to 64
555 bits. */
556 static Long getSDisp16 ( Long delta )
558 UInt v = guest_code[delta+1]; v <<= 8;
559 v |= guest_code[delta+0];
560 return extend_s_16to64( (UShort)v );
563 /* Get a 32-bit value out of the insn stream and sign-extend to 64
564 bits. */
565 static Long getSDisp32 ( Long delta )
567 UInt v = guest_code[delta+3]; v <<= 8;
568 v |= guest_code[delta+2]; v <<= 8;
569 v |= guest_code[delta+1]; v <<= 8;
570 v |= guest_code[delta+0];
571 return extend_s_32to64( v );
574 /* Get a 64-bit value out of the insn stream. */
575 static Long getDisp64 ( Long delta )
577 ULong v = 0;
578 v |= guest_code[delta+7]; v <<= 8;
579 v |= guest_code[delta+6]; v <<= 8;
580 v |= guest_code[delta+5]; v <<= 8;
581 v |= guest_code[delta+4]; v <<= 8;
582 v |= guest_code[delta+3]; v <<= 8;
583 v |= guest_code[delta+2]; v <<= 8;
584 v |= guest_code[delta+1]; v <<= 8;
585 v |= guest_code[delta+0];
586 return v;
589 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
590 if this is called with size==8. Should not happen. */
591 static Long getSDisp ( Int size, Long delta )
593 switch (size) {
594 case 4: return getSDisp32(delta);
595 case 2: return getSDisp16(delta);
596 case 1: return getSDisp8(delta);
597 default: vpanic("getSDisp(amd64)");
601 static ULong mkSizeMask ( Int sz )
603 switch (sz) {
604 case 1: return 0x00000000000000FFULL;
605 case 2: return 0x000000000000FFFFULL;
606 case 4: return 0x00000000FFFFFFFFULL;
607 case 8: return 0xFFFFFFFFFFFFFFFFULL;
608 default: vpanic("mkSzMask(amd64)");
612 static Int imin ( Int a, Int b )
614 return (a < b) ? a : b;
617 static IRType szToITy ( Int n )
619 switch (n) {
620 case 1: return Ity_I8;
621 case 2: return Ity_I16;
622 case 4: return Ity_I32;
623 case 8: return Ity_I64;
624 default: vex_printf("\nszToITy(%d)\n", n);
625 vpanic("szToITy(amd64)");
630 /*------------------------------------------------------------*/
631 /*--- For dealing with prefixes. ---*/
632 /*------------------------------------------------------------*/
634 /* The idea is to pass around an int holding a bitmask summarising
635 info from the prefixes seen on the current instruction, including
636 info from the REX byte. This info is used in various places, but
637 most especially when making sense of register fields in
638 instructions.
640 The top 8 bits of the prefix are 0x55, just as a hacky way to
641 ensure it really is a valid prefix.
643 Things you can safely assume about a well-formed prefix:
644 * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
645 * if REX is not present then REXW,REXR,REXX,REXB will read
646 as zero.
647 * F2 and F3 will not both be 1.
650 typedef UInt Prefix;
652 #define PFX_ASO (1<<0) /* address-size override present (0x67) */
653 #define PFX_66 (1<<1) /* operand-size override-to-16 present (0x66) */
654 #define PFX_REX (1<<2) /* REX byte present (0x40 to 0x4F) */
655 #define PFX_REXW (1<<3) /* REX W bit, if REX present, else 0 */
656 #define PFX_REXR (1<<4) /* REX R bit, if REX present, else 0 */
657 #define PFX_REXX (1<<5) /* REX X bit, if REX present, else 0 */
658 #define PFX_REXB (1<<6) /* REX B bit, if REX present, else 0 */
659 #define PFX_LOCK (1<<7) /* bus LOCK prefix present (0xF0) */
 660 #define PFX_F2    (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
 661 #define PFX_F3    (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
662 #define PFX_CS (1<<10) /* CS segment prefix present (0x2E) */
663 #define PFX_DS (1<<11) /* DS segment prefix present (0x3E) */
664 #define PFX_ES (1<<12) /* ES segment prefix present (0x26) */
665 #define PFX_FS (1<<13) /* FS segment prefix present (0x64) */
666 #define PFX_GS (1<<14) /* GS segment prefix present (0x65) */
667 #define PFX_SS (1<<15) /* SS segment prefix present (0x36) */
668 #define PFX_VEX (1<<16) /* VEX prefix present (0xC4 or 0xC5) */
669 #define PFX_VEXL (1<<17) /* VEX L bit, if VEX present, else 0 */
670 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
671 PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
672 positions. */
673 #define PFX_VEXnV0 (1<<18) /* ~VEX vvvv[0], if VEX present, else 0 */
674 #define PFX_VEXnV1 (1<<19) /* ~VEX vvvv[1], if VEX present, else 0 */
675 #define PFX_VEXnV2 (1<<20) /* ~VEX vvvv[2], if VEX present, else 0 */
676 #define PFX_VEXnV3 (1<<21) /* ~VEX vvvv[3], if VEX present, else 0 */
679 #define PFX_EMPTY 0x55000000
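/* A minimal sketch (illustrative only, hypothetical helper name) of a
   well-formed Prefix word: an insn carrying an operand-size prefix
   (0x66) and a REX.W byte is summarised as the 0x55 marker in the top
   bits plus the individual flag bits. */
static Prefix sketch_pfx_66_rexW ( void )
{
   return PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXW;
}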
681 static Bool IS_VALID_PFX ( Prefix pfx ) {
682 return toBool((pfx & 0xFF000000) == PFX_EMPTY);
685 static Bool haveREX ( Prefix pfx ) {
686 return toBool(pfx & PFX_REX);
689 static Int getRexW ( Prefix pfx ) {
690 return (pfx & PFX_REXW) ? 1 : 0;
692 static Int getRexR ( Prefix pfx ) {
693 return (pfx & PFX_REXR) ? 1 : 0;
695 static Int getRexX ( Prefix pfx ) {
696 return (pfx & PFX_REXX) ? 1 : 0;
698 static Int getRexB ( Prefix pfx ) {
699 return (pfx & PFX_REXB) ? 1 : 0;
702 /* Check a prefix doesn't have F2 or F3 set in it, since usually that
703 completely changes what instruction it really is. */
704 static Bool haveF2orF3 ( Prefix pfx ) {
705 return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
707 static Bool haveF2andF3 ( Prefix pfx ) {
708 return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
710 static Bool haveF2 ( Prefix pfx ) {
711 return toBool((pfx & PFX_F2) > 0);
713 static Bool haveF3 ( Prefix pfx ) {
714 return toBool((pfx & PFX_F3) > 0);
717 static Bool have66 ( Prefix pfx ) {
718 return toBool((pfx & PFX_66) > 0);
720 static Bool haveASO ( Prefix pfx ) {
721 return toBool((pfx & PFX_ASO) > 0);
723 static Bool haveLOCK ( Prefix pfx ) {
724 return toBool((pfx & PFX_LOCK) > 0);
727 /* Return True iff pfx has 66 set and F2 and F3 clear */
728 static Bool have66noF2noF3 ( Prefix pfx )
730 return
731 toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
734 /* Return True iff pfx has F2 set and 66 and F3 clear */
735 static Bool haveF2no66noF3 ( Prefix pfx )
737 return
738 toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
741 /* Return True iff pfx has F3 set and 66 and F2 clear */
742 static Bool haveF3no66noF2 ( Prefix pfx )
744 return
745 toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
748 /* Return True iff pfx has F3 set and F2 clear */
749 static Bool haveF3noF2 ( Prefix pfx )
751 return
752 toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
755 /* Return True iff pfx has F2 set and F3 clear */
756 static Bool haveF2noF3 ( Prefix pfx )
758 return
759 toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
762 /* Return True iff pfx has F2 and F3 clear */
763 static Bool haveNoF2noF3 ( Prefix pfx )
765 return
766 toBool((pfx & (PFX_F2|PFX_F3)) == 0);
769 /* Return True iff pfx has 66, F2 and F3 clear */
770 static Bool haveNo66noF2noF3 ( Prefix pfx )
772 return
773 toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
776 /* Return True iff pfx has any of 66, F2 and F3 set */
777 static Bool have66orF2orF3 ( Prefix pfx )
779 return toBool( ! haveNo66noF2noF3(pfx) );
782 /* Return True iff pfx has 66 or F3 set */
783 static Bool have66orF3 ( Prefix pfx )
785 return toBool((pfx & (PFX_66|PFX_F3)) > 0);
788 /* Clear all the segment-override bits in a prefix. */
789 static Prefix clearSegBits ( Prefix p )
791 return
792 p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
795 /* Get the (inverted, hence back to "normal") VEX.vvvv field. */
796 static UInt getVexNvvvv ( Prefix pfx ) {
797 UInt r = (UInt)pfx;
798 r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
799 return r & 0xF;
802 static Bool haveVEX ( Prefix pfx ) {
803 return toBool(pfx & PFX_VEX);
806 static Int getVexL ( Prefix pfx ) {
807 return (pfx & PFX_VEXL) ? 1 : 0;
811 /*------------------------------------------------------------*/
812 /*--- For dealing with escapes ---*/
813 /*------------------------------------------------------------*/
816 /* Escapes come after the prefixes, but before the primary opcode
817 byte. They escape the primary opcode byte into a bigger space.
818 The 0xF0000000 isn't significant, except so as to make it not
819 overlap valid Prefix values, for sanity checking.
822 typedef
823 enum {
824 ESC_NONE=0xF0000000, // none
825 ESC_0F, // 0F
826 ESC_0F38, // 0F 38
827 ESC_0F3A // 0F 3A
829 Escape;
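/* A minimal sketch (illustrative only, hypothetical helper name) of
   how the escape is chosen from the byte(s) following the prefixes:
   0F 38 and 0F 3A select the two-byte escapes, a lone 0F selects
   ESC_0F, and anything else is ESC_NONE. */
static Escape sketch_escape_for ( UChar b1, UChar b2 )
{
   if (b1 != 0x0F) return ESC_NONE;
   if (b2 == 0x38) return ESC_0F38;
   if (b2 == 0x3A) return ESC_0F3A;
   return ESC_0F;
}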
832 /*------------------------------------------------------------*/
833 /*--- For dealing with integer registers ---*/
834 /*------------------------------------------------------------*/
836 /* This is somewhat complex. The rules are:
838 For 64, 32 and 16 bit register references, the e or g fields in the
839 modrm bytes supply the low 3 bits of the register number. The
840 fourth (most-significant) bit of the register number is supplied by
841 the REX byte, if it is present; else that bit is taken to be zero.
843 The REX.R bit supplies the high bit corresponding to the g register
844 field, and the REX.B bit supplies the high bit corresponding to the
845 e register field (when the mod part of modrm indicates that modrm's
846 e component refers to a register and not to memory).
848 The REX.X bit supplies a high register bit for certain registers
849 in SIB address modes, and is generally rarely used.
851 For 8 bit register references, the presence of the REX byte itself
852 has significance. If there is no REX present, then the 3-bit
853 number extracted from the modrm e or g field is treated as an index
854 into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
855 old x86 encoding scheme.
857 But if there is a REX present, the register reference is
858 interpreted in the same way as for 64/32/16-bit references: a high
859 bit is extracted from REX, giving a 4-bit number, and the denoted
860 register is the lowest 8 bits of the 16 integer registers denoted
861 by the number. In particular, values 3 through 7 of this sequence
862 do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
863 %rsp %rbp %rsi %rdi.
865 The REX.W bit has no bearing at all on register numbers. Instead
866 its presence indicates that the operand size is to be overridden
867 from its default value (32 bits) to 64 bits instead. This is in
868 the same fashion that an 0x66 prefix indicates the operand size is
869 to be overridden from 32 bits down to 16 bits. When both REX.W and
870 0x66 are present there is a conflict, and REX.W takes precedence.
872 Rather than try to handle this complexity using a single huge
873 function, several smaller ones are provided. The aim is to make it
874 as difficult as possible to screw up register decoding in a subtle
875 and hard-to-track-down way.
877 Because these routines fish around in the host's memory (that is,
878 in the guest state area) for sub-parts of guest registers, their
879 correctness depends on the host's endianness. So far these
880 routines only work for little-endian hosts. Those for which
881 endianness is important have assertions to ensure sanity.
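/* A worked example (illustrative only, hypothetical helper name) of
   the 8-bit rule above: register number 5 with a one-byte access
   denotes %ch (bits 15:8 of %rcx) when no REX byte is present, but
   %bpl (bits 7:0 of %rbp) when any REX byte -- even a plain 0x40 --
   is present. */
static const HChar* sketch_name_reg5_byte ( Bool rex_present )
{
   return rex_present ? "%bpl" : "%ch";
}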
885 /* About the simplest question you can ask: where do the 64-bit
886 integer registers live (in the guest state) ? */
888 static Int integerGuestReg64Offset ( UInt reg )
890 switch (reg) {
891 case R_RAX: return OFFB_RAX;
892 case R_RCX: return OFFB_RCX;
893 case R_RDX: return OFFB_RDX;
894 case R_RBX: return OFFB_RBX;
895 case R_RSP: return OFFB_RSP;
896 case R_RBP: return OFFB_RBP;
897 case R_RSI: return OFFB_RSI;
898 case R_RDI: return OFFB_RDI;
899 case R_R8: return OFFB_R8;
900 case R_R9: return OFFB_R9;
901 case R_R10: return OFFB_R10;
902 case R_R11: return OFFB_R11;
903 case R_R12: return OFFB_R12;
904 case R_R13: return OFFB_R13;
905 case R_R14: return OFFB_R14;
906 case R_R15: return OFFB_R15;
907 default: vpanic("integerGuestReg64Offset(amd64)");
912 /* Produce the name of an integer register, for printing purposes.
913 reg is a number in the range 0 .. 15 that has been generated from a
914 3-bit reg-field number and a REX extension bit. irregular denotes
915 the case where sz==1 and no REX byte is present and where the denoted
916 sub-register is bits 15:8 of the containing 64-bit register. */
918 static
919 const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
921 static const HChar* ireg64_names[16]
922 = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
923 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
924 static const HChar* ireg32_names[16]
925 = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
926 "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
927 static const HChar* ireg16_names[16]
928 = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di",
929 "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
930 static const HChar* ireg8_names[16]
931 = { "%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil",
932 "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
933 static const HChar* ireg8_irregular[4]
934 = { "%ah", "%ch", "%dh", "%bh" };
936 vassert(reg < 16);
937 if (sz == 1) {
938 if (irregular)
939 vassert(reg >= 4 && reg < 8);
940 } else {
941 vassert(irregular == False);
944 switch (sz) {
945 case 8: return ireg64_names[reg];
946 case 4: return ireg32_names[reg];
947 case 2: return ireg16_names[reg];
948 case 1: if (irregular) {
949 vassert(reg >= 4 && reg < 8);
950 return ireg8_irregular[reg - 4];
951 } else {
952 return ireg8_names[reg];
954 default: vpanic("nameIReg(amd64)");
958 /* Using the same argument conventions as nameIReg, produce the
959 guest state offset of an integer register. */
961 static
962 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
964 vassert(reg < 16);
965 if (sz == 1) {
966 if (irregular)
967 vassert(reg >= 4 && reg < 8);
968 } else {
969 vassert(irregular == False);
972 /* Deal with irregular case -- sz==1 and no REX present */
973 if (sz == 1 && irregular) {
974 switch (reg) {
975 case R_RSP: return 1+ OFFB_RAX;
976 case R_RBP: return 1+ OFFB_RCX;
977 case R_RSI: return 1+ OFFB_RDX;
978 case R_RDI: return 1+ OFFB_RBX;
979 default: break; /* use the normal case */
983 /* Normal case */
984 return integerGuestReg64Offset(reg);
988 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
990 static IRExpr* getIRegCL ( void )
992 vassert(host_endness == VexEndnessLE);
993 return unop(Iop_64to8, IRExpr_Get( OFFB_RCX, Ity_I64 ));
997 /* Write to the %AH register. */
999 static void putIRegAH ( IRExpr* e )
1001 vassert(host_endness == VexEndnessLE);
1002 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
1003 stmt( IRStmt_Put( OFFB_RAX+1, e ) );
1007 /* Read/write various widths of %RAX, as it has various
1008 special-purpose uses. */
1010 static const HChar* nameIRegRAX ( Int sz )
1012 switch (sz) {
1013 case 1: return "%al";
1014 case 2: return "%ax";
1015 case 4: return "%eax";
1016 case 8: return "%rax";
1017 default: vpanic("nameIRegRAX(amd64)");
1021 static IRExpr* getIRegRAX ( Int sz )
1023 vassert(host_endness == VexEndnessLE);
1024 switch (sz) {
1025 case 1: return unop(Iop_64to8, IRExpr_Get( OFFB_RAX, Ity_I64 ));
1026 case 2: return unop(Iop_64to16, IRExpr_Get( OFFB_RAX, Ity_I64 ));
1027 case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
1028 case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
1029 default: vpanic("getIRegRAX(amd64)");
1033 static void putIRegRAX ( Int sz, IRExpr* e )
1035 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1036 vassert(host_endness == VexEndnessLE);
1037 switch (sz) {
1038 case 8: vassert(ty == Ity_I64);
1039 stmt( IRStmt_Put( OFFB_RAX, e ));
1040 break;
1041 case 4: vassert(ty == Ity_I32);
1042 stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
1043 break;
1044 case 2: vassert(ty == Ity_I16);
1045 stmt( IRStmt_Put( OFFB_RAX, e ));
1046 break;
1047 case 1: vassert(ty == Ity_I8);
1048 stmt( IRStmt_Put( OFFB_RAX, e ));
1049 break;
1050 default: vpanic("putIRegRAX(amd64)");
1055 /* Read/write various widths of %RDX, as it has various
1056 special-purpose uses. */
1058 static const HChar* nameIRegRDX ( Int sz )
1060 switch (sz) {
1061 case 1: return "%dl";
1062 case 2: return "%dx";
1063 case 4: return "%edx";
1064 case 8: return "%rdx";
1065 default: vpanic("nameIRegRDX(amd64)");
1069 static IRExpr* getIRegRDX ( Int sz )
1071 vassert(host_endness == VexEndnessLE);
1072 switch (sz) {
1073 case 1: return unop(Iop_64to8, IRExpr_Get( OFFB_RDX, Ity_I64 ));
1074 case 2: return unop(Iop_64to16, IRExpr_Get( OFFB_RDX, Ity_I64 ));
1075 case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
1076 case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
1077 default: vpanic("getIRegRDX(amd64)");
1081 static void putIRegRDX ( Int sz, IRExpr* e )
1083 vassert(host_endness == VexEndnessLE);
1084 vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
1085 switch (sz) {
1086 case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
1087 break;
1088 case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
1089 break;
1090 case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
1091 break;
1092 case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
1093 break;
1094 default: vpanic("putIRegRDX(amd64)");
1099 /* Simplistic functions to deal with the integer registers as a
1100 straightforward bank of 16 64-bit regs. */
1102 static IRExpr* getIReg64 ( UInt regno )
1104 return IRExpr_Get( integerGuestReg64Offset(regno),
1105 Ity_I64 );
1108 static void putIReg64 ( UInt regno, IRExpr* e )
1110 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
1111 stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
1114 static const HChar* nameIReg64 ( UInt regno )
1116 return nameIReg( 8, regno, False );
1120 /* Simplistic functions to deal with the lower halves of integer
1121 registers as a straightforward bank of 16 32-bit regs. */
1123 static IRExpr* getIReg32 ( UInt regno )
1125 vassert(host_endness == VexEndnessLE);
1126 return unop(Iop_64to32,
1127 IRExpr_Get( integerGuestReg64Offset(regno),
1128 Ity_I64 ));
1131 static void putIReg32 ( UInt regno, IRExpr* e )
1133 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
1134 stmt( IRStmt_Put( integerGuestReg64Offset(regno),
1135 unop(Iop_32Uto64,e) ) );
1138 static const HChar* nameIReg32 ( UInt regno )
1140 return nameIReg( 4, regno, False );
1144 /* Simplistic functions to deal with the lower quarters of integer
1145 registers as a straightforward bank of 16 16-bit regs. */
1147 static IRExpr* getIReg16 ( UInt regno )
1149 vassert(host_endness == VexEndnessLE);
1150 return unop(Iop_64to16,
1151 IRExpr_Get( integerGuestReg64Offset(regno),
1152 Ity_I64 ));
1155 static void putIReg16 ( UInt regno, IRExpr* e )
1157 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
1158 stmt( IRStmt_Put( integerGuestReg64Offset(regno),
1159 unop(Iop_16Uto64,e) ) );
1162 static const HChar* nameIReg16 ( UInt regno )
1164 return nameIReg( 2, regno, False );
1168 /* Sometimes what we know is a 3-bit register number, a REX byte, and
1169 which field of the REX byte is to be used to extend to a 4-bit
1170 number. These functions cater for that situation.
1172 static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
1174 vassert(lo3bits < 8);
1175 vassert(IS_VALID_PFX(pfx));
1176 return getIReg64( lo3bits | (getRexX(pfx) << 3) );
1179 static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
1181 vassert(lo3bits < 8);
1182 vassert(IS_VALID_PFX(pfx));
1183 return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
1186 static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
1188 vassert(lo3bits < 8);
1189 vassert(IS_VALID_PFX(pfx));
1190 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1191 UInt regNo = lo3bits | (getRexB(pfx) << 3);
1192 return nameIReg( sz, regNo,
1193 toBool(sz==1 && !haveREX(pfx) && regNo >= 4 && regNo < 8));
1196 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
1198 vassert(lo3bits < 8);
1199 vassert(IS_VALID_PFX(pfx));
1200 UInt regNo = (getRexB(pfx) << 3) | lo3bits;
1201 switch (sz) {
1202 case 8: {
1203 return IRExpr_Get(
1204 offsetIReg( 8, regNo, False/*!irregular*/ ),
1205 Ity_I64
1208 case 4: {
1209 return unop(Iop_64to32,
1210 IRExpr_Get(
1211 offsetIReg( 8, regNo, False/*!irregular*/ ),
1212 Ity_I64
1215 case 2: {
1216 return unop(Iop_64to16,
1217 IRExpr_Get(
1218 offsetIReg( 8, regNo, False/*!irregular*/ ),
1219 Ity_I64
1222 case 1: {
1223 Bool irregular = !haveREX(pfx) && regNo >= 4 && regNo < 8;
1224 if (irregular) {
1225 return IRExpr_Get(
1226 offsetIReg( 1, regNo, True/*irregular*/ ),
1227 Ity_I8
1229 } else {
1230 return unop(Iop_64to8,
1231 IRExpr_Get(
1232 offsetIReg( 8, regNo, False/*!irregular*/ ),
1233 Ity_I64
1237 default: {
1238 vpanic("getIRegRexB");
1243 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
1245 vassert(lo3bits < 8);
1246 vassert(IS_VALID_PFX(pfx));
1247 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1248 vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
1249 Bool irregular = sz == 1 && !haveREX(pfx) && lo3bits >= 4 && lo3bits < 8;
1250 stmt( IRStmt_Put(
1251 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3), irregular ),
1252 sz==4 ? unop(Iop_32Uto64,e) : e
1257 /* Functions for getting register numbers from modrm bytes and REX
1258 when we don't have to consider the complexities of integer subreg
1259 accesses.
1261 /* Extract the g reg field from a modRM byte, and augment it using the
1262 REX.R bit from the supplied REX byte. The R bit usually is
1263 associated with the g register field.
1265 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
1267 Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
1268 reg += (pfx & PFX_REXR) ? 8 : 0;
1269 return reg;
1272 /* Extract the e reg field from a modRM byte, and augment it using the
1273 REX.B bit from the supplied REX byte. The B bit usually is
1274 associated with the e register field (when modrm indicates e is a
1275 register, that is).
1277 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
1279 Int rm;
1280 vassert(epartIsReg(mod_reg_rm));
1281 rm = (Int)(mod_reg_rm & 0x7);
1282 rm += (pfx & PFX_REXB) ? 8 : 0;
1283 return rm;
1287 /* General functions for dealing with integer register access. */
1289 /* Produce the guest state offset for a reference to the 'g' register
1290 field in a modrm byte, taking into account REX (or its absence),
1291 and the size of the access.
1293 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
1295 UInt reg;
1296 vassert(host_endness == VexEndnessLE);
1297 vassert(IS_VALID_PFX(pfx));
1298 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1299 reg = gregOfRexRM( pfx, mod_reg_rm );
1300 Bool irregular = sz == 1 && !haveREX(pfx) && reg >= 4 && reg < 8;
1301 return offsetIReg( sz, reg, irregular );
1304 static
1305 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
1307 switch (sz) {
1308 case 8: {
1309 return IRExpr_Get( offsetIRegG( 8, pfx, mod_reg_rm ), Ity_I64 );
1311 case 4: {
1312 return unop(Iop_64to32,
1313 IRExpr_Get( offsetIRegG( 8, pfx, mod_reg_rm ), Ity_I64 ));
1315 case 2: {
1316 return unop(Iop_64to16,
1317 IRExpr_Get( offsetIRegG( 8, pfx, mod_reg_rm ), Ity_I64 ));
1319 case 1: {
1320 UInt regNo = gregOfRexRM( pfx, mod_reg_rm );
1321 Bool irregular = !haveREX(pfx) && regNo >= 4 && regNo < 8;
1322 if (irregular) {
1323 return IRExpr_Get( offsetIRegG( 1, pfx, mod_reg_rm ), Ity_I8 );
1324 } else {
1325 return unop(Iop_64to8,
1326 IRExpr_Get( offsetIRegG( 8, pfx, mod_reg_rm ),
1327 Ity_I64 ));
1330 default: {
1331 vpanic("getIRegG");
1336 static
1337 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
1339 vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
1340 if (sz == 4) {
1341 e = unop(Iop_32Uto64,e);
1343 stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
1346 static
1347 const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
1349 UInt regNo = gregOfRexRM( pfx, mod_reg_rm );
1350 Bool irregular = sz == 1 && !haveREX(pfx) && regNo >= 4 && regNo < 8;
1351 return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm), irregular );
1355 static
1356 IRExpr* getIRegV ( Int sz, Prefix pfx )
1358    vassert(sz == 8 || sz == 4 || sz == 2);
1359 if (sz == 4) {
1360 return unop(Iop_64to32,
1361 IRExpr_Get( offsetIReg( 8, getVexNvvvv(pfx), False ),
1362 Ity_I64 ));
1363 } else if (sz == 2) {
1364 return unop(Iop_64to16,
1365 IRExpr_Get( offsetIReg( 8, getVexNvvvv(pfx), False ),
1366 Ity_I64 ));
1367 } else {
1368 return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
1369 szToITy(sz) );
1373 static
1374 void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
1376 vassert(sz == 8 || sz == 4);
1377 vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
1378 if (sz == 4) {
1379 e = unop(Iop_32Uto64,e);
1381 stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
1384 static
1385 const HChar* nameIRegV ( Int sz, Prefix pfx )
1387 vassert(sz == 8 || sz == 4);
1388 return nameIReg( sz, getVexNvvvv(pfx), False );
1393 /* Produce the guest state offset for a reference to the 'e' register
1394 field in a modrm byte, taking into account REX (or its absence),
1395 and the size of the access. eregOfRexRM will assert if mod_reg_rm
1396 denotes a memory access rather than a register access.
1398 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
1400 UInt reg;
1401 vassert(host_endness == VexEndnessLE);
1402 vassert(IS_VALID_PFX(pfx));
1403 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1404 reg = eregOfRexRM( pfx, mod_reg_rm );
1405 Bool irregular = sz == 1 && !haveREX(pfx) && (reg >= 4 && reg < 8);
1406 return offsetIReg( sz, reg, irregular );
1409 static
1410 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
1412 switch (sz) {
1413 case 8: {
1414 return IRExpr_Get( offsetIRegE( 8, pfx, mod_reg_rm ), Ity_I64 );
1416 case 4: {
1417 return unop(Iop_64to32,
1418 IRExpr_Get( offsetIRegE( 8, pfx, mod_reg_rm ), Ity_I64 ));
1420 case 2: {
1421 return unop(Iop_64to16,
1422 IRExpr_Get( offsetIRegE( 8, pfx, mod_reg_rm ), Ity_I64 ));
1424 case 1: {
1425 UInt regNo = eregOfRexRM( pfx, mod_reg_rm );
1426 Bool irregular = !haveREX(pfx) && regNo >= 4 && regNo < 8;
1427 if (irregular) {
1428 return IRExpr_Get( offsetIRegE( 1, pfx, mod_reg_rm ), Ity_I8 );
1429 } else {
1430 return unop(Iop_64to8,
1431 IRExpr_Get( offsetIRegE( 8, pfx, mod_reg_rm ),
1432 Ity_I64 ));
1435 default: {
1436 vpanic("getIRegE");
1441 static
1442 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
1444 vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
1445 if (sz == 4) {
1446 e = unop(Iop_32Uto64,e);
1448 stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
1451 static
1452 const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
1454 UInt regNo = eregOfRexRM( pfx, mod_reg_rm );
1455 Bool irregular = sz == 1 && !haveREX(pfx) && regNo >= 4 && regNo < 8;
1456 return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm), irregular );
1460 /*------------------------------------------------------------*/
1461 /*--- For dealing with XMM registers ---*/
1462 /*------------------------------------------------------------*/
1464 static Int ymmGuestRegOffset ( UInt ymmreg )
1466 switch (ymmreg) {
1467 case 0: return OFFB_YMM0;
1468 case 1: return OFFB_YMM1;
1469 case 2: return OFFB_YMM2;
1470 case 3: return OFFB_YMM3;
1471 case 4: return OFFB_YMM4;
1472 case 5: return OFFB_YMM5;
1473 case 6: return OFFB_YMM6;
1474 case 7: return OFFB_YMM7;
1475 case 8: return OFFB_YMM8;
1476 case 9: return OFFB_YMM9;
1477 case 10: return OFFB_YMM10;
1478 case 11: return OFFB_YMM11;
1479 case 12: return OFFB_YMM12;
1480 case 13: return OFFB_YMM13;
1481 case 14: return OFFB_YMM14;
1482 case 15: return OFFB_YMM15;
1483 default: vpanic("ymmGuestRegOffset(amd64)");
1487 static Int xmmGuestRegOffset ( UInt xmmreg )
1489 /* Correct for little-endian host only. */
1490 vassert(host_endness == VexEndnessLE);
1491 return ymmGuestRegOffset( xmmreg );
1494 /* Lanes of vector registers are always numbered from zero, with zero being the
1495 least significant lane (rightmost in the register). */
1497 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
1499 /* Correct for little-endian host only. */
1500 vassert(host_endness == VexEndnessLE);
1501 vassert(laneno >= 0 && laneno < 8);
1502 return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
1505 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
1507 /* Correct for little-endian host only. */
1508 vassert(host_endness == VexEndnessLE);
1509 vassert(laneno >= 0 && laneno < 4);
1510 return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
1513 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
1515 /* Correct for little-endian host only. */
1516 vassert(host_endness == VexEndnessLE);
1517 vassert(laneno >= 0 && laneno < 2);
1518 return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
1521 static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
1523 /* Correct for little-endian host only. */
1524 vassert(host_endness == VexEndnessLE);
1525 vassert(laneno >= 0 && laneno < 2);
1526 return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
1529 static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
1531 /* Correct for little-endian host only. */
1532 vassert(host_endness == VexEndnessLE);
1533 vassert(laneno >= 0 && laneno < 4);
1534 return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
1537 static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
1539 /* Correct for little-endian host only. */
1540 vassert(host_endness == VexEndnessLE);
1541 vassert(laneno >= 0 && laneno < 8);
1542 return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
1545 static IRExpr* getXMMReg ( UInt xmmreg )
1547 return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
1550 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
1552 return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
1555 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
1557 return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
1560 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
1562 return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
1565 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
1567 return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
1570 static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
1572 return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
1575 static void putXMMReg ( UInt xmmreg, IRExpr* e )
1577 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
1578 stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
1581 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
1583 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
1584 stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
1587 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
1589 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
1590 stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
1593 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
1595 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
1596 stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
1599 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
1601 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
1602 stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
1605 static IRExpr* getYMMReg ( UInt xmmreg )
1607 return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
1610 static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
1612 return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
1615 static IRExpr* getYMMRegLane64F ( UInt ymmreg, Int laneno )
1617 return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_F64 );
1620 static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
1622 return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
1625 static IRExpr* getYMMRegLane32F ( UInt ymmreg, Int laneno )
1627 return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_F32 );
1630 static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
1632 return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
1635 static void putYMMReg ( UInt ymmreg, IRExpr* e )
1637 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
1638 stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
1641 static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
1643 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
1644 stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
1647 static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
1649 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
1650 stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
1653 static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
1655 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
1656 stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
1659 static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
1661 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
1662 stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
1665 static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
1667 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
1668 stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
1671 static IRExpr* mkV128 ( UShort mask )
1673 return IRExpr_Const(IRConst_V128(mask));
1676 /* Write the low half of a YMM reg and zero out the upper half. */
1677 static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
1679 putYMMRegLane128( ymmreg, 0, e );
1680 putYMMRegLane128( ymmreg, 1, mkV128(0) );
1683 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
1685 vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
1686 vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
1687 return unop(Iop_64to1,
1688 binop(Iop_And64,
1689 unop(Iop_1Uto64,x),
1690 unop(Iop_1Uto64,y)));
1693 /* Generate a compare-and-swap operation, operating on memory at
1694 'addr'. The expected value is 'expVal' and the new value is
1695 'newVal'. If the operation fails, then transfer control (with a
1696 no-redir jump (XXX no -- see comment at top of this file)) to
1697 'restart_point', which is presumably the address of the guest
1698 instruction again -- retrying, essentially. */
1699 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
1700 Addr64 restart_point )
1702 IRCAS* cas;
1703 IRType tyE = typeOfIRExpr(irsb->tyenv, expVal);
1704 IRType tyN = typeOfIRExpr(irsb->tyenv, newVal);
1705 IRTemp oldTmp = newTemp(tyE);
1706 IRTemp expTmp = newTemp(tyE);
1707 vassert(tyE == tyN);
1708 vassert(tyE == Ity_I64 || tyE == Ity_I32
1709 || tyE == Ity_I16 || tyE == Ity_I8);
1710 assign(expTmp, expVal);
1711 cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
1712 NULL, mkexpr(expTmp), NULL, newVal );
1713 stmt( IRStmt_CAS(cas) );
1714 stmt( IRStmt_Exit(
1715 binop( mkSizedOp(tyE,Iop_CasCmpNE8),
1716 mkexpr(oldTmp), mkexpr(expTmp) ),
1717 Ijk_Boring, /*Ijk_NoRedir*/
1718 IRConst_U64( restart_point ),
1719 OFFB_RIP
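/* A minimal usage sketch (illustrative only, hypothetical helper
   name): how a LOCK-prefixed 64-bit ADD could use casLE -- read the
   old value, compute the new one, and let casLE either commit it
   atomically or side-exit back to 'restart' to retry. */
static void sketch_lock_add64 ( IRExpr* addr, IRExpr* src,
                                Addr64 restart )
{
   IRTemp oldv = newTemp(Ity_I64);
   IRTemp newv = newTemp(Ity_I64);
   assign( oldv, loadLE(Ity_I64, addr) );
   assign( newv, binop(Iop_Add64, mkexpr(oldv), src) );
   casLE( addr, mkexpr(oldv), mkexpr(newv), restart );
   /* a real handler would also update the rflags thunk here */
}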
1724 /*------------------------------------------------------------*/
1725 /*--- Helpers for %rflags. ---*/
1726 /*------------------------------------------------------------*/
1728 /* -------------- Evaluating the flags-thunk. -------------- */
1730 /* Build IR to calculate all the eflags from stored
1731 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1732 Ity_I64. */
1733 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
1735 IRExpr** args
1736 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1737 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1738 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1739 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1740 IRExpr* call
1741 = mkIRExprCCall(
1742 Ity_I64,
1743 0/*regparm*/,
1744 "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
1745 args
1747 /* Exclude OP and NDEP from definedness checking. We're only
1748 interested in DEP1 and DEP2. */
1749 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1750 return call;
1753 /* Build IR to calculate some particular condition from stored
1754 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1755 Ity_Bit. */
1756 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
1758 IRExpr** args
1759 = mkIRExprVec_5( mkU64(cond),
1760 IRExpr_Get(OFFB_CC_OP, Ity_I64),
1761 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1762 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1763 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1764 IRExpr* call
1765 = mkIRExprCCall(
1766 Ity_I64,
1767 0/*regparm*/,
1768 "amd64g_calculate_condition", &amd64g_calculate_condition,
1769 args
1771 /* Exclude the requested condition, OP and NDEP from definedness
1772 checking. We're only interested in DEP1 and DEP2. */
1773 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
1774 return unop(Iop_64to1, call);
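/* A minimal usage sketch (illustrative only, hypothetical helper
   name): a conditional-branch handler can test any condition straight
   from the thunk and emit a guarded side-exit to 'dst'. */
static void sketch_cond_exit ( AMD64Condcode cond, Addr64 dst )
{
   stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                      Ijk_Boring,
                      IRConst_U64(dst),
                      OFFB_RIP ) );
}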
1777 /* Build IR to calculate just the carry flag from stored
1778 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression :: Ity_I64. */
1779 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
1781 IRExpr** args
1782 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1783 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1784 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1785 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1786 IRExpr* call
1787 = mkIRExprCCall(
1788 Ity_I64,
1789 0/*regparm*/,
1790 "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
1791 args
1793 /* Exclude OP and NDEP from definedness checking. We're only
1794 interested in DEP1 and DEP2. */
1795 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1796 return call;
1800 /* -------------- Building the flags-thunk. -------------- */
1802 /* The machinery in this section builds the flag-thunk following a
1803 flag-setting operation. Hence the various setFlags_* functions.
1806 static Bool isAddSub ( IROp op8 )
1808 return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
1811 static Bool isLogic ( IROp op8 )
1813 return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
1816 /* U-widen 1/8/16/32/64 bit int expr to 64. */
1817 static IRExpr* widenUto64 ( IRExpr* e )
1819 switch (typeOfIRExpr(irsb->tyenv,e)) {
1820 case Ity_I64: return e;
1821 case Ity_I32: return unop(Iop_32Uto64, e);
1822 case Ity_I16: return unop(Iop_16Uto64, e);
1823 case Ity_I8: return unop(Iop_8Uto64, e);
1824 case Ity_I1: return unop(Iop_1Uto64, e);
1825 default: vpanic("widenUto64");
1829 /* S-widen 8/16/32/64 bit int expr to 64. */
1830 static IRExpr* widenSto64 ( IRExpr* e )
1832 switch (typeOfIRExpr(irsb->tyenv,e)) {
1833 case Ity_I64: return e;
1834 case Ity_I32: return unop(Iop_32Sto64, e);
1835 case Ity_I16: return unop(Iop_16Sto64, e);
1836 case Ity_I8: return unop(Iop_8Sto64, e);
1837 default: vpanic("widenSto64");
1841 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64. Clearly only some
1842 of these combinations make sense. */
1843 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
1845 IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
1846 if (src_ty == dst_ty)
1847 return e;
1848 if (src_ty == Ity_I32 && dst_ty == Ity_I16)
1849 return unop(Iop_32to16, e);
1850 if (src_ty == Ity_I32 && dst_ty == Ity_I8)
1851 return unop(Iop_32to8, e);
1852 if (src_ty == Ity_I64 && dst_ty == Ity_I32)
1853 return unop(Iop_64to32, e);
1854 if (src_ty == Ity_I64 && dst_ty == Ity_I16)
1855 return unop(Iop_64to16, e);
1856 if (src_ty == Ity_I64 && dst_ty == Ity_I8)
1857 return unop(Iop_64to8, e);
1859 vex_printf("\nsrc, dst tys are: ");
1860 ppIRType(src_ty);
1861 vex_printf(", ");
1862 ppIRType(dst_ty);
1863 vex_printf("\n");
1864 vpanic("narrowTo(amd64)");
1868 /* Set the flags thunk OP, DEP1 and DEP2 fields. The supplied op is
1869 auto-sized up to the real op. */
1871 static
1872 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
1874 Int ccOp = 0;
1875 switch (ty) {
1876 case Ity_I8: ccOp = 0; break;
1877 case Ity_I16: ccOp = 1; break;
1878 case Ity_I32: ccOp = 2; break;
1879 case Ity_I64: ccOp = 3; break;
1880 default: vassert(0);
1882 switch (op8) {
1883 case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB; break;
1884 case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB; break;
1885 default: ppIROp(op8);
1886 vpanic("setFlags_DEP1_DEP2(amd64)");
1888 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
1889 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1890 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
1891 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
1895 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
1897 static
1898 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
1900 Int ccOp = 0;
1901 switch (ty) {
1902 case Ity_I8: ccOp = 0; break;
1903 case Ity_I16: ccOp = 1; break;
1904 case Ity_I32: ccOp = 2; break;
1905 case Ity_I64: ccOp = 3; break;
1906 default: vassert(0);
1908 switch (op8) {
1909 case Iop_Or8:
1910 case Iop_And8:
1911 case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
1912 default: ppIROp(op8);
1913 vpanic("setFlags_DEP1(amd64)");
1915 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
1916 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1917 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1918 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
1922 /* For shift operations, we put in both the result and the undershifted
1923 result.  However, if the shift amount is zero, the thunk is left
1924 unchanged. */
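/* Concretely: DEP1 ends up holding the shifted result and DEP2 the
   "undershifted" result, i.e. the operand shifted by one place fewer.
   The flag helpers can then read the last bit shifted out -- the new
   carry -- off DEP2 without needing the shift amount itself. */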
1926 static void setFlags_DEP1_DEP2_shift ( IROp op64,
1927 IRTemp res,
1928 IRTemp resUS,
1929 IRType ty,
1930 IRTemp guard )
1932 Int ccOp = 0;
1933 switch (ty) {
1934 case Ity_I8: ccOp = 0; break;
1935 case Ity_I16: ccOp = 1; break;
1936 case Ity_I32: ccOp = 2; break;
1937 case Ity_I64: ccOp = 3; break;
1938 default: vassert(0);
1941 vassert(guard);
1943 /* Both kinds of right shifts are handled by the same thunk
1944 operation. */
1945 switch (op64) {
1946 case Iop_Shr64:
1947 case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
1948 case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
1949 default: ppIROp(op64);
1950 vpanic("setFlags_DEP1_DEP2_shift(amd64)");
1953 /* guard :: Ity_I8. We need to convert it to I1. */
1954 IRTemp guardB = newTemp(Ity_I1);
1955 assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
1957 /* DEP1 contains the result, DEP2 contains the undershifted value. */
1958 stmt( IRStmt_Put( OFFB_CC_OP,
1959 IRExpr_ITE( mkexpr(guardB),
1960 mkU64(ccOp),
1961 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
1962 stmt( IRStmt_Put( OFFB_CC_DEP1,
1963 IRExpr_ITE( mkexpr(guardB),
1964 widenUto64(mkexpr(res)),
1965 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
1966 stmt( IRStmt_Put( OFFB_CC_DEP2,
1967 IRExpr_ITE( mkexpr(guardB),
1968 widenUto64(mkexpr(resUS)),
1969 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
1970 stmt( IRStmt_Put( OFFB_CC_NDEP,
1971 mkU64(0) ));
1975 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
1976 the former value of the carry flag, which unfortunately we have to
1977 compute. */
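/* Rationale: INC and DEC leave the carry flag unchanged, so the
   pre-existing C has to be computed up front and parked in NDEP;
   the flag helpers merge it back in whenever the flags are next
   read. */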
1979 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1981 Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
1983 switch (ty) {
1984 case Ity_I8: ccOp += 0; break;
1985 case Ity_I16: ccOp += 1; break;
1986 case Ity_I32: ccOp += 2; break;
1987 case Ity_I64: ccOp += 3; break;
1988 default: vassert(0);
1991 /* This has to come first, because calculating the C flag
1992 may require reading all four thunk fields. */
1993 stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
1994 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
1995 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
1996 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
2000 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
2001 two arguments. */
2003 static
2004 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
2006 switch (ty) {
2007 case Ity_I8:
2008 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
2009 break;
2010 case Ity_I16:
2011 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
2012 break;
2013 case Ity_I32:
2014 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
2015 break;
2016 case Ity_I64:
2017 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
2018 break;
2019 default:
2020 vpanic("setFlags_MUL(amd64)");
2022 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
2023 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
2024 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
2028 /* -------------- Condition codes. -------------- */
2030 /* Condition codes, using the AMD encoding. */
2032 static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
2034 switch (cond) {
2035 case AMD64CondO: return "o";
2036 case AMD64CondNO: return "no";
2037 case AMD64CondB: return "b";
2038 case AMD64CondNB: return "ae"; /*"nb";*/
2039 case AMD64CondZ: return "e"; /*"z";*/
2040 case AMD64CondNZ: return "ne"; /*"nz";*/
2041 case AMD64CondBE: return "be";
2042 case AMD64CondNBE: return "a"; /*"nbe";*/
2043 case AMD64CondS: return "s";
2044 case AMD64CondNS: return "ns";
2045 case AMD64CondP: return "p";
2046 case AMD64CondNP: return "np";
2047 case AMD64CondL: return "l";
2048 case AMD64CondNL: return "ge"; /*"nl";*/
2049 case AMD64CondLE: return "le";
2050 case AMD64CondNLE: return "g"; /*"nle";*/
2051 case AMD64CondAlways: return "ALWAYS";
2052 default: vpanic("name_AMD64Condcode");
2056 static
2057 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode cond,
2058 /*OUT*/Bool* needInvert )
2060 vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
2061 if (cond & 1) {
2062 *needInvert = True;
2063 return cond-1;
2064 } else {
2065 *needInvert = False;
2066 return cond;
2071 /* -------------- Helpers for ADD/SUB with carry. -------------- */
2073 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
2074 appropriately.
2076 Optionally, generate a store for the 'tres' value. This can either
2077 be a normal store, or it can be a cas-with-possible-failure style
2078 store:
2080 if taddr is IRTemp_INVALID, then no store is generated.
2082 if taddr is not IRTemp_INVALID, then a store (using taddr as
2083 the address) is generated:
2085 if texpVal is IRTemp_INVALID then a normal store is
2086 generated, and restart_point must be zero (it is irrelevant).
2088 if texpVal is not IRTemp_INVALID then a cas-style store is
2089 generated. texpVal is the expected value, restart_point
2090 is the restart point if the store fails, and texpVal must
2091 have the same type as tres.
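/* In outline: tres = ta1 + ta2 + oldC, where oldC is the current
   carry bit.  The thunk records DEP1 = ta1, DEP2 = ta2 ^ oldC and
   NDEP = oldC, which is enough for the ADC flag helpers to
   reconstruct both operands and the carry-in afterwards. */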
2094 static void helper_ADC ( Int sz,
2095 IRTemp tres, IRTemp ta1, IRTemp ta2,
2096 /* info about optional store: */
2097 IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
2099 UInt thunkOp;
2100 IRType ty = szToITy(sz);
2101 IRTemp oldc = newTemp(Ity_I64);
2102 IRTemp oldcn = newTemp(ty);
2103 IROp plus = mkSizedOp(ty, Iop_Add8);
2104 IROp xor = mkSizedOp(ty, Iop_Xor8);
2106 vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2108 switch (sz) {
2109 case 8: thunkOp = AMD64G_CC_OP_ADCQ; break;
2110 case 4: thunkOp = AMD64G_CC_OP_ADCL; break;
2111 case 2: thunkOp = AMD64G_CC_OP_ADCW; break;
2112 case 1: thunkOp = AMD64G_CC_OP_ADCB; break;
2113 default: vassert(0);
2116 /* oldc = old carry flag, 0 or 1 */
2117 assign( oldc, binop(Iop_And64,
2118 mk_amd64g_calculate_rflags_c(),
2119 mkU64(1)) );
2121 assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2123 assign( tres, binop(plus,
2124 binop(plus,mkexpr(ta1),mkexpr(ta2)),
2125 mkexpr(oldcn)) );
2127 /* Possibly generate a store of 'tres' to 'taddr'. See comment at
2128 start of this function. */
2129 if (taddr != IRTemp_INVALID) {
2130 if (texpVal == IRTemp_INVALID) {
2131 vassert(restart_point == 0);
2132 storeLE( mkexpr(taddr), mkexpr(tres) );
2133 } else {
2134 vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2135 /* .. and hence 'texpVal' has the same type as 'tres'. */
2136 casLE( mkexpr(taddr),
2137 mkexpr(texpVal), mkexpr(tres), restart_point );
2141 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(thunkOp) ) );
2142 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1)) ));
2143 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2144 mkexpr(oldcn)) )) );
2145 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2149 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
2150 appropriately. As with helper_ADC, possibly generate a store of
2151 the result -- see comments on helper_ADC for details.
2153 static void helper_SBB ( Int sz,
2154 IRTemp tres, IRTemp ta1, IRTemp ta2,
2155 /* info about optional store: */
2156 IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
2158 UInt thunkOp;
2159 IRType ty = szToITy(sz);
2160 IRTemp oldc = newTemp(Ity_I64);
2161 IRTemp oldcn = newTemp(ty);
2162 IROp minus = mkSizedOp(ty, Iop_Sub8);
2163 IROp xor = mkSizedOp(ty, Iop_Xor8);
2165 vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2167 switch (sz) {
2168 case 8: thunkOp = AMD64G_CC_OP_SBBQ; break;
2169 case 4: thunkOp = AMD64G_CC_OP_SBBL; break;
2170 case 2: thunkOp = AMD64G_CC_OP_SBBW; break;
2171 case 1: thunkOp = AMD64G_CC_OP_SBBB; break;
2172 default: vassert(0);
2175 /* oldc = old carry flag, 0 or 1 */
2176 assign( oldc, binop(Iop_And64,
2177 mk_amd64g_calculate_rflags_c(),
2178 mkU64(1)) );
2180 assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2182 assign( tres, binop(minus,
2183 binop(minus,mkexpr(ta1),mkexpr(ta2)),
2184 mkexpr(oldcn)) );
2186 /* Possibly generate a store of 'tres' to 'taddr'. See comment at
2187 start of this function. */
2188 if (taddr != IRTemp_INVALID) {
2189 if (texpVal == IRTemp_INVALID) {
2190 vassert(restart_point == 0);
2191 storeLE( mkexpr(taddr), mkexpr(tres) );
2192 } else {
2193 vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2194 /* .. and hence 'texpVal' has the same type as 'tres'. */
2195 casLE( mkexpr(taddr),
2196 mkexpr(texpVal), mkexpr(tres), restart_point );
2200 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(thunkOp) ) );
2201 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
2202 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2203 mkexpr(oldcn)) )) );
2204 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2208 /* Given ta1, ta2 and tres, compute tres = ADCX(ta1,ta2) or tres = ADOX(ta1,ta2)
2209 and set flags appropriately.
2211 static void helper_ADCX_ADOX ( Bool isADCX, Int sz,
2212 IRTemp tres, IRTemp ta1, IRTemp ta2 )
2214 UInt thunkOp;
2215 IRType ty = szToITy(sz);
2216 IRTemp oldflags = newTemp(Ity_I64);
2217 IRTemp oldOC = newTemp(Ity_I64); // old O or C flag
2218 IRTemp oldOCn = newTemp(ty); // old O or C flag, narrowed
2219 IROp plus = mkSizedOp(ty, Iop_Add8);
2220 IROp xor = mkSizedOp(ty, Iop_Xor8);
2222 vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2224 switch (sz) {
2225 case 8: thunkOp = isADCX ? AMD64G_CC_OP_ADCX64
2226 : AMD64G_CC_OP_ADOX64; break;
2227 case 4: thunkOp = isADCX ? AMD64G_CC_OP_ADCX32
2228 : AMD64G_CC_OP_ADOX32; break;
2229 default: vassert(0);
2232 assign( oldflags, mk_amd64g_calculate_rflags_all() );
2234 /* oldOC = old overflow/carry flag, 0 or 1 */
2235 assign( oldOC, binop(Iop_And64,
2236 binop(Iop_Shr64,
2237 mkexpr(oldflags),
2238 mkU8(isADCX ? AMD64G_CC_SHIFT_C
2239 : AMD64G_CC_SHIFT_O)),
2240 mkU64(1)) );
2242 assign( oldOCn, narrowTo(ty, mkexpr(oldOC)) );
2244 assign( tres, binop(plus,
2245 binop(plus,mkexpr(ta1),mkexpr(ta2)),
2246 mkexpr(oldOCn)) );
2248 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(thunkOp) ) );
2249 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1)) ));
2250 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2251 mkexpr(oldOCn)) )) );
2252 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldflags) ) );
2256 /* -------------- Helpers for disassembly printing. -------------- */
2258 static const HChar* nameGrp1 ( Int opc_aux )
2260 static const HChar* grp1_names[8]
2261 = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
2262 if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
2263 return grp1_names[opc_aux];
2266 static const HChar* nameGrp2 ( Int opc_aux )
2268 static const HChar* grp2_names[8]
2269 = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
2270 if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
2271 return grp2_names[opc_aux];
2274 static const HChar* nameGrp4 ( Int opc_aux )
2276 static const HChar* grp4_names[8]
2277 = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
2278 if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
2279 return grp4_names[opc_aux];
2282 static const HChar* nameGrp5 ( Int opc_aux )
2284 static const HChar* grp5_names[8]
2285 = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
2286 if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
2287 return grp5_names[opc_aux];
2290 static const HChar* nameGrp8 ( Int opc_aux )
2292 static const HChar* grp8_names[8]
2293 = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
2294 if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
2295 return grp8_names[opc_aux];
2298 static const HChar* nameSReg ( UInt sreg )
2300 switch (sreg) {
2301 case R_ES: return "%es";
2302 case R_CS: return "%cs";
2303 case R_SS: return "%ss";
2304 case R_DS: return "%ds";
2305 case R_FS: return "%fs";
2306 case R_GS: return "%gs";
2307 default: vpanic("nameSReg(amd64)");
2311 static const HChar* nameMMXReg ( Int mmxreg )
2313 static const HChar* mmx_names[8]
2314 = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
2315 if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
2316 return mmx_names[mmxreg];
2319 static const HChar* nameXMMReg ( Int xmmreg )
2321 static const HChar* xmm_names[16]
2322 = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
2323 "%xmm4", "%xmm5", "%xmm6", "%xmm7",
2324 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
2325 "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
2326 if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
2327 return xmm_names[xmmreg];
2330 static const HChar* nameMMXGran ( Int gran )
2332 switch (gran) {
2333 case 0: return "b";
2334 case 1: return "w";
2335 case 2: return "d";
2336 case 3: return "q";
2337 default: vpanic("nameMMXGran(amd64,guest)");
2341 static HChar nameISize ( Int size )
2343 switch (size) {
2344 case 8: return 'q';
2345 case 4: return 'l';
2346 case 2: return 'w';
2347 case 1: return 'b';
2348 default: vpanic("nameISize(amd64)");
2352 static const HChar* nameYMMReg ( Int ymmreg )
2354 static const HChar* ymm_names[16]
2355 = { "%ymm0", "%ymm1", "%ymm2", "%ymm3",
2356 "%ymm4", "%ymm5", "%ymm6", "%ymm7",
2357 "%ymm8", "%ymm9", "%ymm10", "%ymm11",
2358 "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
2359 if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
2360 return ymm_names[ymmreg];
2364 /*------------------------------------------------------------*/
2365 /*--- JMP helpers ---*/
2366 /*------------------------------------------------------------*/
2368 static void jmp_lit( /*MOD*/DisResult* dres,
2369 IRJumpKind kind, Addr64 d64 )
2371 vassert(dres->whatNext == Dis_Continue);
2372 vassert(dres->len == 0);
2373 vassert(dres->jk_StopHere == Ijk_INVALID);
2374 dres->whatNext = Dis_StopHere;
2375 dres->jk_StopHere = kind;
2376 stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
2379 static void jmp_treg( /*MOD*/DisResult* dres,
2380 IRJumpKind kind, IRTemp t )
2382 vassert(dres->whatNext == Dis_Continue);
2383 vassert(dres->len == 0);
2384 vassert(dres->jk_StopHere == Ijk_INVALID);
2385 dres->whatNext = Dis_StopHere;
2386 dres->jk_StopHere = kind;
2387 stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
2390 static
2391 void jcc_01 ( /*MOD*/DisResult* dres,
2392 AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
2394 Bool invert;
2395 AMD64Condcode condPos;
2396 vassert(dres->whatNext == Dis_Continue);
2397 vassert(dres->len == 0);
2398 vassert(dres->jk_StopHere == Ijk_INVALID);
2399 dres->whatNext = Dis_StopHere;
2400 dres->jk_StopHere = Ijk_Boring;
2401 condPos = positiveIse_AMD64Condcode ( cond, &invert );
2402 if (invert) {
2403 stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2404 Ijk_Boring,
2405 IRConst_U64(d64_false),
2406 OFFB_RIP ) );
2407 stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
2408 } else {
2409 stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2410 Ijk_Boring,
2411 IRConst_U64(d64_true),
2412 OFFB_RIP ) );
2413 stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
2417 /* Let new_rsp be the %rsp value after a call/return. Let nia be the
2418 guest address of the next instruction to be executed.
2420 This function generates an AbiHint to say that -128(%rsp)
2421 .. -1(%rsp) should now be regarded as uninitialised.
2423 static
2424 void make_redzone_AbiHint ( const VexAbiInfo* vbi,
2425 IRTemp new_rsp, IRTemp nia, const HChar* who )
2427 Int szB = vbi->guest_stack_redzone_size;
2428 vassert(szB >= 0);
2430 /* A bit of a kludge.  Currently the only ABI for which we have
2431 guested AMD64 is ELF, so just check that the redzone size is the
2432 expected 128 bytes (paranoia). */
2433 vassert(szB == 128);
2435 if (0) vex_printf("AbiHint: %s\n", who);
2436 vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
2437 vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
2438 if (szB > 0)
2439 stmt( IRStmt_AbiHint(
2440 binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
2441 szB,
2442 mkexpr(nia)
2447 /*------------------------------------------------------------*/
2448 /*--- Disassembling addressing modes ---*/
2449 /*------------------------------------------------------------*/
2451 static
2452 const HChar* segRegTxt ( Prefix pfx )
2454 if (pfx & PFX_CS) return "%cs:";
2455 if (pfx & PFX_DS) return "%ds:";
2456 if (pfx & PFX_ES) return "%es:";
2457 if (pfx & PFX_FS) return "%fs:";
2458 if (pfx & PFX_GS) return "%gs:";
2459 if (pfx & PFX_SS) return "%ss:";
2460 return ""; /* no override */
2464 /* 'virtual' is an IRExpr* holding a virtual address. Convert it to a
2465 linear address by adding any required segment override as indicated
2466 by sorb, and also dealing with any address size override
2467 present. */
2468 static
2469 IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
2470 Prefix pfx, IRExpr* virtual )
2472 /* --- address size override --- */
2473 if (haveASO(pfx))
2474 virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
2476 /* Note that the handling below is a hack that relies on the
2477 assumption that %fs or %gs is constant.
2478 Typically, %fs is always 0x63 on linux (in the main thread, it
2479 stays at value 0), %gs always 0x60 on Darwin, ... */
2480 /* --- segment overrides --- */
2481 if (pfx & PFX_FS) {
2482 if (vbi->guest_amd64_assume_fs_is_const) {
2483 /* return virtual + guest_FS_CONST. */
2484 virtual = binop(Iop_Add64, virtual,
2485 IRExpr_Get(OFFB_FS_CONST, Ity_I64));
2486 } else {
2487 unimplemented("amd64 %fs segment override");
2491 if (pfx & PFX_GS) {
2492 if (vbi->guest_amd64_assume_gs_is_const) {
2493 /* return virtual + guest_GS_CONST. */
2494 virtual = binop(Iop_Add64, virtual,
2495 IRExpr_Get(OFFB_GS_CONST, Ity_I64));
2496 } else {
2497 unimplemented("amd64 %gs segment override");
2501 /* cs, ds, es and ss are simply ignored in 64-bit mode. */
2503 return virtual;
2506 //.. {
2507 //.. Int sreg;
2508 //.. IRType hWordTy;
2509 //.. IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
2510 //..
2511 //.. if (sorb == 0)
2512 //.. /* the common case - no override */
2513 //.. return virtual;
2514 //..
2515 //.. switch (sorb) {
2516 //.. case 0x3E: sreg = R_DS; break;
2517 //.. case 0x26: sreg = R_ES; break;
2518 //.. case 0x64: sreg = R_FS; break;
2519 //.. case 0x65: sreg = R_GS; break;
2520 //.. default: vpanic("handleAddrOverrides(x86,guest)");
2521 //.. }
2522 //..
2523 //.. hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
2524 //..
2525 //.. seg_selector = newTemp(Ity_I32);
2526 //.. ldt_ptr = newTemp(hWordTy);
2527 //.. gdt_ptr = newTemp(hWordTy);
2528 //.. r64 = newTemp(Ity_I64);
2529 //..
2530 //.. assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
2531 //.. assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
2532 //.. assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
2533 //..
2534 //.. /*
2535 //.. Call this to do the translation and limit checks:
2536 //.. ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
2537 //.. UInt seg_selector, UInt virtual_addr )
2538 //.. */
2539 //.. assign(
2540 //.. r64,
2541 //.. mkIRExprCCall(
2542 //.. Ity_I64,
2543 //.. 0/*regparms*/,
2544 //.. "x86g_use_seg_selector",
2545 //.. &x86g_use_seg_selector,
2546 //.. mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
2547 //.. mkexpr(seg_selector), virtual)
2548 //.. )
2549 //.. );
2550 //..
2551 //.. /* If the high 32 of the result are non-zero, there was a
2552 //.. failure in address translation. In which case, make a
2553 //.. quick exit.
2554 //.. */
2555 //.. stmt(
2556 //.. IRStmt_Exit(
2557 //.. binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
2558 //.. Ijk_MapFail,
2559 //.. IRConst_U32( guest_eip_curr_instr )
2560 //.. )
2561 //.. );
2562 //..
2563 //.. /* otherwise, here's the translated result. */
2564 //.. return unop(Iop_64to32, mkexpr(r64));
2565 //.. }
2568 /* Generate IR to calculate an address indicated by a ModRM and
2569 following SIB bytes. The expression, and the number of bytes in
2570 the address mode, are returned (the latter in *len). Note that
2571 this fn should not be called if the R/M part of the address denotes
2572 a register instead of memory.  Text of the addressing mode is
2573 placed in buf.
2575 The computed address is stored in a new tempreg, and the
2576 identity of the tempreg is returned.
2578 extra_bytes holds the number of bytes after the amode, as supplied
2579 by the caller. This is needed to make sense of %rip-relative
2580 addresses. Note that the value that *len is set to is only the
2581 length of the amode itself and does not include the value supplied
2582 in extra_bytes.
2585 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
2587 IRTemp tmp = newTemp(Ity_I64);
2588 assign( tmp, addr64 );
2589 return tmp;
2592 static
2593 IRTemp disAMode ( /*OUT*/Int* len,
2594 const VexAbiInfo* vbi, Prefix pfx, Long delta,
2595 /*OUT*/HChar* buf, Int extra_bytes )
2597 UChar mod_reg_rm = getUChar(delta);
2598 delta++;
2600 buf[0] = (UChar)0;
2601 vassert(extra_bytes >= 0 && extra_bytes < 10);
2603 /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2604 jump table seems a bit excessive.
2606 mod_reg_rm &= 0xC7; /* is now XX000YYY */
2607 mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2608 /* is now XX0XXYYY */
2609 mod_reg_rm &= 0x1F; /* is now 000XXYYY */
2610 switch (mod_reg_rm) {
2612 /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2613 REX.B==1: (%r8) .. (%r15), not including (%r12) or (%r13).
2615 case 0x00: case 0x01: case 0x02: case 0x03:
2616 /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2617 { UChar rm = toUChar(mod_reg_rm & 7);
2618 DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2619 *len = 1;
2620 return disAMode_copy2tmp(
2621 handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
2624 /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2625 REX.B==1: d8(%r8) ... d8(%r15), not including d8(%r12)
2627 case 0x08: case 0x09: case 0x0A: case 0x0B:
2628 /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2629 { UChar rm = toUChar(mod_reg_rm & 7);
2630 Long d = getSDisp8(delta);
2631 if (d == 0) {
2632 DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2633 } else {
2634 DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2636 *len = 2;
2637 return disAMode_copy2tmp(
2638 handleAddrOverrides(vbi, pfx,
2639 binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2642 /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2643 REX.B==1: d32(%r8) ... d32(%r15), not including d32(%r12)
2645 case 0x10: case 0x11: case 0x12: case 0x13:
2646 /* ! 14 */ case 0x15: case 0x16: case 0x17:
2647 { UChar rm = toUChar(mod_reg_rm & 7);
2648 Long d = getSDisp32(delta);
2649 DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2650 *len = 5;
2651 return disAMode_copy2tmp(
2652 handleAddrOverrides(vbi, pfx,
2653 binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2656 /* REX.B==0: a register, %rax .. %rdi. This shouldn't happen. */
2657 /* REX.B==1: a register, %r8 .. %r15. This shouldn't happen. */
2658 case 0x18: case 0x19: case 0x1A: case 0x1B:
2659 case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2660 vpanic("disAMode(amd64): not an addr!");
2662 /* RIP + disp32. This assumes that guest_RIP_curr_instr is set
2663 correctly at the start of handling each instruction. */
2664 case 0x05:
2665 { Long d = getSDisp32(delta);
2666 *len = 5;
2667 DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
2668 /* We need to know the next instruction's start address.
2669 Try and figure out what it is, record the guess, and ask
2670 the top-level driver logic (bbToIR_AMD64) to check we
2671 guessed right, after the instruction is completely
2672 decoded. */
2673 guest_RIP_next_mustcheck = True;
2674 guest_RIP_next_assumed = guest_RIP_bbstart
2675 + delta+4 + extra_bytes;
2676 return disAMode_copy2tmp(
2677 handleAddrOverrides(vbi, pfx,
2678 binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
2679 mkU64(d))));
2682 case 0x04: {
2683 /* SIB, with no displacement. Special cases:
2684 -- %rsp cannot act as an index value.
2685 If index_r indicates %rsp, zero is used for the index.
2686 -- when mod is zero and base indicates RBP or R13, base is
2687 instead a 32-bit sign-extended literal.
2688 It's all madness, I tell you. Extract %index, %base and
2689 scale from the SIB byte. The value denoted is then:
2690 | %index == %RSP && (%base == %RBP || %base == %R13)
2691 = d32 following SIB byte
2692 | %index == %RSP && !(%base == %RBP || %base == %R13)
2693 = %base
2694 | %index != %RSP && (%base == %RBP || %base == %R13)
2695 = d32 following SIB byte + (%index << scale)
2696 | %index != %RSP && !(%base == %RBP || %base == %R13)
2697 = %base + (%index << scale)
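/* Worked example (no REX, mod == 0): SIB byte 0x98 gives scale = 2,
   index = 3 (%rbx), base = 0 (%rax); since the index is not %rsp and
   the base is not %rbp/%r13, the denoted address is
   %rax + (%rbx << 2), printed as "(%rax,%rbx,4)". */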
2699 UChar sib = getUChar(delta);
2700 UChar scale = toUChar((sib >> 6) & 3);
2701 UChar index_r = toUChar((sib >> 3) & 7);
2702 UChar base_r = toUChar(sib & 7);
2703 /* correct since #(R13) == 8 + #(RBP) */
2704 Bool base_is_BPor13 = toBool(base_r == R_RBP);
2705 Bool index_is_SP = toBool(index_r == R_RSP && 0==getRexX(pfx));
2706 delta++;
2708 if ((!index_is_SP) && (!base_is_BPor13)) {
2709 if (scale == 0) {
2710 DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2711 nameIRegRexB(8,pfx,base_r),
2712 nameIReg64rexX(pfx,index_r));
2713 } else {
2714 DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2715 nameIRegRexB(8,pfx,base_r),
2716 nameIReg64rexX(pfx,index_r), 1<<scale);
2718 *len = 2;
2719 return
2720 disAMode_copy2tmp(
2721 handleAddrOverrides(vbi, pfx,
2722 binop(Iop_Add64,
2723 getIRegRexB(8,pfx,base_r),
2724 binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2725 mkU8(scale)))));
2728 if ((!index_is_SP) && base_is_BPor13) {
2729 Long d = getSDisp32(delta);
2730 DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
2731 nameIReg64rexX(pfx,index_r), 1<<scale);
2732 *len = 6;
2733 return
2734 disAMode_copy2tmp(
2735 handleAddrOverrides(vbi, pfx,
2736 binop(Iop_Add64,
2737 binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2738 mkU8(scale)),
2739 mkU64(d))));
2742 if (index_is_SP && (!base_is_BPor13)) {
2743 DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
2744 *len = 2;
2745 return disAMode_copy2tmp(
2746 handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
2749 if (index_is_SP && base_is_BPor13) {
2750 Long d = getSDisp32(delta);
2751 DIS(buf, "%s%lld", segRegTxt(pfx), d);
2752 *len = 6;
2753 return disAMode_copy2tmp(
2754 handleAddrOverrides(vbi, pfx, mkU64(d)));
2757 vassert(0);
2760 /* SIB, with 8-bit displacement. Special cases:
2761 -- %rsp cannot act as an index value.
2762 If index_r indicates %rsp, zero is used for the index.
2763 Denoted value is:
2764 | %index == %RSP
2765 = d8 + %base
2766 | %index != %RSP
2767 = d8 + %base + (%index << scale)
2769 case 0x0C: {
2770 UChar sib = getUChar(delta);
2771 UChar scale = toUChar((sib >> 6) & 3);
2772 UChar index_r = toUChar((sib >> 3) & 7);
2773 UChar base_r = toUChar(sib & 7);
2774 Long d = getSDisp8(delta+1);
2776 if (index_r == R_RSP && 0==getRexX(pfx)) {
2777 DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2778 d, nameIRegRexB(8,pfx,base_r));
2779 *len = 3;
2780 return disAMode_copy2tmp(
2781 handleAddrOverrides(vbi, pfx,
2782 binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2783 } else {
2784 if (scale == 0) {
2785 DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2786 nameIRegRexB(8,pfx,base_r),
2787 nameIReg64rexX(pfx,index_r));
2788 } else {
2789 DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2790 nameIRegRexB(8,pfx,base_r),
2791 nameIReg64rexX(pfx,index_r), 1<<scale);
2793 *len = 3;
2794 return
2795 disAMode_copy2tmp(
2796 handleAddrOverrides(vbi, pfx,
2797 binop(Iop_Add64,
2798 binop(Iop_Add64,
2799 getIRegRexB(8,pfx,base_r),
2800 binop(Iop_Shl64,
2801 getIReg64rexX(pfx,index_r), mkU8(scale))),
2802 mkU64(d))));
2804 vassert(0); /*NOTREACHED*/
2807 /* SIB, with 32-bit displacement. Special cases:
2808 -- %rsp cannot act as an index value.
2809 If index_r indicates %rsp, zero is used for the index.
2810 Denoted value is:
2811 | %index == %RSP
2812 = d32 + %base
2813 | %index != %RSP
2814 = d32 + %base + (%index << scale)
2816 case 0x14: {
2817 UChar sib = getUChar(delta);
2818 UChar scale = toUChar((sib >> 6) & 3);
2819 UChar index_r = toUChar((sib >> 3) & 7);
2820 UChar base_r = toUChar(sib & 7);
2821 Long d = getSDisp32(delta+1);
2823 if (index_r == R_RSP && 0==getRexX(pfx)) {
2824 DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2825 d, nameIRegRexB(8,pfx,base_r));
2826 *len = 6;
2827 return disAMode_copy2tmp(
2828 handleAddrOverrides(vbi, pfx,
2829 binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2830 } else {
2831 if (scale == 0) {
2832 DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2833 nameIRegRexB(8,pfx,base_r),
2834 nameIReg64rexX(pfx,index_r));
2835 } else {
2836 DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2837 nameIRegRexB(8,pfx,base_r),
2838 nameIReg64rexX(pfx,index_r), 1<<scale);
2840 *len = 6;
2841 return
2842 disAMode_copy2tmp(
2843 handleAddrOverrides(vbi, pfx,
2844 binop(Iop_Add64,
2845 binop(Iop_Add64,
2846 getIRegRexB(8,pfx,base_r),
2847 binop(Iop_Shl64,
2848 getIReg64rexX(pfx,index_r), mkU8(scale))),
2849 mkU64(d))));
2851 vassert(0); /*NOTREACHED*/
2854 default:
2855 vpanic("disAMode(amd64)");
2856 return 0; /*notreached*/
2861 /* Similarly for VSIB addressing. This returns just the addend,
2862 and fills in *rI and *vscale with the register number of the vector
2863 index and its multiplicand. */
2864 static
2865 IRTemp disAVSIBMode ( /*OUT*/Int* len,
2866 const VexAbiInfo* vbi, Prefix pfx, Long delta,
2867 /*OUT*/HChar* buf, /*OUT*/UInt* rI,
2868 IRType ty, /*OUT*/Int* vscale )
2870 UChar mod_reg_rm = getUChar(delta);
2871 const HChar *vindex;
2873 *len = 0;
2874 *rI = 0;
2875 *vscale = 0;
2876 buf[0] = (UChar)0;
2877 if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
2878 return IRTemp_INVALID;
2880 UChar sib = getUChar(delta+1);
2881 UChar scale = toUChar((sib >> 6) & 3);
2882 UChar index_r = toUChar((sib >> 3) & 7);
2883 UChar base_r = toUChar(sib & 7);
2884 Long d = 0;
2885 /* correct since #(R13) == 8 + #(RBP) */
2886 Bool base_is_BPor13 = toBool(base_r == R_RBP);
2887 delta += 2;
2888 *len = 2;
2890 *rI = index_r | (getRexX(pfx) << 3);
2891 if (ty == Ity_V128)
2892 vindex = nameXMMReg(*rI);
2893 else
2894 vindex = nameYMMReg(*rI);
2895 *vscale = 1<<scale;
2897 switch (mod_reg_rm >> 6) {
2898 case 0:
2899 if (base_is_BPor13) {
2900 d = getSDisp32(delta);
2901 *len += 4;
2902 if (scale == 0) {
2903 DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
2904 } else {
2905 DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
2907 return disAMode_copy2tmp( mkU64(d) );
2908 } else {
2909 if (scale == 0) {
2910 DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2911 nameIRegRexB(8,pfx,base_r), vindex);
2912 } else {
2913 DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2914 nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2917 break;
2918 case 1:
2919 d = getSDisp8(delta);
2920 *len += 1;
2921 goto have_disp;
2922 case 2:
2923 d = getSDisp32(delta);
2924 *len += 4;
2925 have_disp:
2926 if (scale == 0) {
2927 DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2928 nameIRegRexB(8,pfx,base_r), vindex);
2929 } else {
2930 DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2931 nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2933 break;
2936 if (!d)
2937 return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
2938 return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
2939 mkU64(d)) );
2943 /* Figure out the number of (insn-stream) bytes constituting the amode
2944 beginning at delta.  This is useful for getting hold of literals beyond
2945 the end of the amode before it has been disassembled. */
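/* For example, a Grp1 byte-immediate form such as 0x80 /0 ib carries
   an 8-bit immediate after the amode; knowing the amode's length lets
   the decoder fetch that immediate before disAMode has been run. */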
2947 static UInt lengthAMode ( Prefix pfx, Long delta )
2949 UChar mod_reg_rm = getUChar(delta);
2950 delta++;
2952 /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2953 jump table seems a bit excessive.
2955 mod_reg_rm &= 0xC7; /* is now XX000YYY */
2956 mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2957 /* is now XX0XXYYY */
2958 mod_reg_rm &= 0x1F; /* is now 000XXYYY */
2959 switch (mod_reg_rm) {
2961 /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2962 REX.B==1: (%r8) .. (%r15), not including (%r12) or (%r13).
2964 case 0x00: case 0x01: case 0x02: case 0x03:
2965 /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2966 return 1;
2968 /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2969 REX.B==1: d8(%r8) ... d8(%r15), not including d8(%r12)
2971 case 0x08: case 0x09: case 0x0A: case 0x0B:
2972 /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2973 return 2;
2975 /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2976 REX.B==1: d32(%r8) ... d32(%r15), not including d32(%r12)
2978 case 0x10: case 0x11: case 0x12: case 0x13:
2979 /* ! 14 */ case 0x15: case 0x16: case 0x17:
2980 return 5;
2982 /* REX.B==0: a register, %rax .. %rdi. This shouldn't happen. */
2983 /* REX.B==1: a register, %r8 .. %r15. This shouldn't happen. */
2984 /* Not an address, but still handled. */
2985 case 0x18: case 0x19: case 0x1A: case 0x1B:
2986 case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2987 return 1;
2989 /* RIP + disp32. */
2990 case 0x05:
2991 return 5;
2993 case 0x04: {
2994 /* SIB, with no displacement. */
2995 UChar sib = getUChar(delta);
2996 UChar base_r = toUChar(sib & 7);
2997 /* correct since #(R13) == 8 + #(RBP) */
2998 Bool base_is_BPor13 = toBool(base_r == R_RBP);
3000 if (base_is_BPor13) {
3001 return 6;
3002 } else {
3003 return 2;
3007 /* SIB, with 8-bit displacement. */
3008 case 0x0C:
3009 return 3;
3011 /* SIB, with 32-bit displacement. */
3012 case 0x14:
3013 return 6;
3015 default:
3016 vpanic("lengthAMode(amd64)");
3017 return 0; /*notreached*/
3022 /*------------------------------------------------------------*/
3023 /*--- Disassembling common idioms ---*/
3024 /*------------------------------------------------------------*/
3026 typedef
3027 enum { WithFlagNone=2, WithFlagCarry, WithFlagCarryX, WithFlagOverX }
3028 WithFlag;
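/* Roughly: WithFlagNone covers the plain ops, WithFlagCarry selects
   the ADC/SBB variants, and WithFlagCarryX / WithFlagOverX select
   ADCX / ADOX, which update only the C flag, respectively only the
   O flag. */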
3030 /* Handle binary integer instructions of the form
3031 op E, G meaning
3032 op reg-or-mem, reg
3033 Is passed a ptr to the modRM byte, the actual operation, and the
3034 data size. Returns the address advanced completely over this
3035 instruction.
3037 E(src) is reg-or-mem
3038 G(dst) is reg.
3040 If E is reg, --> GET %G, tmp
3041 OP %E, tmp
3042 PUT tmp, %G
3044 If E is mem and OP is not reversible,
3045 --> (getAddr E) -> tmpa
3046 LD (tmpa), tmpa
3047 GET %G, tmp2
3048 OP tmpa, tmp2
3049 PUT tmp2, %G
3051 If E is mem and OP is reversible
3052 --> (getAddr E) -> tmpa
3053 LD (tmpa), tmpa
3054 OP %G, tmpa
3055 PUT tmpa, %G
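/* For instance, "add 8(%rsi),%rcx" is an E,G form: the memory operand
   is E (the source) and %rcx is G (the destination). */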
3057 static
3058 ULong dis_op2_E_G ( const VexAbiInfo* vbi,
3059 Prefix pfx,
3060 IROp op8,
3061 WithFlag flag,
3062 Bool keep,
3063 Int size,
3064 Long delta0,
3065 const HChar* t_amd64opc )
3067 HChar dis_buf[50];
3068 Int len;
3069 IRType ty = szToITy(size);
3070 IRTemp dst1 = newTemp(ty);
3071 IRTemp src = newTemp(ty);
3072 IRTemp dst0 = newTemp(ty);
3073 UChar rm = getUChar(delta0);
3074 IRTemp addr = IRTemp_INVALID;
3076 /* Stay sane -- check for valid (op8, flag, keep) combinations. */
3077 switch (op8) {
3078 case Iop_Add8:
3079 switch (flag) {
3080 case WithFlagNone: case WithFlagCarry:
3081 case WithFlagCarryX: case WithFlagOverX:
3082 vassert(keep);
3083 break;
3084 default:
3085 vassert(0);
3087 break;
3088 case Iop_Sub8:
3089 vassert(flag == WithFlagNone || flag == WithFlagCarry);
3090 if (flag == WithFlagCarry) vassert(keep);
3091 break;
3092 case Iop_And8:
3093 vassert(flag == WithFlagNone);
3094 break;
3095 case Iop_Or8: case Iop_Xor8:
3096 vassert(flag == WithFlagNone);
3097 vassert(keep);
3098 break;
3099 default:
3100 vassert(0);
3103 if (epartIsReg(rm)) {
3104 /* Specially handle XOR reg,reg, because that doesn't really
3105 depend on reg, and doing the obvious thing potentially
3106 generates a spurious value check failure due to the bogus
3107 dependency. Ditto SUB/SBB reg,reg. */
3108 if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
3109 && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
3110 putIRegG(size,pfx,rm, mkU(ty,0));
3113 assign( dst0, getIRegG(size,pfx,rm) );
3114 assign( src, getIRegE(size,pfx,rm) );
3116 if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3117 helper_ADC( size, dst1, dst0, src,
3118 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3119 putIRegG(size, pfx, rm, mkexpr(dst1));
3120 } else
3121 if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3122 helper_SBB( size, dst1, dst0, src,
3123 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3124 putIRegG(size, pfx, rm, mkexpr(dst1));
3125 } else
3126 if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
3127 helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
3128 putIRegG(size, pfx, rm, mkexpr(dst1));
3129 } else
3130 if (op8 == Iop_Add8 && flag == WithFlagOverX) {
3131 helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
3132 putIRegG(size, pfx, rm, mkexpr(dst1));
3133 } else {
3134 assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3135 if (isAddSub(op8))
3136 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3137 else
3138 setFlags_DEP1(op8, dst1, ty);
3139 if (keep)
3140 putIRegG(size, pfx, rm, mkexpr(dst1));
3143 DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3144 nameIRegE(size,pfx,rm),
3145 nameIRegG(size,pfx,rm));
3146 return 1+delta0;
3147 } else {
3148 /* E refers to memory */
3149 addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3150 assign( dst0, getIRegG(size,pfx,rm) );
3151 assign( src, loadLE(szToITy(size), mkexpr(addr)) );
3153 if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3154 helper_ADC( size, dst1, dst0, src,
3155 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3156 putIRegG(size, pfx, rm, mkexpr(dst1));
3157 } else
3158 if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3159 helper_SBB( size, dst1, dst0, src,
3160 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3161 putIRegG(size, pfx, rm, mkexpr(dst1));
3162 } else
3163 if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
3164 helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
3165 putIRegG(size, pfx, rm, mkexpr(dst1));
3166 } else
3167 if (op8 == Iop_Add8 && flag == WithFlagOverX) {
3168 helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
3169 putIRegG(size, pfx, rm, mkexpr(dst1));
3170 } else {
3171 assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3172 if (isAddSub(op8))
3173 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3174 else
3175 setFlags_DEP1(op8, dst1, ty);
3176 if (keep)
3177 putIRegG(size, pfx, rm, mkexpr(dst1));
3180 DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3181 dis_buf, nameIRegG(size, pfx, rm));
3182 return len+delta0;
3188 /* Handle binary integer instructions of the form
3189 op G, E meaning
3190 op reg, reg-or-mem
3191 Is passed a ptr to the modRM byte, the actual operation, and the
3192 data size. Returns the address advanced completely over this
3193 instruction.
3195 G(src) is reg.
3196 E(dst) is reg-or-mem
3198 If E is reg, --> GET %E, tmp
3199 OP %G, tmp
3200 PUT tmp, %E
3202 If E is mem, --> (getAddr E) -> tmpa
3203 LD (tmpa), tmpv
3204 OP %G, tmpv
3205 ST tmpv, (tmpa)
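/* For instance, "add %rcx,8(%rsi)" is a G,E form: %rcx is G (the
   source) and the memory operand is E (the destination). */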
3207 static
3208 ULong dis_op2_G_E ( const VexAbiInfo* vbi,
3209 Prefix pfx,
3210 IROp op8,
3211 WithFlag flag,
3212 Bool keep,
3213 Int size,
3214 Long delta0,
3215 const HChar* t_amd64opc )
3217 HChar dis_buf[50];
3218 Int len;
3219 IRType ty = szToITy(size);
3220 IRTemp dst1 = newTemp(ty);
3221 IRTemp src = newTemp(ty);
3222 IRTemp dst0 = newTemp(ty);
3223 UChar rm = getUChar(delta0);
3224 IRTemp addr = IRTemp_INVALID;
3226 /* Stay sane -- check for valid (op8, flag, keep) combinations. */
3227 switch (op8) {
3228 case Iop_Add8:
3229 vassert(flag == WithFlagNone || flag == WithFlagCarry);
3230 vassert(keep);
3231 break;
3232 case Iop_Sub8:
3233 vassert(flag == WithFlagNone || flag == WithFlagCarry);
3234 if (flag == WithFlagCarry) vassert(keep);
3235 break;
3236 case Iop_And8: case Iop_Or8: case Iop_Xor8:
3237 vassert(flag == WithFlagNone);
3238 vassert(keep);
3239 break;
3240 default:
3241 vassert(0);
3244 /* flag != WithFlagNone is only allowed for Add and Sub and indicates the
3245 intended operation is add-with-carry or subtract-with-borrow. */
3247 if (epartIsReg(rm)) {
3248 /* Specially handle XOR reg,reg, because that doesn't really
3249 depend on reg, and doing the obvious thing potentially
3250 generates a spurious value check failure due to the bogus
3251 dependency. Ditto SUB/SBB reg,reg. */
3252 if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
3253 && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
3254 putIRegE(size,pfx,rm, mkU(ty,0));
3257 assign(dst0, getIRegE(size,pfx,rm));
3258 assign(src, getIRegG(size,pfx,rm));
3260 if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3261 helper_ADC( size, dst1, dst0, src,
3262 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3263 putIRegE(size, pfx, rm, mkexpr(dst1));
3264 } else
3265 if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3266 helper_SBB( size, dst1, dst0, src,
3267 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3268 putIRegE(size, pfx, rm, mkexpr(dst1));
3269 } else {
3270 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3271 if (isAddSub(op8))
3272 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3273 else
3274 setFlags_DEP1(op8, dst1, ty);
3275 if (keep)
3276 putIRegE(size, pfx, rm, mkexpr(dst1));
3279 DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3280 nameIRegG(size,pfx,rm),
3281 nameIRegE(size,pfx,rm));
3282 return 1+delta0;
3285 /* E refers to memory */
3287 addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3288 assign(dst0, loadLE(ty,mkexpr(addr)));
3289 assign(src, getIRegG(size,pfx,rm));
3291 if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3292 if (haveLOCK(pfx)) {
3293 /* cas-style store */
3294 helper_ADC( size, dst1, dst0, src,
3295 /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3296 } else {
3297 /* normal store */
3298 helper_ADC( size, dst1, dst0, src,
3299 /*store*/addr, IRTemp_INVALID, 0 );
3301 } else
3302 if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3303 if (haveLOCK(pfx)) {
3304 /* cas-style store */
3305 helper_SBB( size, dst1, dst0, src,
3306 /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3307 } else {
3308 /* normal store */
3309 helper_SBB( size, dst1, dst0, src,
3310 /*store*/addr, IRTemp_INVALID, 0 );
3312 } else {
3313 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3314 if (keep) {
3315 if (haveLOCK(pfx)) {
3316 if (0) vex_printf("locked case\n" );
3317 casLE( mkexpr(addr),
3318 mkexpr(dst0)/*expval*/,
3319 mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
3320 } else {
3321 if (0) vex_printf("nonlocked case\n");
3322 storeLE(mkexpr(addr), mkexpr(dst1));
3325 if (isAddSub(op8))
3326 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3327 else
3328 setFlags_DEP1(op8, dst1, ty);
3331 DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3332 nameIRegG(size,pfx,rm), dis_buf);
3333 return len+delta0;
3338 /* Handle move instructions of the form
3339 mov E, G meaning
3340 mov reg-or-mem, reg
3341 Is passed a ptr to the modRM byte, and the data size. Returns
3342 the address advanced completely over this instruction.
3344 E(src) is reg-or-mem
3345 G(dst) is reg.
3347 If E is reg, --> GET %E, tmpv
3348 PUT tmpv, %G
3350 If E is mem --> (getAddr E) -> tmpa
3351 LD (tmpa), tmpb
3352 PUT tmpb, %G
3354 static
3355 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
3356 Prefix pfx,
3357 Int size,
3358 Long delta0 )
3360 Int len;
3361 UChar rm = getUChar(delta0);
3362 HChar dis_buf[50];
3364 if (epartIsReg(rm)) {
3365 putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
3366 DIP("mov%c %s,%s\n", nameISize(size),
3367 nameIRegE(size,pfx,rm),
3368 nameIRegG(size,pfx,rm));
3369 return 1+delta0;
3372 /* E refers to memory */
3374 IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3375 putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
3376 DIP("mov%c %s,%s\n", nameISize(size),
3377 dis_buf,
3378 nameIRegG(size,pfx,rm));
3379 return delta0+len;
3384 /* Handle move instructions of the form
3385 mov G, E meaning
3386 mov reg, reg-or-mem
3387 Is passed a ptr to the modRM byte, and the data size. Returns
3388 the address advanced completely over this instruction.
3389 We have to decide here whether an F2 or F3 prefix is acceptable.  F2 never is.
3391 G(src) is reg.
3392 E(dst) is reg-or-mem
3394 If E is reg, --> GET %G, tmp
3395 PUT tmp, %E
3397 If E is mem, --> (getAddr E) -> tmpa
3398 GET %G, tmpv
3399 ST tmpv, (tmpa)
3401 static
3402 ULong dis_mov_G_E ( const VexAbiInfo* vbi,
3403 Prefix pfx,
3404 Int size,
3405 Long delta0,
3406 /*OUT*/Bool* ok )
3408 Int len;
3409 UChar rm = getUChar(delta0);
3410 HChar dis_buf[50];
3412 *ok = True;
3414 if (epartIsReg(rm)) {
3415 if (haveF2orF3(pfx)) { *ok = False; return delta0; }
3416 putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
3417 DIP("mov%c %s,%s\n", nameISize(size),
3418 nameIRegG(size,pfx,rm),
3419 nameIRegE(size,pfx,rm));
3420 return 1+delta0;
3423 /* E refers to memory */
3425 if (haveF2(pfx)) { *ok = False; return delta0; }
3426 /* F3(XRELEASE) is acceptable, though. */
3427 IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3428 storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
3429 DIP("mov%c %s,%s\n", nameISize(size),
3430 nameIRegG(size,pfx,rm),
3431 dis_buf);
3432 return len+delta0;
3437 /* op $immediate, AL/AX/EAX/RAX. */
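/* E.g. "add $0x12345678,%rax": even at 64-bit operand size the
   encoded immediate is only 32 bits and is sign-extended, hence the
   size4 = imin(size,4) below. */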
3438 static
3439 ULong dis_op_imm_A ( Int size,
3440 Bool carrying,
3441 IROp op8,
3442 Bool keep,
3443 Long delta,
3444 const HChar* t_amd64opc )
3446 Int size4 = imin(size,4);
3447 IRType ty = szToITy(size);
3448 IRTemp dst0 = newTemp(ty);
3449 IRTemp src = newTemp(ty);
3450 IRTemp dst1 = newTemp(ty);
3451 Long lit = getSDisp(size4,delta);
3452 assign(dst0, getIRegRAX(size));
3453 assign(src, mkU(ty,lit & mkSizeMask(size)));
3455 if (isAddSub(op8) && !carrying) {
3456 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3457 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3459 else
3460 if (isLogic(op8)) {
3461 vassert(!carrying);
3462 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3463 setFlags_DEP1(op8, dst1, ty);
3465 else
3466 if (op8 == Iop_Add8 && carrying) {
3467 helper_ADC( size, dst1, dst0, src,
3468 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3470 else
3471 if (op8 == Iop_Sub8 && carrying) {
3472 helper_SBB( size, dst1, dst0, src,
3473 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3475 else
3476 vpanic("dis_op_imm_A(amd64,guest)");
3478 if (keep)
3479 putIRegRAX(size, mkexpr(dst1));
3481 DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
3482 lit, nameIRegRAX(size));
3483 return delta+size4;
3487 /* Sign- and Zero-extending moves. */
3488 static
3489 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
3490 Prefix pfx,
3491 Long delta, Int szs, Int szd, Bool sign_extend )
3493 UChar rm = getUChar(delta);
3494 if (epartIsReg(rm)) {
3495 putIRegG(szd, pfx, rm,
3496 doScalarWidening(
3497 szs,szd,sign_extend,
3498 getIRegE(szs,pfx,rm)));
3499 DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3500 nameISize(szs),
3501 nameISize(szd),
3502 nameIRegE(szs,pfx,rm),
3503 nameIRegG(szd,pfx,rm));
3504 return 1+delta;
3507 /* E refers to memory */
3509 Int len;
3510 HChar dis_buf[50];
3511 IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
3512 putIRegG(szd, pfx, rm,
3513 doScalarWidening(
3514 szs,szd,sign_extend,
3515 loadLE(szToITy(szs),mkexpr(addr))));
3516 DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3517 nameISize(szs),
3518 nameISize(szd),
3519 dis_buf,
3520 nameIRegG(szd,pfx,rm));
3521 return len+delta;
3526 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
3527 the 64 / 32 / 16 / 8 bit quantity in the given IRTemp. */
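/* For example, with sz == 4 the dividend is EDX:EAX and the divisor
   is the 32-bit value in t; the quotient lands in EAX and the
   remainder in EDX.  With sz == 1 the dividend is AX, the quotient
   goes to AL and the remainder to AH. */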
3528 static
3529 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
3531 /* special-case the 64-bit case */
3532 if (sz == 8) {
3533 IROp op = signed_divide ? Iop_DivModS128to64
3534 : Iop_DivModU128to64;
3535 IRTemp src128 = newTemp(Ity_I128);
3536 IRTemp dst128 = newTemp(Ity_I128);
3537 assign( src128, binop(Iop_64HLto128,
3538 getIReg64(R_RDX),
3539 getIReg64(R_RAX)) );
3540 assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
3541 putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
3542 putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
3543 } else {
3544 IROp op = signed_divide ? Iop_DivModS64to32
3545 : Iop_DivModU64to32;
3546 IRTemp src64 = newTemp(Ity_I64);
3547 IRTemp dst64 = newTemp(Ity_I64);
3548 switch (sz) {
3549 case 4:
3550 assign( src64,
3551 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
3552 assign( dst64,
3553 binop(op, mkexpr(src64), mkexpr(t)) );
3554 putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
3555 putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
3556 break;
3557 case 2: {
3558 IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
3559 IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
3560 assign( src64, unop(widen3264,
3561 binop(Iop_16HLto32,
3562 getIRegRDX(2),
3563 getIRegRAX(2))) );
3564 assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
3565 putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
3566 putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
3567 break;
3569 case 1: {
3570 IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
3571 IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
3572 IROp widen816 = signed_divide ? Iop_8Sto16 : Iop_8Uto16;
3573 assign( src64, unop(widen3264,
3574 unop(widen1632, getIRegRAX(2))) );
3575 assign( dst64,
3576 binop(op, mkexpr(src64),
3577 unop(widen1632, unop(widen816, mkexpr(t)))) );
3578 putIRegRAX( 1, unop(Iop_16to8,
3579 unop(Iop_32to16,
3580 unop(Iop_64to32,mkexpr(dst64)))) );
3581 putIRegAH( unop(Iop_16to8,
3582 unop(Iop_32to16,
3583 unop(Iop_64HIto32,mkexpr(dst64)))) );
3584 break;
3586 default:
3587 vpanic("codegen_div(amd64)");
3592 static
3593 ULong dis_Grp1 ( const VexAbiInfo* vbi,
3594 Prefix pfx,
3595 Long delta, UChar modrm,
3596 Int am_sz, Int d_sz, Int sz, Long d64 )
3598 Int len;
3599 HChar dis_buf[50];
3600 IRType ty = szToITy(sz);
3601 IRTemp dst1 = newTemp(ty);
3602 IRTemp src = newTemp(ty);
3603 IRTemp dst0 = newTemp(ty);
3604 IRTemp addr = IRTemp_INVALID;
3605 IROp op8 = Iop_INVALID;
3606 ULong mask = mkSizeMask(sz);
3608 switch (gregLO3ofRM(modrm)) {
3609 case 0: op8 = Iop_Add8; break; case 1: op8 = Iop_Or8; break;
3610 case 2: break; // ADC
3611 case 3: break; // SBB
3612 case 4: op8 = Iop_And8; break; case 5: op8 = Iop_Sub8; break;
3613 case 6: op8 = Iop_Xor8; break; case 7: op8 = Iop_Sub8; break;
3614 /*NOTREACHED*/
3615 default: vpanic("dis_Grp1(amd64): unhandled case");
3618 if (epartIsReg(modrm)) {
3619 vassert(am_sz == 1);
3621 assign(dst0, getIRegE(sz,pfx,modrm));
3622 assign(src, mkU(ty,d64 & mask));
3624 if (gregLO3ofRM(modrm) == 2 /* ADC */) {
3625 helper_ADC( sz, dst1, dst0, src,
3626 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3627 } else
3628 if (gregLO3ofRM(modrm) == 3 /* SBB */) {
3629 helper_SBB( sz, dst1, dst0, src,
3630 /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3631 } else {
3632 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3633 if (isAddSub(op8))
3634 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3635 else
3636 setFlags_DEP1(op8, dst1, ty);
3639 if (gregLO3ofRM(modrm) < 7)
3640 putIRegE(sz, pfx, modrm, mkexpr(dst1));
3642 delta += (am_sz + d_sz);
3643 DIP("%s%c $%lld, %s\n",
3644 nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
3645 nameIRegE(sz,pfx,modrm));
3646 } else {
3647 addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
3649 assign(dst0, loadLE(ty,mkexpr(addr)));
3650 assign(src, mkU(ty,d64 & mask));
3652 if (gregLO3ofRM(modrm) == 2 /* ADC */) {
3653 if (haveLOCK(pfx)) {
3654 /* cas-style store */
3655 helper_ADC( sz, dst1, dst0, src,
3656 /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3657 } else {
3658 /* normal store */
3659 helper_ADC( sz, dst1, dst0, src,
3660 /*store*/addr, IRTemp_INVALID, 0 );
3662 } else
3663 if (gregLO3ofRM(modrm) == 3 /* SBB */) {
3664 if (haveLOCK(pfx)) {
3665 /* cas-style store */
3666 helper_SBB( sz, dst1, dst0, src,
3667 /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3668 } else {
3669 /* normal store */
3670 helper_SBB( sz, dst1, dst0, src,
3671 /*store*/addr, IRTemp_INVALID, 0 );
3673 } else {
3674 assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3675 if (gregLO3ofRM(modrm) < 7) {
3676 if (haveLOCK(pfx)) {
3677 casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
3678 mkexpr(dst1)/*newVal*/,
3679 guest_RIP_curr_instr );
3680 } else {
3681 storeLE(mkexpr(addr), mkexpr(dst1));
3684 if (isAddSub(op8))
3685 setFlags_DEP1_DEP2(op8, dst0, src, ty);
3686 else
3687 setFlags_DEP1(op8, dst1, ty);
3690 delta += (len+d_sz);
3691 DIP("%s%c $%lld, %s\n",
3692 nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
3693 d64, dis_buf);
3695 return delta;
3699 /* Group 2 extended opcodes. shift_expr must be an 8-bit typed
3700 expression. */
3702 static
3703 ULong dis_Grp2 ( const VexAbiInfo* vbi,
3704 Prefix pfx,
3705 Long delta, UChar modrm,
3706 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
3707 const HChar* shift_expr_txt, Bool* decode_OK )
3709 /* delta on entry points at the modrm byte. */
3710 HChar dis_buf[50];
3711 Int len;
3712 Bool isShift, isRotate, isRotateC;
3713 IRType ty = szToITy(sz);
3714 IRTemp dst0 = newTemp(ty);
3715 IRTemp dst1 = newTemp(ty);
3716 IRTemp addr = IRTemp_INVALID;
3718 *decode_OK = True;
3720 vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
3722 /* Put value to shift/rotate in dst0. */
3723 if (epartIsReg(modrm)) {
3724 assign(dst0, getIRegE(sz, pfx, modrm));
3725 delta += (am_sz + d_sz);
3726 } else {
3727 addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
3728 assign(dst0, loadLE(ty,mkexpr(addr)));
3729 delta += len + d_sz;
3732 isShift = False;
3733 switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
3735 isRotate = False;
3736 switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }
3738 isRotateC = False;
3739 switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }
3741 if (!isShift && !isRotate && !isRotateC) {
3742 /*NOTREACHED*/
3743 vpanic("dis_Grp2(Reg): unhandled case(amd64)");
3746 if (isRotateC) {
3747 /* Call a helper; this insn is so ridiculous it does not deserve
3748 better. One problem is, the helper has to calculate both the
3749 new value and the new flags. This is more than 64 bits, and
3750 there is no way to return more than 64 bits from the helper.
3751 Hence the crude and obvious solution is to call it twice,
3752 using the sign of the sz field to indicate whether it is the
3753 value or rflags result we want.
3755 Bool left = toBool(gregLO3ofRM(modrm) == 2);
3756 IRExpr** argsVALUE;
3757 IRExpr** argsRFLAGS;
3759 IRTemp new_value = newTemp(Ity_I64);
3760 IRTemp new_rflags = newTemp(Ity_I64);
3761 IRTemp old_rflags = newTemp(Ity_I64);
3763 assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );
3765 argsVALUE
3766 = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
3767 widenUto64(shift_expr), /* rotate amount */
3768 mkexpr(old_rflags),
3769 mkU64(sz) );
3770 assign( new_value,
3771 mkIRExprCCall(
3772 Ity_I64,
3773 0/*regparm*/,
3774 left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
3775 left ? &amd64g_calculate_RCL : &amd64g_calculate_RCR,
3776 argsVALUE
3780 argsRFLAGS
3781 = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
3782 widenUto64(shift_expr), /* rotate amount */
3783 mkexpr(old_rflags),
3784 mkU64(-sz) );
3785 assign( new_rflags,
3786 mkIRExprCCall(
3787 Ity_I64,
3788 0/*regparm*/,
3789 left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
3790 left ? &amd64g_calculate_RCL : &amd64g_calculate_RCR,
3791 argsRFLAGS
3795 assign( dst1, narrowTo(ty, mkexpr(new_value)) );
3796 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
3797 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
3798 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
3799 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
3802 else
3803 if (isShift) {
3805 IRTemp pre64 = newTemp(Ity_I64);
3806 IRTemp res64 = newTemp(Ity_I64);
3807 IRTemp res64ss = newTemp(Ity_I64);
3808 IRTemp shift_amt = newTemp(Ity_I8);
3809 UChar mask = toUChar(sz==8 ? 63 : 31);
3810 IROp op64;
3812 switch (gregLO3ofRM(modrm)) {
3813 case 4: op64 = Iop_Shl64; break;
3814 case 5: op64 = Iop_Shr64; break;
3815 case 6: op64 = Iop_Shl64; break;
3816 case 7: op64 = Iop_Sar64; break;
3817 /*NOTREACHED*/
3818 default: vpanic("dis_Grp2:shift"); break;
3821 /* Widen the value to be shifted to 64 bits, do the shift, and
3822 narrow back down. This seems surprisingly long-winded, but
3823 unfortunately the AMD semantics requires that 8/16/32-bit
3824 shifts give defined results for shift values all the way up
3825 to 32, and this seems the simplest way to do it. It has the
3826 advantage that the only IR level shifts generated are of 64
3827 bit values, and the shift amount is guaranteed to be in the
3828 range 0 .. 63, thereby observing the IR semantics requiring
3829 all shift values to be in the range 0 .. 2^word_size-1.
3831 Therefore the shift amount is masked with 63 for 64-bit shifts
3832         and 31 for all others. */
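      /* Editorial worked example (not from the original source): an
         8-bit SHL of 0xC3 by 2 under this scheme:
            shift_amt = 2 & 31           = 2
            pre64     = 0x00000000000000C3
            res64     = pre64 << 2       = 0x30C  -> dst1 = 0x0C
            res64ss   = pre64 << (2-1)   = 0x186  -> bit 7 = 1 -> CF = 1
         res64ss exists only so the flags thunk below can recover the
         last bit shifted out of the narrow operand. */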
3834 /* shift_amt = shift_expr & MASK, regardless of operation size */
3835 assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );
3837 /* suitably widen the value to be shifted to 64 bits. */
3838 assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
3839 : widenUto64(mkexpr(dst0)) );
3841 /* res64 = pre64 `shift` shift_amt */
3842 assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );
3844 /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
3845 assign( res64ss,
3846 binop(op64,
3847 mkexpr(pre64),
3848 binop(Iop_And8,
3849 binop(Iop_Sub8,
3850 mkexpr(shift_amt), mkU8(1)),
3851 mkU8(mask))) );
3853 /* Build the flags thunk. */
3854 setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);
3856 /* Narrow the result back down. */
3857 assign( dst1, narrowTo(ty, mkexpr(res64)) );
3859 } /* if (isShift) */
3861 else
3862 if (isRotate) {
3863 Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
3864 : (ty==Ity_I32 ? 2 : 3));
3865 Bool left = toBool(gregLO3ofRM(modrm) == 0);
3866 IRTemp rot_amt = newTemp(Ity_I8);
3867 IRTemp rot_amt64 = newTemp(Ity_I8);
3868 IRTemp oldFlags = newTemp(Ity_I64);
3869 UChar mask = toUChar(sz==8 ? 63 : 31);
3871 /* rot_amt = shift_expr & mask */
3872 /* By masking the rotate amount thusly, the IR-level Shl/Shr
3873 expressions never shift beyond the word size and thus remain
3874 well defined. */
3875 assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));
3877 if (ty == Ity_I64)
3878 assign(rot_amt, mkexpr(rot_amt64));
3879 else
3880 assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));
3882 if (left) {
3884 /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
3885 assign(dst1,
3886 binop( mkSizedOp(ty,Iop_Or8),
3887 binop( mkSizedOp(ty,Iop_Shl8),
3888 mkexpr(dst0),
3889 mkexpr(rot_amt)
3891 binop( mkSizedOp(ty,Iop_Shr8),
3892 mkexpr(dst0),
3893 binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
3897 ccOp += AMD64G_CC_OP_ROLB;
3899 } else { /* right */
3901 /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
3902 assign(dst1,
3903 binop( mkSizedOp(ty,Iop_Or8),
3904 binop( mkSizedOp(ty,Iop_Shr8),
3905 mkexpr(dst0),
3906 mkexpr(rot_amt)
3908 binop( mkSizedOp(ty,Iop_Shl8),
3909 mkexpr(dst0),
3910 binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
3914 ccOp += AMD64G_CC_OP_RORB;
3918 /* dst1 now holds the rotated value. Build flag thunk. We
3919 need the resulting value for this, and the previous flags.
3920 Except don't set it if the rotate count is zero. */
3922 assign(oldFlags, mk_amd64g_calculate_rflags_all());
3924 /* rot_amt64 :: Ity_I8. We need to convert it to I1. */
3925 IRTemp rot_amt64b = newTemp(Ity_I1);
3926 assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );
3928 /* CC_DEP1 is the rotated value. CC_NDEP is flags before. */
3929 stmt( IRStmt_Put( OFFB_CC_OP,
3930 IRExpr_ITE( mkexpr(rot_amt64b),
3931 mkU64(ccOp),
3932 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
3933 stmt( IRStmt_Put( OFFB_CC_DEP1,
3934 IRExpr_ITE( mkexpr(rot_amt64b),
3935 widenUto64(mkexpr(dst1)),
3936 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
3937 stmt( IRStmt_Put( OFFB_CC_DEP2,
3938 IRExpr_ITE( mkexpr(rot_amt64b),
3939 mkU64(0),
3940 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
3941 stmt( IRStmt_Put( OFFB_CC_NDEP,
3942 IRExpr_ITE( mkexpr(rot_amt64b),
3943 mkexpr(oldFlags),
3944 IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
3945 } /* if (isRotate) */
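   /* Editorial worked example (not from the original source): a 32-bit
      ROL of 0x80000001 by 1 under the scheme above:
         dst1 = (0x80000001 << 1) | (0x80000001 >>u 31) = 0x00000003
      and because rot_amt64 is non-zero, the ITEs just above commit the
      new CC_* values; a rotate count of zero leaves all four CC fields
      untouched. */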
3947 /* Save result, and finish up. */
3948 if (epartIsReg(modrm)) {
3949 putIRegE(sz, pfx, modrm, mkexpr(dst1));
3950 if (vex_traceflags & VEX_TRACE_FE) {
3951 vex_printf("%s%c ",
3952 nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
3953 if (shift_expr_txt)
3954 vex_printf("%s", shift_expr_txt);
3955 else
3956 ppIRExpr(shift_expr);
3957 vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
3959 } else {
3960 storeLE(mkexpr(addr), mkexpr(dst1));
3961 if (vex_traceflags & VEX_TRACE_FE) {
3962 vex_printf("%s%c ",
3963 nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
3964 if (shift_expr_txt)
3965 vex_printf("%s", shift_expr_txt);
3966 else
3967 ppIRExpr(shift_expr);
3968 vex_printf(", %s\n", dis_buf);
3971 return delta;
3975 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
3976 static
3977 ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
3978 Prefix pfx,
3979 Long delta, UChar modrm,
3980 Int am_sz, Int sz, ULong src_val,
3981 Bool* decode_OK )
3983 /* src_val denotes a d8.
3984 And delta on entry points at the modrm byte. */
3986 IRType ty = szToITy(sz);
3987 IRTemp t2 = newTemp(Ity_I64);
3988 IRTemp t2m = newTemp(Ity_I64);
3989 IRTemp t_addr = IRTemp_INVALID;
3990 HChar dis_buf[50];
3991 ULong mask;
3993 /* we're optimists :-) */
3994 *decode_OK = True;
3996 /* Check whether F2 or F3 are acceptable. */
3997 if (epartIsReg(modrm)) {
3998 /* F2 or F3 are not allowed in the register case. */
3999 if (haveF2orF3(pfx)) {
4000 *decode_OK = False;
4001 return delta;
4003 } else {
4004 /* F2 or F3 (but not both) are allowable provided LOCK is also
4005 present. */
4006 if (haveF2orF3(pfx)) {
4007 if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
4008 *decode_OK = False;
4009 return delta;
4014 /* Limit src_val -- the bit offset -- to something within a word.
4015 The Intel docs say that literal offsets larger than a word are
4016 masked in this way. */
4017 switch (sz) {
4018 case 2: src_val &= 15; break;
4019 case 4: src_val &= 31; break;
4020 case 8: src_val &= 63; break;
4021 default: *decode_OK = False; return delta;
4024 /* Invent a mask suitable for the operation. */
4025 switch (gregLO3ofRM(modrm)) {
4026 case 4: /* BT */ mask = 0; break;
4027 case 5: /* BTS */ mask = 1ULL << src_val; break;
4028 case 6: /* BTR */ mask = ~(1ULL << src_val); break;
4029 case 7: /* BTC */ mask = 1ULL << src_val; break;
4030 /* If this needs to be extended, probably simplest to make a
4031 new function to handle the other cases (0 .. 3). The
4032          Intel docs do not, however, indicate any use for 0 .. 3, so
4033 we don't expect this to happen. */
4034 default: *decode_OK = False; return delta;
4037 /* Fetch the value to be tested and modified into t2, which is
4038 64-bits wide regardless of sz. */
4039 if (epartIsReg(modrm)) {
4040 vassert(am_sz == 1);
4041 assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
4042 delta += (am_sz + 1);
4043 DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
4044 nameISize(sz),
4045 src_val, nameIRegE(sz,pfx,modrm));
4046 } else {
4047 Int len;
4048 t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
4049 delta += (len+1);
4050 assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
4051 DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
4052 nameISize(sz),
4053 src_val, dis_buf);
4056 /* Compute the new value into t2m, if non-BT. */
4057 switch (gregLO3ofRM(modrm)) {
4058 case 4: /* BT */
4059 break;
4060 case 5: /* BTS */
4061 assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
4062 break;
4063 case 6: /* BTR */
4064 assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
4065 break;
4066 case 7: /* BTC */
4067 assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
4068 break;
4069 default:
4070 /*NOTREACHED*/ /*the previous switch guards this*/
4071 vassert(0);
4074 /* Write the result back, if non-BT. */
4075 if (gregLO3ofRM(modrm) != 4 /* BT */) {
4076 if (epartIsReg(modrm)) {
4077 putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
4078 } else {
4079 if (haveLOCK(pfx)) {
4080 casLE( mkexpr(t_addr),
4081 narrowTo(ty, mkexpr(t2))/*expd*/,
4082 narrowTo(ty, mkexpr(t2m))/*new*/,
4083 guest_RIP_curr_instr );
4084 } else {
4085 storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
4090 /* Copy relevant bit from t2 into the carry flag. */
4091 /* Flags: C=selected bit, O,S,A,P undefined, Z unchanged */
4092 /* so let's also keep O,S,A,P unchanged */
4093 const ULong maskC = AMD64G_CC_MASK_C;
4094 const ULong maskOSZAP = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S
4095 | AMD64G_CC_MASK_Z | AMD64G_CC_MASK_A
4096 | AMD64G_CC_MASK_P;
4098 IRTemp old_rflags = newTemp(Ity_I64);
4099 assign(old_rflags, mk_amd64g_calculate_rflags_all());
4101 IRTemp new_rflags = newTemp(Ity_I64);
4102 assign(new_rflags,
4103 binop(Iop_Or64,
4104 binop(Iop_And64, mkexpr(old_rflags), mkU64(maskOSZAP)),
4105 binop(Iop_And64,
4106 binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
4107 mkU64(maskC)) ));
4109 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
4110 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
4111 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
4112 /* Set NDEP even though it isn't used. This makes redundant-PUT
4113 elimination of previous stores to this field work better. */
4114 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
4116 return delta;
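/* Editorial sketch (not part of this file): a plain C model of what the
   immediate-form BT/BTS/BTR/BTC decoding above implements, for a 32-bit
   operand.  The literal bit offset is masked to the operand width, the
   carry flag becomes the selected bit, and only BTS/BTR/BTC write a new
   value back.  Names are illustrative only. */
#include <stdint.h>

typedef enum { MODEL_BT, MODEL_BTS, MODEL_BTR, MODEL_BTC } BtModelOp;

static uint32_t bt_imm_model32 ( BtModelOp op, uint32_t val,
                                 unsigned imm8, int* cf )
{
   unsigned bit  = imm8 & 31;        /* literal offsets are masked to the word */
   uint32_t mask = 1u << bit;
   *cf = (val >> bit) & 1;           /* C = the tested bit                     */
   switch (op) {
      case MODEL_BT:  return val;          /* test only, no write-back */
      case MODEL_BTS: return val |  mask;  /* set the bit              */
      case MODEL_BTR: return val & ~mask;  /* clear the bit            */
      case MODEL_BTC: return val ^  mask;  /* complement the bit       */
   }
   return val;
}
/* e.g. bt_imm_model32(MODEL_BTS, 0x10, 36, &cf) masks 36 down to 4, so
   cf == 1 and the value is unchanged (bit 4 was already set). */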
4120 /* Signed/unsigned widening multiply. Generate IR to multiply the
4121 value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
4122    RDX:RAX/EDX:EAX/DX:AX/AX. */
4124 static void codegen_mulL_A_D ( Int sz, Bool syned,
4125 IRTemp tmp, const HChar* tmp_txt )
4127 IRType ty = szToITy(sz);
4128 IRTemp t1 = newTemp(ty);
4130 assign( t1, getIRegRAX(sz) );
4132 switch (ty) {
4133 case Ity_I64: {
4134 IRTemp res128 = newTemp(Ity_I128);
4135 IRTemp resHi = newTemp(Ity_I64);
4136 IRTemp resLo = newTemp(Ity_I64);
4137 IROp mulOp = syned ? Iop_MullS64 : Iop_MullU64;
4138 UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4139 setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
4140 assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4141 assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
4142 assign( resLo, unop(Iop_128to64,mkexpr(res128)));
4143 putIReg64(R_RDX, mkexpr(resHi));
4144 putIReg64(R_RAX, mkexpr(resLo));
4145 break;
4147 case Ity_I32: {
4148 IRTemp res64 = newTemp(Ity_I64);
4149 IRTemp resHi = newTemp(Ity_I32);
4150 IRTemp resLo = newTemp(Ity_I32);
4151 IROp mulOp = syned ? Iop_MullS32 : Iop_MullU32;
4152 UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4153 setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
4154 assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4155 assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
4156 assign( resLo, unop(Iop_64to32,mkexpr(res64)));
4157 putIRegRDX(4, mkexpr(resHi));
4158 putIRegRAX(4, mkexpr(resLo));
4159 break;
4161 case Ity_I16: {
4162 IRTemp res32 = newTemp(Ity_I32);
4163 IRTemp resHi = newTemp(Ity_I16);
4164 IRTemp resLo = newTemp(Ity_I16);
4165 IROp mulOp = syned ? Iop_MullS16 : Iop_MullU16;
4166 UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4167 setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
4168 assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4169 assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
4170 assign( resLo, unop(Iop_32to16,mkexpr(res32)));
4171 putIRegRDX(2, mkexpr(resHi));
4172 putIRegRAX(2, mkexpr(resLo));
4173 break;
4175 case Ity_I8: {
4176 IRTemp res16 = newTemp(Ity_I16);
4177 IRTemp resHi = newTemp(Ity_I8);
4178 IRTemp resLo = newTemp(Ity_I8);
4179 IROp mulOp = syned ? Iop_MullS8 : Iop_MullU8;
4180 UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4181 setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
4182 assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4183 assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
4184 assign( resLo, unop(Iop_16to8,mkexpr(res16)));
4185 putIRegRAX(2, mkexpr(res16));
4186 break;
4188 default:
4189 ppIRType(ty);
4190 vpanic("codegen_mulL_A_D(amd64)");
4192 DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
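/* Editorial sketch (not part of this file): the 32-bit case of the
   widening multiply above, in plain C.  EAX * src produces a 64-bit
   product; the high half goes to EDX and the low half back to EAX.
   The signed form (imul) has the same shape, except both operands are
   sign-extended to 64 bits before multiplying.  Illustrative only. */
#include <stdint.h>

static void mul32_widening_model ( uint32_t eax, uint32_t src,
                                   uint32_t* edx_out, uint32_t* eax_out )
{
   uint64_t prod = (uint64_t)eax * (uint64_t)src;  /* full 64-bit product */
   *edx_out = (uint32_t)(prod >> 32);              /* high half -> EDX    */
   *eax_out = (uint32_t)prod;                      /* low half  -> EAX    */
}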
4196 /* Group 3 extended opcodes. We have to decide here whether F2 and F3
4197 might be valid.*/
4198 static
4199 ULong dis_Grp3 ( const VexAbiInfo* vbi,
4200 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
4202 Long d64;
4203 UChar modrm;
4204 HChar dis_buf[50];
4205 Int len;
4206 IRTemp addr;
4207 IRType ty = szToITy(sz);
4208 IRTemp t1 = newTemp(ty);
4209 IRTemp dst1, src, dst0;
4210 *decode_OK = True;
4211 modrm = getUChar(delta);
4212 if (epartIsReg(modrm)) {
4213 /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
4214 if (haveF2orF3(pfx)) goto unhandled;
4215 switch (gregLO3ofRM(modrm)) {
4216 case 0: { /* TEST */
4217 delta++;
4218 d64 = getSDisp(imin(4,sz), delta);
4219 delta += imin(4,sz);
4220 dst1 = newTemp(ty);
4221 assign(dst1, binop(mkSizedOp(ty,Iop_And8),
4222 getIRegE(sz,pfx,modrm),
4223 mkU(ty, d64 & mkSizeMask(sz))));
4224 setFlags_DEP1( Iop_And8, dst1, ty );
4225 DIP("test%c $%lld, %s\n",
4226 nameISize(sz), d64,
4227 nameIRegE(sz, pfx, modrm));
4228 break;
4230 case 1:
4231 *decode_OK = False;
4232 return delta;
4233 case 2: /* NOT */
4234 delta++;
4235 putIRegE(sz, pfx, modrm,
4236 unop(mkSizedOp(ty,Iop_Not8),
4237 getIRegE(sz, pfx, modrm)));
4238 DIP("not%c %s\n", nameISize(sz),
4239 nameIRegE(sz, pfx, modrm));
4240 break;
4241 case 3: /* NEG */
4242 delta++;
4243 dst0 = newTemp(ty);
4244 src = newTemp(ty);
4245 dst1 = newTemp(ty);
4246 assign(dst0, mkU(ty,0));
4247 assign(src, getIRegE(sz, pfx, modrm));
4248 assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
4249 mkexpr(src)));
4250 setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
4251 putIRegE(sz, pfx, modrm, mkexpr(dst1));
4252 DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
4253 break;
4254 case 4: /* MUL (unsigned widening) */
4255 delta++;
4256 src = newTemp(ty);
4257 assign(src, getIRegE(sz,pfx,modrm));
4258 codegen_mulL_A_D ( sz, False, src,
4259 nameIRegE(sz,pfx,modrm) );
4260 break;
4261 case 5: /* IMUL (signed widening) */
4262 delta++;
4263 src = newTemp(ty);
4264 assign(src, getIRegE(sz,pfx,modrm));
4265 codegen_mulL_A_D ( sz, True, src,
4266 nameIRegE(sz,pfx,modrm) );
4267 break;
4268 case 6: /* DIV */
4269 delta++;
4270 assign( t1, getIRegE(sz, pfx, modrm) );
4271 codegen_div ( sz, t1, False );
4272 DIP("div%c %s\n", nameISize(sz),
4273 nameIRegE(sz, pfx, modrm));
4274 break;
4275 case 7: /* IDIV */
4276 delta++;
4277 assign( t1, getIRegE(sz, pfx, modrm) );
4278 codegen_div ( sz, t1, True );
4279 DIP("idiv%c %s\n", nameISize(sz),
4280 nameIRegE(sz, pfx, modrm));
4281 break;
4282 default:
4283 /*NOTREACHED*/
4284 vpanic("Grp3(amd64,R)");
4286 } else {
4287 /* Decide if F2/XACQ or F3/XREL might be valid. */
4288 Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4289 if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
4290 && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4291 validF2orF3 = True;
4293 if (!validF2orF3) goto unhandled;
4294 /* */
4295 addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
4296 /* we have to inform disAMode of any immediate
4297 bytes used */
4298 gregLO3ofRM(modrm)==0/*TEST*/
4299 ? imin(4,sz)
4302 t1 = newTemp(ty);
4303 delta += len;
4304 assign(t1, loadLE(ty,mkexpr(addr)));
4305 switch (gregLO3ofRM(modrm)) {
4306 case 0: { /* TEST */
4307 d64 = getSDisp(imin(4,sz), delta);
4308 delta += imin(4,sz);
4309 dst1 = newTemp(ty);
4310 assign(dst1, binop(mkSizedOp(ty,Iop_And8),
4311 mkexpr(t1),
4312 mkU(ty, d64 & mkSizeMask(sz))));
4313 setFlags_DEP1( Iop_And8, dst1, ty );
4314 DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
4315 break;
4317 case 1:
4318 *decode_OK = False;
4319 return delta;
4320 case 2: /* NOT */
4321 dst1 = newTemp(ty);
4322 assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
4323 if (haveLOCK(pfx)) {
4324 casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
4325 guest_RIP_curr_instr );
4326 } else {
4327 storeLE( mkexpr(addr), mkexpr(dst1) );
4329 DIP("not%c %s\n", nameISize(sz), dis_buf);
4330 break;
4331 case 3: /* NEG */
4332 dst0 = newTemp(ty);
4333 src = newTemp(ty);
4334 dst1 = newTemp(ty);
4335 assign(dst0, mkU(ty,0));
4336 assign(src, mkexpr(t1));
4337 assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
4338 mkexpr(src)));
4339 if (haveLOCK(pfx)) {
4340 casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
4341 guest_RIP_curr_instr );
4342 } else {
4343 storeLE( mkexpr(addr), mkexpr(dst1) );
4345 setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
4346 DIP("neg%c %s\n", nameISize(sz), dis_buf);
4347 break;
4348 case 4: /* MUL (unsigned widening) */
4349 codegen_mulL_A_D ( sz, False, t1, dis_buf );
4350 break;
4351 case 5: /* IMUL */
4352 codegen_mulL_A_D ( sz, True, t1, dis_buf );
4353 break;
4354 case 6: /* DIV */
4355 codegen_div ( sz, t1, False );
4356 DIP("div%c %s\n", nameISize(sz), dis_buf);
4357 break;
4358 case 7: /* IDIV */
4359 codegen_div ( sz, t1, True );
4360 DIP("idiv%c %s\n", nameISize(sz), dis_buf);
4361 break;
4362 default:
4363 /*NOTREACHED*/
4364 vpanic("Grp3(amd64,M)");
4367 return delta;
4368 unhandled:
4369 *decode_OK = False;
4370 return delta;
4374 /* Group 4 extended opcodes. We have to decide here whether F2 and F3
4375 might be valid. */
4376 static
4377 ULong dis_Grp4 ( const VexAbiInfo* vbi,
4378 Prefix pfx, Long delta, Bool* decode_OK )
4380 Int alen;
4381 UChar modrm;
4382 HChar dis_buf[50];
4383 IRType ty = Ity_I8;
4384 IRTemp t1 = newTemp(ty);
4385 IRTemp t2 = newTemp(ty);
4387 *decode_OK = True;
4389 modrm = getUChar(delta);
4390 if (epartIsReg(modrm)) {
4391 /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
4392 if (haveF2orF3(pfx)) goto unhandled;
4393 assign(t1, getIRegE(1, pfx, modrm));
4394 switch (gregLO3ofRM(modrm)) {
4395 case 0: /* INC */
4396 assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
4397 putIRegE(1, pfx, modrm, mkexpr(t2));
4398 setFlags_INC_DEC( True, t2, ty );
4399 break;
4400 case 1: /* DEC */
4401 assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
4402 putIRegE(1, pfx, modrm, mkexpr(t2));
4403 setFlags_INC_DEC( False, t2, ty );
4404 break;
4405 default:
4406 *decode_OK = False;
4407 return delta;
4409 delta++;
4410 DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
4411 nameIRegE(1, pfx, modrm));
4412 } else {
4413 /* Decide if F2/XACQ or F3/XREL might be valid. */
4414 Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4415 if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
4416 && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4417 validF2orF3 = True;
4419 if (!validF2orF3) goto unhandled;
4420 /* */
4421 IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
4422 assign( t1, loadLE(ty, mkexpr(addr)) );
4423 switch (gregLO3ofRM(modrm)) {
4424 case 0: /* INC */
4425 assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
4426 if (haveLOCK(pfx)) {
4427 casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
4428 guest_RIP_curr_instr );
4429 } else {
4430 storeLE( mkexpr(addr), mkexpr(t2) );
4432 setFlags_INC_DEC( True, t2, ty );
4433 break;
4434 case 1: /* DEC */
4435 assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
4436 if (haveLOCK(pfx)) {
4437 casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
4438 guest_RIP_curr_instr );
4439 } else {
4440 storeLE( mkexpr(addr), mkexpr(t2) );
4442 setFlags_INC_DEC( False, t2, ty );
4443 break;
4444 default:
4445 *decode_OK = False;
4446 return delta;
4448 delta += alen;
4449 DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
4451 return delta;
4452 unhandled:
4453 *decode_OK = False;
4454 return delta;
4458 /* Group 5 extended opcodes. We have to decide here whether F2 and F3
4459 might be valid. */
4460 static
4461 ULong dis_Grp5 ( const VexAbiInfo* vbi,
4462 Prefix pfx, Int sz, Long delta,
4463 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
4465 Int len;
4466 UChar modrm;
4467 HChar dis_buf[50];
4468 IRTemp addr = IRTemp_INVALID;
4469 IRType ty = szToITy(sz);
4470 IRTemp t1 = newTemp(ty);
4471 IRTemp t2 = IRTemp_INVALID;
4472 IRTemp t3 = IRTemp_INVALID;
4473 Bool showSz = True;
4475 *decode_OK = True;
4477 modrm = getUChar(delta);
4478 if (epartIsReg(modrm)) {
4479 /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
4480 F2/CALL and F2/JMP may have bnd prefix. */
4481 if (haveF2orF3(pfx)
4482 && ! (haveF2(pfx)
4483 && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
4484 goto unhandledR;
4485 assign(t1, getIRegE(sz,pfx,modrm));
4486 switch (gregLO3ofRM(modrm)) {
4487 case 0: /* INC */
4488 t2 = newTemp(ty);
4489 assign(t2, binop(mkSizedOp(ty,Iop_Add8),
4490 mkexpr(t1), mkU(ty,1)));
4491 setFlags_INC_DEC( True, t2, ty );
4492 putIRegE(sz,pfx,modrm, mkexpr(t2));
4493 break;
4494 case 1: /* DEC */
4495 t2 = newTemp(ty);
4496 assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
4497 mkexpr(t1), mkU(ty,1)));
4498 setFlags_INC_DEC( False, t2, ty );
4499 putIRegE(sz,pfx,modrm, mkexpr(t2));
4500 break;
4501 case 2: /* call Ev */
4502 /* Ignore any sz value and operate as if sz==8. */
4503 if (!(sz == 4 || sz == 8)) goto unhandledR;
4504 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4505 sz = 8;
4506 t3 = newTemp(Ity_I64);
4507 assign(t3, getIRegE(sz,pfx,modrm));
4508 t2 = newTemp(Ity_I64);
4509 assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
4510 putIReg64(R_RSP, mkexpr(t2));
4511 storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
4512 make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
4513 jmp_treg(dres, Ijk_Call, t3);
4514 vassert(dres->whatNext == Dis_StopHere);
4515 showSz = False;
4516 break;
4517 case 4: /* jmp Ev */
4518 /* Ignore any sz value and operate as if sz==8. */
4519 if (!(sz == 4 || sz == 8)) goto unhandledR;
4520 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4521 sz = 8;
4522 t3 = newTemp(Ity_I64);
4523 assign(t3, getIRegE(sz,pfx,modrm));
4524 jmp_treg(dres, Ijk_Boring, t3);
4525 vassert(dres->whatNext == Dis_StopHere);
4526 showSz = False;
4527 break;
4528 case 6: /* PUSH Ev */
4529 /* There is no encoding for 32-bit operand size; hence ... */
4530 if (sz == 4) sz = 8;
4531 if (sz == 8 || sz == 2) {
4532 ty = szToITy(sz); /* redo it, since sz might have changed */
4533 t3 = newTemp(ty);
4534 assign(t3, getIRegE(sz,pfx,modrm));
4535 t2 = newTemp(Ity_I64);
4536 assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
4537 putIReg64(R_RSP, mkexpr(t2) );
4538 storeLE( mkexpr(t2), mkexpr(t3) );
4539 break;
4540 } else {
4541 goto unhandledR; /* awaiting test case */
4543 default:
4544 unhandledR:
4545 *decode_OK = False;
4546 return delta;
4548 delta++;
4549 DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
4550 showSz ? nameISize(sz) : ' ',
4551 nameIRegE(sz, pfx, modrm));
4552 } else {
4553 /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
4554 Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4555 if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
4556 && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4557 validF2orF3 = True;
4558 } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
4559 && (haveF2(pfx) && !haveF3(pfx))) {
4560 validF2orF3 = True;
4562 if (!validF2orF3) goto unhandledM;
4563 /* */
4564 addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
4565 if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
4566 && gregLO3ofRM(modrm) != 6) {
4567 assign(t1, loadLE(ty,mkexpr(addr)));
4569 switch (gregLO3ofRM(modrm)) {
4570 case 0: /* INC */
4571 t2 = newTemp(ty);
4572 assign(t2, binop(mkSizedOp(ty,Iop_Add8),
4573 mkexpr(t1), mkU(ty,1)));
4574 if (haveLOCK(pfx)) {
4575 casLE( mkexpr(addr),
4576 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
4577 } else {
4578 storeLE(mkexpr(addr),mkexpr(t2));
4580 setFlags_INC_DEC( True, t2, ty );
4581 break;
4582 case 1: /* DEC */
4583 t2 = newTemp(ty);
4584 assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
4585 mkexpr(t1), mkU(ty,1)));
4586 if (haveLOCK(pfx)) {
4587 casLE( mkexpr(addr),
4588 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
4589 } else {
4590 storeLE(mkexpr(addr),mkexpr(t2));
4592 setFlags_INC_DEC( False, t2, ty );
4593 break;
4594 case 2: /* call Ev */
4595 /* Ignore any sz value and operate as if sz==8. */
4596 if (!(sz == 4 || sz == 8)) goto unhandledM;
4597 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4598 sz = 8;
4599 t3 = newTemp(Ity_I64);
4600 assign(t3, loadLE(Ity_I64,mkexpr(addr)));
4601 t2 = newTemp(Ity_I64);
4602 assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
4603 putIReg64(R_RSP, mkexpr(t2));
4604 storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
4605 make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
4606 jmp_treg(dres, Ijk_Call, t3);
4607 vassert(dres->whatNext == Dis_StopHere);
4608 showSz = False;
4609 break;
4610 case 4: /* JMP Ev */
4611 /* Ignore any sz value and operate as if sz==8. */
4612 if (!(sz == 4 || sz == 8)) goto unhandledM;
4613 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4614 sz = 8;
4615 t3 = newTemp(Ity_I64);
4616 assign(t3, loadLE(Ity_I64,mkexpr(addr)));
4617 jmp_treg(dres, Ijk_Boring, t3);
4618 vassert(dres->whatNext == Dis_StopHere);
4619 showSz = False;
4620 break;
4621 case 6: /* PUSH Ev */
4622 /* There is no encoding for 32-bit operand size; hence ... */
4623 if (sz == 4) sz = 8;
4624 if (sz == 8 || sz == 2) {
4625 ty = szToITy(sz); /* redo it, since sz might have changed */
4626 t3 = newTemp(ty);
4627 assign(t3, loadLE(ty,mkexpr(addr)));
4628 t2 = newTemp(Ity_I64);
4629 assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
4630 putIReg64(R_RSP, mkexpr(t2) );
4631 storeLE( mkexpr(t2), mkexpr(t3) );
4632 break;
4633 } else {
4634 goto unhandledM; /* awaiting test case */
4636 default:
4637 unhandledM:
4638 *decode_OK = False;
4639 return delta;
4641 delta += len;
4642 DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
4643 showSz ? nameISize(sz) : ' ',
4644 dis_buf);
4646 return delta;
4650 /*------------------------------------------------------------*/
4651 /*--- Disassembling string ops (including REP prefixes) ---*/
4652 /*------------------------------------------------------------*/
4654 /* Code shared by all the string ops */
4655 static
4656 void dis_string_op_increment ( Int sz, IRTemp t_inc )
4658 UChar logSz;
4659 if (sz == 8 || sz == 4 || sz == 2) {
4660 logSz = 1;
4661 if (sz == 4) logSz = 2;
4662 if (sz == 8) logSz = 3;
4663 assign( t_inc,
4664 binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
4665 mkU8(logSz) ) );
4666 } else {
4667 assign( t_inc,
4668 IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
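/* Editorial sketch (not part of this file): the increment computed
   above, stated directly.  The guest DFLAG field holds +1 or -1, so
   shifting it left by log2(sz) yields +sz or -sz, the per-element step
   applied to RSI/RDI.  The multiply below is the arithmetic equivalent
   of that shift; the name is illustrative only. */
#include <stdint.h>

static int64_t string_increment_model ( int64_t dflag /* +1 or -1 */, int sz )
{
   /* sz is 1, 2, 4 or 8; DFLAG is +1 (DF clear) or -1 (DF set). */
   return dflag * (int64_t)sz;
}
/* e.g. with DF set and sz == 4, the step is -4: RSI/RDI walk downwards. */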
4672 static
4673 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
4674 Int sz, const HChar* name, Prefix pfx )
4676 IRTemp t_inc = newTemp(Ity_I64);
4677 /* Really we ought to inspect the override prefixes, but we don't.
4678       The following assertion catches any resulting silliness. */
4679 vassert(pfx == clearSegBits(pfx));
4680 dis_string_op_increment(sz, t_inc);
4681 dis_OP( sz, t_inc, pfx );
4682 DIP("%s%c\n", name, nameISize(sz));
4685 static
4686 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
4688 IRType ty = szToITy(sz);
4689 IRTemp td = newTemp(Ity_I64); /* RDI */
4690 IRTemp ts = newTemp(Ity_I64); /* RSI */
4691 IRExpr *incd, *incs;
4693 if (haveASO(pfx)) {
4694 assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4695 assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4696 } else {
4697 assign( td, getIReg64(R_RDI) );
4698 assign( ts, getIReg64(R_RSI) );
4701 storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
4703 incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4704 incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4705 if (haveASO(pfx)) {
4706 incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4707 incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4709 putIReg64( R_RDI, incd );
4710 putIReg64( R_RSI, incs );
4713 static
4714 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
4716 IRType ty = szToITy(sz);
4717 IRTemp ts = newTemp(Ity_I64); /* RSI */
4718 IRExpr *incs;
4720 if (haveASO(pfx))
4721 assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4722 else
4723 assign( ts, getIReg64(R_RSI) );
4725 putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
4727 incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4728 if (haveASO(pfx))
4729 incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4730 putIReg64( R_RSI, incs );
4733 static
4734 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
4736 IRType ty = szToITy(sz);
4737 IRTemp ta = newTemp(ty); /* rAX */
4738 IRTemp td = newTemp(Ity_I64); /* RDI */
4739 IRExpr *incd;
4741 assign( ta, getIRegRAX(sz) );
4743 if (haveASO(pfx))
4744 assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4745 else
4746 assign( td, getIReg64(R_RDI) );
4748 storeLE( mkexpr(td), mkexpr(ta) );
4750 incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4751 if (haveASO(pfx))
4752 incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4753 putIReg64( R_RDI, incd );
4756 static
4757 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
4759 IRType ty = szToITy(sz);
4760 IRTemp tdv = newTemp(ty); /* (RDI) */
4761 IRTemp tsv = newTemp(ty); /* (RSI) */
4762 IRTemp td = newTemp(Ity_I64); /* RDI */
4763 IRTemp ts = newTemp(Ity_I64); /* RSI */
4764 IRExpr *incd, *incs;
4766 if (haveASO(pfx)) {
4767 assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4768 assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4769 } else {
4770 assign( td, getIReg64(R_RDI) );
4771 assign( ts, getIReg64(R_RSI) );
4774 assign( tdv, loadLE(ty,mkexpr(td)) );
4776 assign( tsv, loadLE(ty,mkexpr(ts)) );
4778 setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
4780 incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4781 incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4782 if (haveASO(pfx)) {
4783 incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4784 incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4786 putIReg64( R_RDI, incd );
4787 putIReg64( R_RSI, incs );
4790 static
4791 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
4793 IRType ty = szToITy(sz);
4794 IRTemp ta = newTemp(ty); /* rAX */
4795 IRTemp td = newTemp(Ity_I64); /* RDI */
4796 IRTemp tdv = newTemp(ty); /* (RDI) */
4797 IRExpr *incd;
4799 assign( ta, getIRegRAX(sz) );
4801 if (haveASO(pfx))
4802 assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4803 else
4804 assign( td, getIReg64(R_RDI) );
4806 assign( tdv, loadLE(ty,mkexpr(td)) );
4808 setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
4810 incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4811 if (haveASO(pfx))
4812 incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4813 putIReg64( R_RDI, incd );
4817 /* Wrap the appropriate string op inside a REP/REPE/REPNE. We assume
4818 the insn is the last one in the basic block, and so emit a jump to
4819 the next insn, rather than just falling through. */
4820 static
4821 void dis_REP_op ( /*MOD*/DisResult* dres,
4822 AMD64Condcode cond,
4823 void (*dis_OP)(Int, IRTemp, Prefix),
4824 Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
4825 Prefix pfx )
4827 IRTemp t_inc = newTemp(Ity_I64);
4828 IRTemp tc;
4829 IRExpr* cmp;
4831 /* Really we ought to inspect the override prefixes, but we don't.
4832       The following assertion catches any resulting silliness. */
4833 vassert(pfx == clearSegBits(pfx));
4835 if (haveASO(pfx)) {
4836 tc = newTemp(Ity_I32); /* ECX */
4837 assign( tc, getIReg32(R_RCX) );
4838 cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
4839 } else {
4840 tc = newTemp(Ity_I64); /* RCX */
4841 assign( tc, getIReg64(R_RCX) );
4842 cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
4845 stmt( IRStmt_Exit( cmp, Ijk_Boring,
4846 IRConst_U64(rip_next), OFFB_RIP ) );
4848 if (haveASO(pfx))
4849 putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
4850 else
4851 putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
4853 dis_string_op_increment(sz, t_inc);
4854 dis_OP (sz, t_inc, pfx);
4856 if (cond == AMD64CondAlways) {
4857 jmp_lit(dres, Ijk_Boring, rip);
4858 vassert(dres->whatNext == Dis_StopHere);
4859 } else {
4860 stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
4861 Ijk_Boring,
4862 IRConst_U64(rip),
4863 OFFB_RIP ) );
4864 jmp_lit(dres, Ijk_Boring, rip_next);
4865 vassert(dres->whatNext == Dis_StopHere);
4867 DIP("%s%c\n", name, nameISize(sz));
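/* Editorial sketch (not part of this file): the guest-visible behaviour
   the REP wrapper above produces for "rep movsb" with DF clear, written
   as a C loop.  The IR emits a single iteration guarded by exits and
   re-enters the same instruction, but the net effect is this loop; REPE
   and REPNE additionally test ZF after each iteration.  Names are
   illustrative only. */
#include <stdint.h>

static void rep_movsb_model ( uint8_t* rdi, const uint8_t* rsi, uint64_t rcx )
{
   while (rcx != 0) {       /* exit immediately if the count is already 0  */
      rcx--;                /* the count is decremented every iteration    */
      *rdi++ = *rsi++;      /* one MOVSB step (DF clear, pointers move up) */
   }
}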
4871 /*------------------------------------------------------------*/
4872 /*--- Arithmetic, etc. ---*/
4873 /*------------------------------------------------------------*/
4875 /* IMUL E, G. Supplied rip points to the modR/M byte. */
4876 static
4877 ULong dis_mul_E_G ( const VexAbiInfo* vbi,
4878 Prefix pfx,
4879 Int size,
4880 Long delta0 )
4882 Int alen;
4883 HChar dis_buf[50];
4884 UChar rm = getUChar(delta0);
4885 IRType ty = szToITy(size);
4886 IRTemp te = newTemp(ty);
4887 IRTemp tg = newTemp(ty);
4888 IRTemp resLo = newTemp(ty);
4890 assign( tg, getIRegG(size, pfx, rm) );
4891 if (epartIsReg(rm)) {
4892 assign( te, getIRegE(size, pfx, rm) );
4893 } else {
4894 IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
4895 assign( te, loadLE(ty,mkexpr(addr)) );
4898 setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
4900 assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
4902 putIRegG(size, pfx, rm, mkexpr(resLo) );
4904 if (epartIsReg(rm)) {
4905 DIP("imul%c %s, %s\n", nameISize(size),
4906 nameIRegE(size,pfx,rm),
4907 nameIRegG(size,pfx,rm));
4908 return 1+delta0;
4909 } else {
4910 DIP("imul%c %s, %s\n", nameISize(size),
4911 dis_buf,
4912 nameIRegG(size,pfx,rm));
4913 return alen+delta0;
4918 /* IMUL I * E -> G. Supplied rip points to the modR/M byte. */
4919 static
4920 ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
4921 Prefix pfx,
4922 Int size,
4923 Long delta,
4924 Int litsize )
4926 Long d64;
4927 Int alen;
4928 HChar dis_buf[50];
4929 UChar rm = getUChar(delta);
4930 IRType ty = szToITy(size);
4931 IRTemp te = newTemp(ty);
4932 IRTemp tl = newTemp(ty);
4933 IRTemp resLo = newTemp(ty);
4935 vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
4937 if (epartIsReg(rm)) {
4938 assign(te, getIRegE(size, pfx, rm));
4939 delta++;
4940 } else {
4941 IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
4942 imin(4,litsize) );
4943 assign(te, loadLE(ty, mkexpr(addr)));
4944 delta += alen;
4946 d64 = getSDisp(imin(4,litsize),delta);
4947 delta += imin(4,litsize);
4949 d64 &= mkSizeMask(size);
4950 assign(tl, mkU(ty,d64));
4952 assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
4954 setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
4956 putIRegG(size, pfx, rm, mkexpr(resLo));
4958 DIP("imul%c $%lld, %s, %s\n",
4959 nameISize(size), d64,
4960 ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
4961 nameIRegG(size,pfx,rm) );
4962 return delta;
4966 /* Generate an IR sequence to do a popcount operation on the supplied
4967 IRTemp, and return a new IRTemp holding the result. 'ty' may be
4968 Ity_I16, Ity_I32 or Ity_I64 only. */
4969 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
4971 Int i;
4972 if (ty == Ity_I16) {
4973 IRTemp old = IRTemp_INVALID;
4974 IRTemp nyu = IRTemp_INVALID;
4975 IRTemp mask[4], shift[4];
4976 for (i = 0; i < 4; i++) {
4977 mask[i] = newTemp(ty);
4978 shift[i] = 1 << i;
4980 assign(mask[0], mkU16(0x5555));
4981 assign(mask[1], mkU16(0x3333));
4982 assign(mask[2], mkU16(0x0F0F));
4983 assign(mask[3], mkU16(0x00FF));
4984 old = src;
4985 for (i = 0; i < 4; i++) {
4986 nyu = newTemp(ty);
4987 assign(nyu,
4988 binop(Iop_Add16,
4989 binop(Iop_And16,
4990 mkexpr(old),
4991 mkexpr(mask[i])),
4992 binop(Iop_And16,
4993 binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
4994 mkexpr(mask[i]))));
4995 old = nyu;
4997 return nyu;
4999 if (ty == Ity_I32) {
5000 IRTemp old = IRTemp_INVALID;
5001 IRTemp nyu = IRTemp_INVALID;
5002 IRTemp mask[5], shift[5];
5003 for (i = 0; i < 5; i++) {
5004 mask[i] = newTemp(ty);
5005 shift[i] = 1 << i;
5007 assign(mask[0], mkU32(0x55555555));
5008 assign(mask[1], mkU32(0x33333333));
5009 assign(mask[2], mkU32(0x0F0F0F0F));
5010 assign(mask[3], mkU32(0x00FF00FF));
5011 assign(mask[4], mkU32(0x0000FFFF));
5012 old = src;
5013 for (i = 0; i < 5; i++) {
5014 nyu = newTemp(ty);
5015 assign(nyu,
5016 binop(Iop_Add32,
5017 binop(Iop_And32,
5018 mkexpr(old),
5019 mkexpr(mask[i])),
5020 binop(Iop_And32,
5021 binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
5022 mkexpr(mask[i]))));
5023 old = nyu;
5025 return nyu;
5027 if (ty == Ity_I64) {
5028 IRTemp old = IRTemp_INVALID;
5029 IRTemp nyu = IRTemp_INVALID;
5030 IRTemp mask[6], shift[6];
5031 for (i = 0; i < 6; i++) {
5032 mask[i] = newTemp(ty);
5033 shift[i] = 1 << i;
5035 assign(mask[0], mkU64(0x5555555555555555ULL));
5036 assign(mask[1], mkU64(0x3333333333333333ULL));
5037 assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
5038 assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
5039 assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
5040 assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
5041 old = src;
5042 for (i = 0; i < 6; i++) {
5043 nyu = newTemp(ty);
5044 assign(nyu,
5045 binop(Iop_Add64,
5046 binop(Iop_And64,
5047 mkexpr(old),
5048 mkexpr(mask[i])),
5049 binop(Iop_And64,
5050 binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
5051 mkexpr(mask[i]))));
5052 old = nyu;
5054 return nyu;
5056 /*NOTREACHED*/
5057 vassert(0);
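/* Editorial sketch (not part of this file): the same divide-and-conquer
   popcount as generated above, written directly in C for the 32-bit
   case.  Each round adds together pairs of 1-, 2-, 4-, 8- and then
   16-bit partial counts, using the same masks as the IR.  Illustrative
   only. */
#include <stdint.h>

static uint32_t popcount32_model ( uint32_t x )
{
   x = (x & 0x55555555u) + ((x >> 1)  & 0x55555555u); /* 16 x 2-bit counts  */
   x = (x & 0x33333333u) + ((x >> 2)  & 0x33333333u); /*  8 x 4-bit counts  */
   x = (x & 0x0F0F0F0Fu) + ((x >> 4)  & 0x0F0F0F0Fu); /*  4 x 8-bit counts  */
   x = (x & 0x00FF00FFu) + ((x >> 8)  & 0x00FF00FFu); /*  2 x 16-bit counts */
   x = (x & 0x0000FFFFu) + ((x >> 16) & 0x0000FFFFu); /* final 32-bit count */
   return x;
}
/* e.g. popcount32_model(0xF0F01234u) == 13 */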
5061 /* Generate an IR sequence to do a count-leading-zeroes operation on
5062 the supplied IRTemp, and return a new IRTemp holding the result.
5063 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where
5064 the argument is zero, return the number of bits in the word (the
5065 natural semantics). */
5066 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
5068 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
5070 IRTemp src64 = newTemp(Ity_I64);
5071 assign(src64, widenUto64( mkexpr(src) ));
5073 IRTemp src64x = newTemp(Ity_I64);
5074 assign(src64x,
5075 binop(Iop_Shl64, mkexpr(src64),
5076 mkU8(64 - 8 * sizeofIRType(ty))));
5078 // Clz64 has undefined semantics when its input is zero, so
5079 // special-case around that.
5080 IRTemp res64 = newTemp(Ity_I64);
5081 assign(res64,
5082 IRExpr_ITE(
5083 binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
5084 mkU64(8 * sizeofIRType(ty)),
5085 unop(Iop_Clz64, mkexpr(src64x))
5088 IRTemp res = newTemp(ty);
5089 assign(res, narrowTo(ty, mkexpr(res64)));
5090 return res;
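/* Editorial sketch (not part of this file): the LZCNT recipe above in
   plain C for a 16-bit operand: shift the value into the top of a
   64-bit word, count leading zeroes there, and special-case zero so the
   answer is the operand width (the "natural" LZCNT result).
   __builtin_clzll is assumed to be available (GCC/Clang); it is
   undefined for zero input, hence the explicit check, mirroring the ITE
   in the IR above. */
#include <stdint.h>

static unsigned lzcnt16_model ( uint16_t src )
{
   if (src == 0) return 16;                       /* zero input -> width  */
   uint64_t shifted = (uint64_t)src << (64 - 16); /* park in the top bits */
   return (unsigned)__builtin_clzll(shifted);     /* count leading zeroes */
}
/* e.g. lzcnt16_model(0x0001) == 15, lzcnt16_model(0x8000) == 0 */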
5094 /* Generate an IR sequence to do a count-trailing-zeroes operation on
5095 the supplied IRTemp, and return a new IRTemp holding the result.
5096 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where
5097 the argument is zero, return the number of bits in the word (the
5098 natural semantics). */
5099 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
5101 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
5103 IRTemp src64 = newTemp(Ity_I64);
5104 assign(src64, widenUto64( mkexpr(src) ));
5106 // Ctz64 has undefined semantics when its input is zero, so
5107 // special-case around that.
5108 IRTemp res64 = newTemp(Ity_I64);
5109 assign(res64,
5110 IRExpr_ITE(
5111 binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
5112 mkU64(8 * sizeofIRType(ty)),
5113 unop(Iop_Ctz64, mkexpr(src64))
5116 IRTemp res = newTemp(ty);
5117 assign(res, narrowTo(ty, mkexpr(res64)));
5118 return res;
5122 /*------------------------------------------------------------*/
5123 /*--- ---*/
5124 /*--- x87 FLOATING POINT INSTRUCTIONS ---*/
5125 /*--- ---*/
5126 /*------------------------------------------------------------*/
5128 /* --- Helper functions for dealing with the register stack. --- */
5130 /* --- Set the emulation-warning pseudo-register. --- */
5132 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
5134 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5135 stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
5138 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
5140 static IRExpr* mkQNaN64 ( void )
5142 /* QNaN is 0 2047 1 0(51times)
5143 == 0b 11111111111b 1 0(51times)
5144      == 0x7FF8 0000 0000 0000 */
5146 return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
5149 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
5151 static IRExpr* get_ftop ( void )
5153 return IRExpr_Get( OFFB_FTOP, Ity_I32 );
5156 static void put_ftop ( IRExpr* e )
5158 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5159 stmt( IRStmt_Put( OFFB_FTOP, e ) );
5162 /* --------- Get/put the C3210 bits. --------- */
5164 static IRExpr* /* :: Ity_I64 */ get_C3210 ( void )
5166 return IRExpr_Get( OFFB_FC3210, Ity_I64 );
5169 static void put_C3210 ( IRExpr* e /* :: Ity_I64 */ )
5171 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
5172 stmt( IRStmt_Put( OFFB_FC3210, e ) );
5175 /* --------- Get/put the FPU rounding mode. --------- */
5176 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
5178 return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
5181 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
5183 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5184 stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
5188 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
5189 /* Produces a value in 0 .. 3, which is encoded as per the type
5190 IRRoundingMode. Since the guest_FPROUND value is also encoded as
5191 per IRRoundingMode, we merely need to get it and mask it for
5192      safety. */
5194 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
5196 return binop( Iop_And32, get_fpround(), mkU32(3) );
5199 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
5201 return mkU32(Irrm_NEAREST);
5205 /* --------- Get/set FP register tag bytes. --------- */
5207 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
5209 static void put_ST_TAG ( Int i, IRExpr* value )
5211 IRRegArray* descr;
5212 vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
5213 descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5214 stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
5217 /* Given i, generate an expression yielding 'ST_TAG(i)'. This will be
5218 zero to indicate "Empty" and nonzero to indicate "NonEmpty". */
5220 static IRExpr* get_ST_TAG ( Int i )
5222 IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5223 return IRExpr_GetI( descr, get_ftop(), i );
5227 /* --------- Get/set FP registers. --------- */
5229 /* Given i, and some expression e, emit 'ST(i) = e' and set the
5230 register's tag to indicate the register is full. The previous
5231 state of the register is not checked. */
5233 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
5235 IRRegArray* descr;
5236 vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
5237 descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
5238 stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
5239 /* Mark the register as in-use. */
5240 put_ST_TAG(i, mkU8(1));
5243 /* Given i, and some expression e, emit
5244 ST(i) = is_full(i) ? NaN : e
5245      and set the tag accordingly. */
5248 static void put_ST ( Int i, IRExpr* value )
5250 put_ST_UNCHECKED(
5252 IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
5253 /* non-0 means full */
5254 mkQNaN64(),
5255 /* 0 means empty */
5256 value
5262 /* Given i, generate an expression yielding 'ST(i)'. */
5264 static IRExpr* get_ST_UNCHECKED ( Int i )
5266 IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
5267 return IRExpr_GetI( descr, get_ftop(), i );
5271 /* Given i, generate an expression yielding
5272      is_full(i) ? ST(i) : NaN */
5275 static IRExpr* get_ST ( Int i )
5277 return
5278 IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
5279 /* non-0 means full */
5280 get_ST_UNCHECKED(i),
5281 /* 0 means empty */
5282 mkQNaN64());
5286 /* Given i, and some expression e, and a condition cond, generate IR
5287 which has the same effect as put_ST(i,e) when cond is true and has
5288 no effect when cond is false. Given the lack of proper
5289      if-then-else in the IR, this is pretty tricky. */
5292 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
5294 // new_tag = if cond then FULL else old_tag
5295 // new_val = if cond then (if old_tag==FULL then NaN else val)
5296 // else old_val
5298 IRTemp old_tag = newTemp(Ity_I8);
5299 assign(old_tag, get_ST_TAG(i));
5300 IRTemp new_tag = newTemp(Ity_I8);
5301 assign(new_tag,
5302 IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
5304 IRTemp old_val = newTemp(Ity_F64);
5305 assign(old_val, get_ST_UNCHECKED(i));
5306 IRTemp new_val = newTemp(Ity_F64);
5307 assign(new_val,
5308 IRExpr_ITE(mkexpr(cond),
5309 IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
5310 /* non-0 means full */
5311 mkQNaN64(),
5312 /* 0 means empty */
5313 value),
5314 mkexpr(old_val)));
5316 put_ST_UNCHECKED(i, mkexpr(new_val));
5317 // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL. So
5318 // now set it to new_tag instead.
5319 put_ST_TAG(i, mkexpr(new_tag));
5322 /* Adjust FTOP downwards by one register. */
5324 static void fp_push ( void )
5326 put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
5329 /* Adjust FTOP downwards by one register when COND is 1:I1. Else
5330 don't change it. */
5332 static void maybe_fp_push ( IRTemp cond )
5334 put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
5337 /* Adjust FTOP upwards by one register, and mark the vacated register
5338 as empty. */
5340 static void fp_pop ( void )
5342 put_ST_TAG(0, mkU8(0));
5343 put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
5346 /* Set the C2 bit of the FPU status register to e[0]. Assumes that
5347      e[31:1] == 0. */
5349 static void set_C2 ( IRExpr* e )
5351 IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
5352 put_C3210( binop(Iop_Or64,
5353 cleared,
5354 binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
5357 /* Generate code to check that abs(d64) < 2^63 and is finite. This is
5358 used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN. The
5359 test is simple, but the derivation of it is not so simple.
5361 The exponent field for an IEEE754 double is 11 bits. That means it
5362 can take values 0 through 0x7FF. If the exponent has value 0x7FF,
5363 the number is either a NaN or an Infinity and so is not finite.
5364 Furthermore, a finite value of exactly 2^63 is the smallest value
5365 that has exponent value 0x43E. Hence, what we need to do is
5366 extract the exponent, ignoring the sign bit and mantissa, and check
5367 it is < 0x43E, or <= 0x43D.
5369 To make this easily applicable to 32- and 64-bit targets, a
5370 roundabout approach is used. First the number is converted to I64,
5371 then the top 32 bits are taken. Shifting them right by 20 bits
5372 places the sign bit and exponent in the bottom 12 bits. Anding
5373 with 0x7FF gets rid of the sign bit, leaving just the exponent
5374      available for comparison. */
5376 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
5378 IRTemp i64 = newTemp(Ity_I64);
5379 assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
5380 IRTemp exponent = newTemp(Ity_I32);
5381 assign(exponent,
5382 binop(Iop_And32,
5383 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
5384 mkU32(0x7FF)));
5385 IRTemp in_range_and_finite = newTemp(Ity_I1);
5386 assign(in_range_and_finite,
5387 binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
5388 return in_range_and_finite;
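/* Editorial sketch (not part of this file): the range/finiteness test
   above, done directly in C.  Reinterpret the double as an integer,
   pull out the 11-bit exponent, and accept only exponents <= 0x43D,
   which covers exactly the finite values with |d| < 2^63.  Names are
   illustrative only. */
#include <stdint.h>
#include <string.h>

static int trig_arg_ok_model ( double d )
{
   uint64_t bits;
   memcpy(&bits, &d, sizeof bits);                       /* ReinterpF64asI64    */
   uint32_t exponent = (uint32_t)((bits >> 52) & 0x7FF); /* drop sign, mantissa */
   return exponent <= 0x43D;                             /* finite and < 2^63   */
}
/* e.g. trig_arg_ok_model(1.0) == 1, trig_arg_ok_model(1e30) == 0 */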
5391 /* Invent a plausible-looking FPU status word value:
5392       ((ftop & 7) << 11) | (c3210 & 0x4700) */
5394 static IRExpr* get_FPU_sw ( void )
5396 return
5397 unop(Iop_32to16,
5398 binop(Iop_Or32,
5399 binop(Iop_Shl32,
5400 binop(Iop_And32, get_ftop(), mkU32(7)),
5401 mkU8(11)),
5402 binop(Iop_And32, unop(Iop_64to32, get_C3210()),
5403 mkU32(0x4700))
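/* Editorial sketch (not part of this file): the synthesised FPU status
   word above, as one C expression.  The 3-bit top-of-stack index goes
   in bits 13..11, and C3,C2,C1,C0 keep their usual positions (the
   0x4700 mask).  Illustrative only. */
#include <stdint.h>

static uint16_t fpu_sw_model ( uint32_t ftop, uint32_t c3210 )
{
   return (uint16_t)( ((ftop & 7u) << 11) | (c3210 & 0x4700u) );
}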
5408 /* Generate a dirty helper call that initialises the x87 state a la
5409 FINIT. If |guard| is NULL, it is done unconditionally. Otherwise
5410    |guard| is used as a guarding condition. */
5412 static void gen_FINIT_SEQUENCE ( IRExpr* guard )
5414 /* Uses dirty helper:
5415          void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
5416 IRDirty* d = unsafeIRDirty_0_N (
5417 0/*regparms*/,
5418 "amd64g_dirtyhelper_FINIT",
5419 &amd64g_dirtyhelper_FINIT,
5420 mkIRExprVec_1( IRExpr_GSPTR() )
5423 /* declare we're writing guest state */
5424 d->nFxState = 5;
5425 vex_bzero(&d->fxState, sizeof(d->fxState));
5427 d->fxState[0].fx = Ifx_Write;
5428 d->fxState[0].offset = OFFB_FTOP;
5429 d->fxState[0].size = sizeof(UInt);
5431 d->fxState[1].fx = Ifx_Write;
5432 d->fxState[1].offset = OFFB_FPREGS;
5433 d->fxState[1].size = 8 * sizeof(ULong);
5435 d->fxState[2].fx = Ifx_Write;
5436 d->fxState[2].offset = OFFB_FPTAGS;
5437 d->fxState[2].size = 8 * sizeof(UChar);
5439 d->fxState[3].fx = Ifx_Write;
5440 d->fxState[3].offset = OFFB_FPROUND;
5441 d->fxState[3].size = sizeof(ULong);
5443 d->fxState[4].fx = Ifx_Write;
5444 d->fxState[4].offset = OFFB_FC3210;
5445 d->fxState[4].size = sizeof(ULong);
5447 if (guard)
5448 d->guard = guard;
5450 stmt( IRStmt_Dirty(d) );
5454 /* ------------------------------------------------------- */
5455 /* Given all that stack-mangling junk, we can now go ahead
5456    and describe FP instructions. */
5459 /* ST(0) = ST(0) `op` mem64/32(addr)
5460    Need to check ST(0)'s tag on read, but not on write. */
5462 static
5463 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
5464 IROp op, Bool dbl )
5466 DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
5467 if (dbl) {
5468 put_ST_UNCHECKED(0,
5469 triop( op,
5470 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5471 get_ST(0),
5472 loadLE(Ity_F64,mkexpr(addr))
5474 } else {
5475 put_ST_UNCHECKED(0,
5476 triop( op,
5477 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5478 get_ST(0),
5479 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
5485 /* ST(0) = mem64/32(addr) `op` ST(0)
5486    Need to check ST(0)'s tag on read, but not on write. */
5488 static
5489 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
5490 IROp op, Bool dbl )
5492 DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
5493 if (dbl) {
5494 put_ST_UNCHECKED(0,
5495 triop( op,
5496 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5497 loadLE(Ity_F64,mkexpr(addr)),
5498 get_ST(0)
5500 } else {
5501 put_ST_UNCHECKED(0,
5502 triop( op,
5503 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5504 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
5505 get_ST(0)
5511 /* ST(dst) = ST(dst) `op` ST(src).
5512    Check dst and src tags when reading but not on write. */
5514 static
5515 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
5516 Bool pop_after )
5518 DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
5519 put_ST_UNCHECKED(
5520 st_dst,
5521 triop( op,
5522 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5523 get_ST(st_dst),
5524 get_ST(st_src) )
5526 if (pop_after)
5527 fp_pop();
5530 /* ST(dst) = ST(src) `op` ST(dst).
5531    Check dst and src tags when reading but not on write. */
5533 static
5534 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
5535 Bool pop_after )
5537 DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
5538 put_ST_UNCHECKED(
5539 st_dst,
5540 triop( op,
5541 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5542 get_ST(st_src),
5543 get_ST(st_dst) )
5545 if (pop_after)
5546 fp_pop();
5549 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
5550 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
5552 DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
5553 /* This is a bit of a hack (and isn't really right). It sets
5554 Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
5555      documentation implies A and S are unchanged. */
5557 /* It's also fishy in that it is used both for COMIP and
5558 UCOMIP, and they aren't the same (although similar). */
5559 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
5560 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
5561 stmt( IRStmt_Put(
5562 OFFB_CC_DEP1,
5563 binop( Iop_And64,
5564 unop( Iop_32Uto64,
5565 binop(Iop_CmpF64, get_ST(0), get_ST(i))),
5566 mkU64(0x45)
5567 )));
5568 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
5569 if (pop_after)
5570 fp_pop();
5574 /* returns
5575    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 ) */
5577 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
5579 IRTemp t32 = newTemp(Ity_I32);
5580 assign( t32, e32 );
5581 return
5582 IRExpr_ITE(
5583 binop(Iop_CmpLT64U,
5584 unop(Iop_32Uto64,
5585 binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
5586 mkU64(65536)),
5587 unop(Iop_32to16, mkexpr(t32)),
5588 mkU16( 0x8000 ) );
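/* Editorial sketch (not part of this file): the saturating 32->16
   narrow above as a C model.  Adding 32768 and doing one unsigned
   compare against 65536 is a branch-free way of asking "is the value in
   -32768..32767?"; out-of-range values collapse to 0x8000, the x87
   indefinite-integer result.  Name is illustrative only. */
#include <stdint.h>

static uint16_t qnarrow_32_to_16_model ( int32_t e32 )
{
   /* (uint32)(e32 + 32768) < 65536  <=>  -32768 <= e32 <= 32767 */
   uint32_t biased = (uint32_t)e32 + 32768u;
   if (biased < 65536u)
      return (uint16_t)e32;       /* in range: plain truncation       */
   return 0x8000;                 /* out of range: indefinite integer */
}
/* e.g. qnarrow_32_to_16_model(40000) == 0x8000,
        qnarrow_32_to_16_model(-5)    == 0xFFFB */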
5592 static
5593 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
5594 const VexAbiInfo* vbi, Prefix pfx, Long delta )
5596 Int len;
5597 UInt r_src, r_dst;
5598 HChar dis_buf[50];
5599 IRTemp t1, t2;
5601 /* On entry, delta points at the second byte of the insn (the modrm
5602 byte).*/
5603 UChar first_opcode = getUChar(delta-1);
5604 UChar modrm = getUChar(delta+0);
5606 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
5608 if (first_opcode == 0xD8) {
5609 if (modrm < 0xC0) {
5611 /* bits 5,4,3 are an opcode extension, and the modRM also
5612 specifies an address. */
5613 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5614 delta += len;
5616 switch (gregLO3ofRM(modrm)) {
5618 case 0: /* FADD single-real */
5619 fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
5620 break;
5622 case 1: /* FMUL single-real */
5623 fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
5624 break;
5626 case 2: /* FCOM single-real */
5627 DIP("fcoms %s\n", dis_buf);
5628 /* This forces C1 to zero, which isn't right. */
5629 /* The AMD documentation suggests that forcing C1 to
5630 zero is correct (Eliot Moss) */
5631 put_C3210(
5632 unop( Iop_32Uto64,
5633 binop( Iop_And32,
5634 binop(Iop_Shl32,
5635 binop(Iop_CmpF64,
5636 get_ST(0),
5637 unop(Iop_F32toF64,
5638 loadLE(Ity_F32,mkexpr(addr)))),
5639 mkU8(8)),
5640 mkU32(0x4500)
5641 )));
5642 break;
5644 case 3: /* FCOMP single-real */
5645 /* The AMD documentation suggests that forcing C1 to
5646 zero is correct (Eliot Moss) */
5647 DIP("fcomps %s\n", dis_buf);
5648 /* This forces C1 to zero, which isn't right. */
5649 put_C3210(
5650 unop( Iop_32Uto64,
5651 binop( Iop_And32,
5652 binop(Iop_Shl32,
5653 binop(Iop_CmpF64,
5654 get_ST(0),
5655 unop(Iop_F32toF64,
5656 loadLE(Ity_F32,mkexpr(addr)))),
5657 mkU8(8)),
5658 mkU32(0x4500)
5659 )));
5660 fp_pop();
5661 break;
5663 case 4: /* FSUB single-real */
5664 fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
5665 break;
5667 case 5: /* FSUBR single-real */
5668 fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
5669 break;
5671 case 6: /* FDIV single-real */
5672 fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
5673 break;
5675 case 7: /* FDIVR single-real */
5676 fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
5677 break;
5679 default:
5680 vex_printf("unhandled opc_aux = 0x%2x\n",
5681 (UInt)gregLO3ofRM(modrm));
5682 vex_printf("first_opcode == 0xD8\n");
5683 goto decode_fail;
5685 } else {
5686 delta++;
5687 switch (modrm) {
5689 case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
5690 fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
5691 break;
5693 case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
5694 fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
5695 break;
5697 /* Dunno if this is right */
5698 case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
5699 r_dst = (UInt)modrm - 0xD0;
5700 DIP("fcom %%st(0),%%st(%u)\n", r_dst);
5701 /* This forces C1 to zero, which isn't right. */
5702 put_C3210(
5703 unop(Iop_32Uto64,
5704 binop( Iop_And32,
5705 binop(Iop_Shl32,
5706 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5707 mkU8(8)),
5708 mkU32(0x4500)
5709 )));
5710 break;
5712 /* Dunno if this is right */
5713 case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
5714 r_dst = (UInt)modrm - 0xD8;
5715 DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
5716 /* This forces C1 to zero, which isn't right. */
5717 put_C3210(
5718 unop(Iop_32Uto64,
5719 binop( Iop_And32,
5720 binop(Iop_Shl32,
5721 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5722 mkU8(8)),
5723 mkU32(0x4500)
5724 )));
5725 fp_pop();
5726 break;
5728 case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
5729 fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
5730 break;
5732 case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
5733 fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
5734 break;
5736 case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
5737 fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
5738 break;
5740 case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
5741 fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
5742 break;
5744 default:
5745 goto decode_fail;
5750 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
5751 else
5752 if (first_opcode == 0xD9) {
5753 if (modrm < 0xC0) {
5755 /* bits 5,4,3 are an opcode extension, and the modRM also
5756 specifies an address. */
5757 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5758 delta += len;
5760 switch (gregLO3ofRM(modrm)) {
5762 case 0: /* FLD single-real */
5763 DIP("flds %s\n", dis_buf);
5764 fp_push();
5765 put_ST(0, unop(Iop_F32toF64,
5766 loadLE(Ity_F32, mkexpr(addr))));
5767 break;
5769 case 2: /* FST single-real */
5770 DIP("fsts %s\n", dis_buf);
5771 storeLE(mkexpr(addr),
5772 binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5773 break;
5775 case 3: /* FSTP single-real */
5776 DIP("fstps %s\n", dis_buf);
5777 storeLE(mkexpr(addr),
5778 binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5779 fp_pop();
5780 break;
5782 case 4: { /* FLDENV m28 */
5783 /* Uses dirty helper:
5784                  VexEmNote amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
5785 IRTemp ew = newTemp(Ity_I32);
5786 IRTemp w64 = newTemp(Ity_I64);
5787 IRDirty* d = unsafeIRDirty_0_N (
5788 0/*regparms*/,
5789 "amd64g_dirtyhelper_FLDENV",
5790 &amd64g_dirtyhelper_FLDENV,
5791 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
5793 d->tmp = w64;
5794 /* declare we're reading memory */
5795 d->mFx = Ifx_Read;
5796 d->mAddr = mkexpr(addr);
5797 d->mSize = 28;
5799 /* declare we're writing guest state */
5800 d->nFxState = 4;
5801 vex_bzero(&d->fxState, sizeof(d->fxState));
5803 d->fxState[0].fx = Ifx_Write;
5804 d->fxState[0].offset = OFFB_FTOP;
5805 d->fxState[0].size = sizeof(UInt);
5807 d->fxState[1].fx = Ifx_Write;
5808 d->fxState[1].offset = OFFB_FPTAGS;
5809 d->fxState[1].size = 8 * sizeof(UChar);
5811 d->fxState[2].fx = Ifx_Write;
5812 d->fxState[2].offset = OFFB_FPROUND;
5813 d->fxState[2].size = sizeof(ULong);
5815 d->fxState[3].fx = Ifx_Write;
5816 d->fxState[3].offset = OFFB_FC3210;
5817 d->fxState[3].size = sizeof(ULong);
5819 stmt( IRStmt_Dirty(d) );
5821 /* ew contains any emulation warning we may need to
5822 issue. If needed, side-exit to the next insn,
5823 reporting the warning, so that Valgrind's dispatcher
5824 sees the warning. */
5825 assign(ew, unop(Iop_64to32,mkexpr(w64)) );
5826 put_emwarn( mkexpr(ew) );
5827 stmt(
5828 IRStmt_Exit(
5829 binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5830 Ijk_EmWarn,
5831 IRConst_U64( guest_RIP_bbstart+delta ),
5832 OFFB_RIP
5836 DIP("fldenv %s\n", dis_buf);
5837 break;
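                  /* For orientation: the 28-byte mSize above is the
                     32-bit-operand protected-mode FP environment layout,
                     i.e. seven 32-bit slots (FCW, FSW, FTW, FIP,
                     FCS+opcode, FDP, FDS), 7 * 4 == 28 bytes. */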
5840 case 5: {/* FLDCW */
5841 /* The only thing we observe in the control word is the
5842 rounding mode. Therefore, pass the 16-bit value
5843 (x87 native-format control word) to a clean helper,
5844 getting back a 64-bit value, the lower half of which
5845 is the FPROUND value to store, and the upper half of
5846 which is the emulation-warning token which may be
5847                  generated.  */
5849               /* ULong amd64g_check_fldcw ( ULong ); */
5850 IRTemp t64 = newTemp(Ity_I64);
5851 IRTemp ew = newTemp(Ity_I32);
5852 DIP("fldcw %s\n", dis_buf);
5853 assign( t64, mkIRExprCCall(
5854 Ity_I64, 0/*regparms*/,
5855 "amd64g_check_fldcw",
5856 &amd64g_check_fldcw,
5857 mkIRExprVec_1(
5858 unop( Iop_16Uto64,
5859 loadLE(Ity_I16, mkexpr(addr)))
5864 put_fpround( unop(Iop_64to32, mkexpr(t64)) );
5865 assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
5866 put_emwarn( mkexpr(ew) );
5867 /* Finally, if an emulation warning was reported,
5868 side-exit to the next insn, reporting the warning,
5869 so that Valgrind's dispatcher sees the warning. */
5870 stmt(
5871 IRStmt_Exit(
5872 binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5873 Ijk_EmWarn,
5874 IRConst_U64( guest_RIP_bbstart+delta ),
5875 OFFB_RIP
5878 break;
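                  /* A sketch of what the checking helper is assumed to do
                     (this is an assumption, not taken from its definition):
                     the x87 rounding control is bits 11:10 of the control
                     word, and its encoding (00=nearest, 01=-inf, 10=+inf,
                     11=zero) coincides with the Irrm_* values, so roughly
                        fpround = (cw >> 10) & 3;
                     with any other observable control-word setting reported
                     via the emulation-warning half of the result. */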
5881 case 6: { /* FNSTENV m28 */
5882 /* Uses dirty helper:
5883                  void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
5884 IRDirty* d = unsafeIRDirty_0_N (
5885 0/*regparms*/,
5886 "amd64g_dirtyhelper_FSTENV",
5887 &amd64g_dirtyhelper_FSTENV,
5888 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
5890 /* declare we're writing memory */
5891 d->mFx = Ifx_Write;
5892 d->mAddr = mkexpr(addr);
5893 d->mSize = 28;
5895 /* declare we're reading guest state */
5896 d->nFxState = 4;
5897 vex_bzero(&d->fxState, sizeof(d->fxState));
5899 d->fxState[0].fx = Ifx_Read;
5900 d->fxState[0].offset = OFFB_FTOP;
5901 d->fxState[0].size = sizeof(UInt);
5903 d->fxState[1].fx = Ifx_Read;
5904 d->fxState[1].offset = OFFB_FPTAGS;
5905 d->fxState[1].size = 8 * sizeof(UChar);
5907 d->fxState[2].fx = Ifx_Read;
5908 d->fxState[2].offset = OFFB_FPROUND;
5909 d->fxState[2].size = sizeof(ULong);
5911 d->fxState[3].fx = Ifx_Read;
5912 d->fxState[3].offset = OFFB_FC3210;
5913 d->fxState[3].size = sizeof(ULong);
5915 stmt( IRStmt_Dirty(d) );
5917 DIP("fnstenv %s\n", dis_buf);
5918 break;
5921 case 7: /* FNSTCW */
5922 /* Fake up a native x87 FPU control word. The only
5923 thing it depends on is FPROUND[1:0], so call a clean
5924 helper to cook it up. */
5925 /* ULong amd64g_create_fpucw ( ULong fpround ) */
5926 DIP("fnstcw %s\n", dis_buf);
5927 storeLE(
5928 mkexpr(addr),
5929 unop( Iop_64to16,
5930 mkIRExprCCall(
5931 Ity_I64, 0/*regp*/,
5932 "amd64g_create_fpucw", &amd64g_create_fpucw,
5933 mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
5937 break;
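                  /* A guess at what amd64g_create_fpucw computes (offered
                     only as orientation, not taken from its definition):
                     start from the power-on default control word 0x037F and
                     overwrite the RC field, roughly
                        cw = 0x037F | ((fpround & 3) << 10);
                     so e.g. fpround == Irrm_ZERO (3) gives 0x0F7F. */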
5939 default:
5940 vex_printf("unhandled opc_aux = 0x%2x\n",
5941 (UInt)gregLO3ofRM(modrm));
5942 vex_printf("first_opcode == 0xD9\n");
5943 goto decode_fail;
5946 } else {
5947 delta++;
5948 switch (modrm) {
5950 case 0xC0 ... 0xC7: /* FLD %st(?) */
5951 r_src = (UInt)modrm - 0xC0;
5952 DIP("fld %%st(%u)\n", r_src);
5953 t1 = newTemp(Ity_F64);
5954 assign(t1, get_ST(r_src));
5955 fp_push();
5956 put_ST(0, mkexpr(t1));
5957 break;
5959 case 0xC8 ... 0xCF: /* FXCH %st(?) */
5960 r_src = (UInt)modrm - 0xC8;
5961 DIP("fxch %%st(%u)\n", r_src);
5962 t1 = newTemp(Ity_F64);
5963 t2 = newTemp(Ity_F64);
5964 assign(t1, get_ST(0));
5965 assign(t2, get_ST(r_src));
5966 put_ST_UNCHECKED(0, mkexpr(t2));
5967 put_ST_UNCHECKED(r_src, mkexpr(t1));
5968 break;
5970 case 0xE0: /* FCHS */
5971 DIP("fchs\n");
5972 put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
5973 break;
5975 case 0xE1: /* FABS */
5976 DIP("fabs\n");
5977 put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
5978 break;
5980 case 0xE5: { /* FXAM */
5981 /* This is an interesting one. It examines %st(0),
5982 regardless of whether the tag says it's empty or not.
5983 Here, just pass both the tag (in our format) and the
5984 value (as a double, actually a ULong) to a helper
5985 function. */
5986 IRExpr** args
5987 = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
5988 unop(Iop_ReinterpF64asI64,
5989 get_ST_UNCHECKED(0)) );
5990 put_C3210(mkIRExprCCall(
5991 Ity_I64,
5992 0/*regparm*/,
5993 "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
5994 args
5996 DIP("fxam\n");
5997 break;
6000 case 0xE8: /* FLD1 */
6001 DIP("fld1\n");
6002 fp_push();
6003 /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
6004 put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
6005 break;
6007 case 0xE9: /* FLDL2T */
6008 DIP("fldl2t\n");
6009 fp_push();
6010 /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
6011 put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
6012 break;
6014 case 0xEA: /* FLDL2E */
6015 DIP("fldl2e\n");
6016 fp_push();
6017 /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
6018 put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
6019 break;
6021 case 0xEB: /* FLDPI */
6022 DIP("fldpi\n");
6023 fp_push();
6024 /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
6025 put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
6026 break;
6028 case 0xEC: /* FLDLG2 */
6029 DIP("fldlg2\n");
6030 fp_push();
6031 /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
6032 put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
6033 break;
6035 case 0xED: /* FLDLN2 */
6036 DIP("fldln2\n");
6037 fp_push();
6038 /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
6039 put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
6040 break;
6042 case 0xEE: /* FLDZ */
6043 DIP("fldz\n");
6044 fp_push();
6045 /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
6046 put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
6047 break;
6049 case 0xF0: /* F2XM1 */
6050 DIP("f2xm1\n");
6051 put_ST_UNCHECKED(0,
6052 binop(Iop_2xm1F64,
6053 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6054 get_ST(0)));
6055 break;
6057 case 0xF1: /* FYL2X */
6058 DIP("fyl2x\n");
6059 put_ST_UNCHECKED(1,
6060 triop(Iop_Yl2xF64,
6061 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6062 get_ST(1),
6063 get_ST(0)));
6064 fp_pop();
6065 break;
6067 case 0xF2: { /* FPTAN */
6068 DIP("fptan\n");
6069 IRTemp argD = newTemp(Ity_F64);
6070 assign(argD, get_ST(0));
6071 IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
6072 IRTemp resD = newTemp(Ity_F64);
6073 assign(resD,
6074 IRExpr_ITE(
6075 mkexpr(argOK),
6076 binop(Iop_TanF64,
6077 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6078 mkexpr(argD)),
6079 mkexpr(argD))
6081 put_ST_UNCHECKED(0, mkexpr(resD));
6082 /* Conditionally push 1.0 on the stack, if the arg is
6083 in range */
6084 maybe_fp_push(argOK);
6085 maybe_put_ST(argOK, 0,
6086 IRExpr_Const(IRConst_F64(1.0)));
6087 set_C2( binop(Iop_Xor64,
6088 unop(Iop_1Uto64, mkexpr(argOK)),
6089 mkU64(1)) );
6090 break;
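                  /* This mirrors hardware FPTAN behaviour: for an in-range
                     argument (|x| < 2^63) the tangent is computed, 1.0 is
                     pushed and C2 is cleared; otherwise C2 is set, ST(0) is
                     left unchanged and nothing is pushed.  Non-finite
                     arguments are folded into the out-of-range case here. */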
6093 case 0xF3: /* FPATAN */
6094 DIP("fpatan\n");
6095 put_ST_UNCHECKED(1,
6096 triop(Iop_AtanF64,
6097 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6098 get_ST(1),
6099 get_ST(0)));
6100 fp_pop();
6101 break;
6103 case 0xF4: { /* FXTRACT */
6104 IRTemp argF = newTemp(Ity_F64);
6105 IRTemp sigF = newTemp(Ity_F64);
6106 IRTemp expF = newTemp(Ity_F64);
6107 IRTemp argI = newTemp(Ity_I64);
6108 IRTemp sigI = newTemp(Ity_I64);
6109 IRTemp expI = newTemp(Ity_I64);
6110 DIP("fxtract\n");
6111 assign( argF, get_ST(0) );
6112 assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
6113 assign( sigI,
6114 mkIRExprCCall(
6115 Ity_I64, 0/*regparms*/,
6116 "x86amd64g_calculate_FXTRACT",
6117 &x86amd64g_calculate_FXTRACT,
6118 mkIRExprVec_2( mkexpr(argI),
6119 mkIRExpr_HWord(0)/*sig*/ ))
6121 assign( expI,
6122 mkIRExprCCall(
6123 Ity_I64, 0/*regparms*/,
6124 "x86amd64g_calculate_FXTRACT",
6125 &x86amd64g_calculate_FXTRACT,
6126 mkIRExprVec_2( mkexpr(argI),
6127 mkIRExpr_HWord(1)/*exp*/ ))
6129 assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
6130 assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
6131 /* exponent */
6132 put_ST_UNCHECKED(0, mkexpr(expF) );
6133 fp_push();
6134 /* significand */
6135 put_ST(0, mkexpr(sigF) );
6136 break;
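                  /* Worked example: FXTRACT splits ST(0) into an unbiased
                     exponent and a significand in [1.0, 2.0).  For
                     ST(0) == 12.0 == 1.5 * 2^3, the code above leaves the
                     exponent 3.0 in what becomes ST(1) and pushes the
                     significand 1.5 as the new ST(0). */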
6139 case 0xF5: { /* FPREM1 -- IEEE compliant */
6140 IRTemp a1 = newTemp(Ity_F64);
6141 IRTemp a2 = newTemp(Ity_F64);
6142 DIP("fprem1\n");
6143 /* Do FPREM1 twice, once to get the remainder, and once
6144 to get the C3210 flag values. */
6145 assign( a1, get_ST(0) );
6146 assign( a2, get_ST(1) );
6147 put_ST_UNCHECKED(0,
6148 triop(Iop_PRem1F64,
6149 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6150 mkexpr(a1),
6151 mkexpr(a2)));
6152 put_C3210(
6153 unop(Iop_32Uto64,
6154 triop(Iop_PRem1C3210F64,
6155 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6156 mkexpr(a1),
6157 mkexpr(a2)) ));
6158 break;
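                  /* FPREM1 differs from FPREM (case 0xF8 below) only in how
                     the quotient is rounded: FPREM truncates, FPREM1 rounds
                     to nearest.  E.g. for ST(0)=5.0, ST(1)=3.0:
                        FPREM : q = trunc(5/3) = 1,  rem = 5 - 1*3 =  2.0
                        FPREM1: q = rne(5/3)   = 2,  rem = 5 - 2*3 = -1.0
                     The second triop above redoes the operation purely to
                     obtain the C3210 quotient bits. */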
6161 case 0xF7: /* FINCSTP */
6162 DIP("fincstp\n");
6163 put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
6164 break;
6166 case 0xF8: { /* FPREM -- not IEEE compliant */
6167 IRTemp a1 = newTemp(Ity_F64);
6168 IRTemp a2 = newTemp(Ity_F64);
6169 DIP("fprem\n");
6170 /* Do FPREM twice, once to get the remainder, and once
6171 to get the C3210 flag values. */
6172 assign( a1, get_ST(0) );
6173 assign( a2, get_ST(1) );
6174 put_ST_UNCHECKED(0,
6175 triop(Iop_PRemF64,
6176 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6177 mkexpr(a1),
6178 mkexpr(a2)));
6179 put_C3210(
6180 unop(Iop_32Uto64,
6181 triop(Iop_PRemC3210F64,
6182 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6183 mkexpr(a1),
6184 mkexpr(a2)) ));
6185 break;
6188 case 0xF9: /* FYL2XP1 */
6189 DIP("fyl2xp1\n");
6190 put_ST_UNCHECKED(1,
6191 triop(Iop_Yl2xp1F64,
6192 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6193 get_ST(1),
6194 get_ST(0)));
6195 fp_pop();
6196 break;
6198 case 0xFA: /* FSQRT */
6199 DIP("fsqrt\n");
6200 put_ST_UNCHECKED(0,
6201 binop(Iop_SqrtF64,
6202 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6203 get_ST(0)));
6204 break;
6206 case 0xFB: { /* FSINCOS */
6207 DIP("fsincos\n");
6208 IRTemp argD = newTemp(Ity_F64);
6209 assign(argD, get_ST(0));
6210 IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
6211 IRTemp resD = newTemp(Ity_F64);
6212 assign(resD,
6213 IRExpr_ITE(
6214 mkexpr(argOK),
6215 binop(Iop_SinF64,
6216 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6217 mkexpr(argD)),
6218 mkexpr(argD))
6220 put_ST_UNCHECKED(0, mkexpr(resD));
6221 /* Conditionally push the cos value on the stack, if
6222 the arg is in range */
6223 maybe_fp_push(argOK);
6224 maybe_put_ST(argOK, 0,
6225 binop(Iop_CosF64,
6226 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6227 mkexpr(argD)));
6228 set_C2( binop(Iop_Xor64,
6229 unop(Iop_1Uto64, mkexpr(argOK)),
6230 mkU64(1)) );
6231 break;
6234 case 0xFC: /* FRNDINT */
6235 DIP("frndint\n");
6236 put_ST_UNCHECKED(0,
6237 binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
6238 break;
6240 case 0xFD: /* FSCALE */
6241 DIP("fscale\n");
6242 put_ST_UNCHECKED(0,
6243 triop(Iop_ScaleF64,
6244 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6245 get_ST(0),
6246 get_ST(1)));
6247 break;
6249 case 0xFE: /* FSIN */
6250 case 0xFF: { /* FCOS */
6251 Bool isSIN = modrm == 0xFE;
6252 DIP("%s\n", isSIN ? "fsin" : "fcos");
6253 IRTemp argD = newTemp(Ity_F64);
6254 assign(argD, get_ST(0));
6255 IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
6256 IRTemp resD = newTemp(Ity_F64);
6257 assign(resD,
6258 IRExpr_ITE(
6259 mkexpr(argOK),
6260 binop(isSIN ? Iop_SinF64 : Iop_CosF64,
6261 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6262 mkexpr(argD)),
6263 mkexpr(argD))
6265 put_ST_UNCHECKED(0, mkexpr(resD));
6266 set_C2( binop(Iop_Xor64,
6267 unop(Iop_1Uto64, mkexpr(argOK)),
6268 mkU64(1)) );
6269 break;
6272 default:
6273 goto decode_fail;
6278 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
6279 else
6280 if (first_opcode == 0xDA) {
6282 if (modrm < 0xC0) {
6284 /* bits 5,4,3 are an opcode extension, and the modRM also
6285 specifies an address. */
6286 IROp fop;
6287 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6288 delta += len;
6289 switch (gregLO3ofRM(modrm)) {
6291 case 0: /* FIADD m32int */ /* ST(0) += m32int */
6292 DIP("fiaddl %s\n", dis_buf);
6293 fop = Iop_AddF64;
6294 goto do_fop_m32;
6296 case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
6297 DIP("fimull %s\n", dis_buf);
6298 fop = Iop_MulF64;
6299 goto do_fop_m32;
6301 case 4: /* FISUB m32int */ /* ST(0) -= m32int */
6302 DIP("fisubl %s\n", dis_buf);
6303 fop = Iop_SubF64;
6304 goto do_fop_m32;
6306 case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
6307 DIP("fisubrl %s\n", dis_buf);
6308 fop = Iop_SubF64;
6309 goto do_foprev_m32;
6311 case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
6312                  DIP("fidivl %s\n", dis_buf);
6313 fop = Iop_DivF64;
6314 goto do_fop_m32;
6316 case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
6317 DIP("fidivrl %s\n", dis_buf);
6318 fop = Iop_DivF64;
6319 goto do_foprev_m32;
6321 do_fop_m32:
6322 put_ST_UNCHECKED(0,
6323 triop(fop,
6324 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6325 get_ST(0),
6326 unop(Iop_I32StoF64,
6327 loadLE(Ity_I32, mkexpr(addr)))));
6328 break;
6330 do_foprev_m32:
6331 put_ST_UNCHECKED(0,
6332 triop(fop,
6333 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6334 unop(Iop_I32StoF64,
6335 loadLE(Ity_I32, mkexpr(addr))),
6336 get_ST(0)));
6337 break;
6339 default:
6340 vex_printf("unhandled opc_aux = 0x%2x\n",
6341 (UInt)gregLO3ofRM(modrm));
6342 vex_printf("first_opcode == 0xDA\n");
6343 goto decode_fail;
6346 } else {
6348 delta++;
6349 switch (modrm) {
6351 case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
6352 r_src = (UInt)modrm - 0xC0;
6353 DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
6354 put_ST_UNCHECKED(0,
6355 IRExpr_ITE(
6356 mk_amd64g_calculate_condition(AMD64CondB),
6357 get_ST(r_src), get_ST(0)) );
6358 break;
6360 case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
6361 r_src = (UInt)modrm - 0xC8;
6362 DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
6363 put_ST_UNCHECKED(0,
6364 IRExpr_ITE(
6365 mk_amd64g_calculate_condition(AMD64CondZ),
6366 get_ST(r_src), get_ST(0)) );
6367 break;
6369 case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
6370 r_src = (UInt)modrm - 0xD0;
6371 DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
6372 put_ST_UNCHECKED(0,
6373 IRExpr_ITE(
6374 mk_amd64g_calculate_condition(AMD64CondBE),
6375 get_ST(r_src), get_ST(0)) );
6376 break;
6378 case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
6379 r_src = (UInt)modrm - 0xD8;
6380 DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
6381 put_ST_UNCHECKED(0,
6382 IRExpr_ITE(
6383 mk_amd64g_calculate_condition(AMD64CondP),
6384 get_ST(r_src), get_ST(0)) );
6385 break;
6387 case 0xE9: /* FUCOMPP %st(0),%st(1) */
6388 DIP("fucompp %%st(0),%%st(1)\n");
6389 /* This forces C1 to zero, which isn't right. */
6390 put_C3210(
6391 unop(Iop_32Uto64,
6392 binop( Iop_And32,
6393 binop(Iop_Shl32,
6394 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6395 mkU8(8)),
6396 mkU32(0x4500)
6397 )));
6398 fp_pop();
6399 fp_pop();
6400 break;
6402 default:
6403 goto decode_fail;
6409 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
6410 else
6411 if (first_opcode == 0xDB) {
6412 if (modrm < 0xC0) {
6414 /* bits 5,4,3 are an opcode extension, and the modRM also
6415 specifies an address. */
6416 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6417 delta += len;
6419 switch (gregLO3ofRM(modrm)) {
6421 case 0: /* FILD m32int */
6422 DIP("fildl %s\n", dis_buf);
6423 fp_push();
6424 put_ST(0, unop(Iop_I32StoF64,
6425 loadLE(Ity_I32, mkexpr(addr))));
6426 break;
6428 case 1: /* FISTTPL m32 (SSE3) */
6429 DIP("fisttpl %s\n", dis_buf);
6430 storeLE( mkexpr(addr),
6431 binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
6432 fp_pop();
6433 break;
6435 case 2: /* FIST m32 */
6436 DIP("fistl %s\n", dis_buf);
6437 storeLE( mkexpr(addr),
6438 binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6439 break;
6441 case 3: /* FISTP m32 */
6442 DIP("fistpl %s\n", dis_buf);
6443 storeLE( mkexpr(addr),
6444 binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6445 fp_pop();
6446 break;
6448 case 5: { /* FLD extended-real */
6449 /* Uses dirty helper:
6450 ULong amd64g_loadF80le ( ULong )
6451 addr holds the address. First, do a dirty call to
6452 get hold of the data. */
6453 IRTemp val = newTemp(Ity_I64);
6454 IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
6456 IRDirty* d = unsafeIRDirty_1_N (
6457 val,
6458 0/*regparms*/,
6459 "amd64g_dirtyhelper_loadF80le",
6460 &amd64g_dirtyhelper_loadF80le,
6461 args
6463 /* declare that we're reading memory */
6464 d->mFx = Ifx_Read;
6465 d->mAddr = mkexpr(addr);
6466 d->mSize = 10;
6468 /* execute the dirty call, dumping the result in val. */
6469 stmt( IRStmt_Dirty(d) );
6470 fp_push();
6471 put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
6473 DIP("fldt %s\n", dis_buf);
6474 break;
6477 case 7: { /* FSTP extended-real */
6478 /* Uses dirty helper:
6479 void amd64g_storeF80le ( ULong addr, ULong data )
6481 IRExpr** args
6482 = mkIRExprVec_2( mkexpr(addr),
6483 unop(Iop_ReinterpF64asI64, get_ST(0)) );
6485 IRDirty* d = unsafeIRDirty_0_N (
6486 0/*regparms*/,
6487 "amd64g_dirtyhelper_storeF80le",
6488 &amd64g_dirtyhelper_storeF80le,
6489 args
6491 /* declare we're writing memory */
6492 d->mFx = Ifx_Write;
6493 d->mAddr = mkexpr(addr);
6494 d->mSize = 10;
6496 /* execute the dirty call. */
6497 stmt( IRStmt_Dirty(d) );
6498 fp_pop();
6500               DIP("fstpt %s\n", dis_buf);
6501 break;
6504 default:
6505 vex_printf("unhandled opc_aux = 0x%2x\n",
6506 (UInt)gregLO3ofRM(modrm));
6507 vex_printf("first_opcode == 0xDB\n");
6508 goto decode_fail;
6511 } else {
6513 delta++;
6514 switch (modrm) {
6516 case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
6517 r_src = (UInt)modrm - 0xC0;
6518 DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
6519 put_ST_UNCHECKED(0,
6520 IRExpr_ITE(
6521 mk_amd64g_calculate_condition(AMD64CondNB),
6522 get_ST(r_src), get_ST(0)) );
6523 break;
6525 case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
6526 r_src = (UInt)modrm - 0xC8;
6527 DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
6528 put_ST_UNCHECKED(
6530 IRExpr_ITE(
6531 mk_amd64g_calculate_condition(AMD64CondNZ),
6532 get_ST(r_src),
6533 get_ST(0)
6536 break;
6538 case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
6539 r_src = (UInt)modrm - 0xD0;
6540 DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
6541 put_ST_UNCHECKED(
6543 IRExpr_ITE(
6544 mk_amd64g_calculate_condition(AMD64CondNBE),
6545 get_ST(r_src),
6546 get_ST(0)
6549 break;
6551 case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
6552 r_src = (UInt)modrm - 0xD8;
6553 DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
6554 put_ST_UNCHECKED(
6556 IRExpr_ITE(
6557 mk_amd64g_calculate_condition(AMD64CondNP),
6558 get_ST(r_src),
6559 get_ST(0)
6562 break;
6564 case 0xE2:
6565 DIP("fnclex\n");
6566 break;
6568 case 0xE3: {
6569 gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
6570 DIP("fninit\n");
6571 break;
6574 case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
6575 fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
6576 break;
6578 case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
6579 fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
6580 break;
6582 default:
6583 goto decode_fail;
6588 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
6589 else
6590 if (first_opcode == 0xDC) {
6591 if (modrm < 0xC0) {
6593 /* bits 5,4,3 are an opcode extension, and the modRM also
6594 specifies an address. */
6595 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6596 delta += len;
6598 switch (gregLO3ofRM(modrm)) {
6600 case 0: /* FADD double-real */
6601 fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
6602 break;
6604 case 1: /* FMUL double-real */
6605 fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
6606 break;
6608 case 2: /* FCOM double-real */
6609 DIP("fcoml %s\n", dis_buf);
6610 /* This forces C1 to zero, which isn't right. */
6611 put_C3210(
6612 unop(Iop_32Uto64,
6613 binop( Iop_And32,
6614 binop(Iop_Shl32,
6615 binop(Iop_CmpF64,
6616 get_ST(0),
6617 loadLE(Ity_F64,mkexpr(addr))),
6618 mkU8(8)),
6619 mkU32(0x4500)
6620 )));
6621 break;
6623 case 3: /* FCOMP double-real */
6624 DIP("fcompl %s\n", dis_buf);
6625 /* This forces C1 to zero, which isn't right. */
6626 put_C3210(
6627 unop(Iop_32Uto64,
6628 binop( Iop_And32,
6629 binop(Iop_Shl32,
6630 binop(Iop_CmpF64,
6631 get_ST(0),
6632 loadLE(Ity_F64,mkexpr(addr))),
6633 mkU8(8)),
6634 mkU32(0x4500)
6635 )));
6636 fp_pop();
6637 break;
6639 case 4: /* FSUB double-real */
6640 fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
6641 break;
6643 case 5: /* FSUBR double-real */
6644 fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
6645 break;
6647 case 6: /* FDIV double-real */
6648 fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
6649 break;
6651 case 7: /* FDIVR double-real */
6652 fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
6653 break;
6655 default:
6656 vex_printf("unhandled opc_aux = 0x%2x\n",
6657 (UInt)gregLO3ofRM(modrm));
6658 vex_printf("first_opcode == 0xDC\n");
6659 goto decode_fail;
6662 } else {
6664 delta++;
6665 switch (modrm) {
6667 case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
6668 fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
6669 break;
6671 case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
6672 fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
6673 break;
6675 case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
6676 fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
6677 break;
6679 case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
6680 fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
6681 break;
6683 case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
6684 fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
6685 break;
6687 case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
6688 fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
6689 break;
6691 default:
6692 goto decode_fail;
6698 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
6699 else
6700 if (first_opcode == 0xDD) {
6702 if (modrm < 0xC0) {
6704 /* bits 5,4,3 are an opcode extension, and the modRM also
6705 specifies an address. */
6706 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6707 delta += len;
6709 switch (gregLO3ofRM(modrm)) {
6711 case 0: /* FLD double-real */
6712 DIP("fldl %s\n", dis_buf);
6713 fp_push();
6714 put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
6715 break;
6717 case 1: /* FISTTPQ m64 (SSE3) */
6718                  DIP("fisttpll %s\n", dis_buf);
6719 storeLE( mkexpr(addr),
6720 binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
6721 fp_pop();
6722 break;
6724 case 2: /* FST double-real */
6725 DIP("fstl %s\n", dis_buf);
6726 storeLE(mkexpr(addr), get_ST(0));
6727 break;
6729 case 3: /* FSTP double-real */
6730 DIP("fstpl %s\n", dis_buf);
6731 storeLE(mkexpr(addr), get_ST(0));
6732 fp_pop();
6733 break;
6735 case 4: { /* FRSTOR m94/m108 */
6736 IRTemp ew = newTemp(Ity_I32);
6737 IRTemp w64 = newTemp(Ity_I64);
6738 IRDirty* d;
6739 if ( have66(pfx) ) {
6740 /* Uses dirty helper:
6741 VexEmNote amd64g_dirtyhelper_FRSTORS
6742 ( VexGuestAMD64State*, HWord ) */
6743 d = unsafeIRDirty_0_N (
6744 0/*regparms*/,
6745 "amd64g_dirtyhelper_FRSTORS",
6746 &amd64g_dirtyhelper_FRSTORS,
6747 mkIRExprVec_1( mkexpr(addr) )
6749 d->mSize = 94;
6750 } else {
6751 /* Uses dirty helper:
6752 VexEmNote amd64g_dirtyhelper_FRSTOR
6753 ( VexGuestAMD64State*, HWord ) */
6754 d = unsafeIRDirty_0_N (
6755 0/*regparms*/,
6756 "amd64g_dirtyhelper_FRSTOR",
6757 &amd64g_dirtyhelper_FRSTOR,
6758 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
6760 d->mSize = 108;
6763 d->tmp = w64;
6764 /* declare we're reading memory */
6765 d->mFx = Ifx_Read;
6766 d->mAddr = mkexpr(addr);
6767 /* d->mSize set above */
6769 /* declare we're writing guest state */
6770 d->nFxState = 5;
6771 vex_bzero(&d->fxState, sizeof(d->fxState));
6773 d->fxState[0].fx = Ifx_Write;
6774 d->fxState[0].offset = OFFB_FTOP;
6775 d->fxState[0].size = sizeof(UInt);
6777 d->fxState[1].fx = Ifx_Write;
6778 d->fxState[1].offset = OFFB_FPREGS;
6779 d->fxState[1].size = 8 * sizeof(ULong);
6781 d->fxState[2].fx = Ifx_Write;
6782 d->fxState[2].offset = OFFB_FPTAGS;
6783 d->fxState[2].size = 8 * sizeof(UChar);
6785 d->fxState[3].fx = Ifx_Write;
6786 d->fxState[3].offset = OFFB_FPROUND;
6787 d->fxState[3].size = sizeof(ULong);
6789 d->fxState[4].fx = Ifx_Write;
6790 d->fxState[4].offset = OFFB_FC3210;
6791 d->fxState[4].size = sizeof(ULong);
6793 stmt( IRStmt_Dirty(d) );
6795 /* ew contains any emulation warning we may need to
6796 issue. If needed, side-exit to the next insn,
6797 reporting the warning, so that Valgrind's dispatcher
6798 sees the warning. */
6799 assign(ew, unop(Iop_64to32,mkexpr(w64)) );
6800 put_emwarn( mkexpr(ew) );
6801 stmt(
6802 IRStmt_Exit(
6803 binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
6804 Ijk_EmWarn,
6805 IRConst_U64( guest_RIP_bbstart+delta ),
6806 OFFB_RIP
6810 if ( have66(pfx) ) {
6811 DIP("frstors %s\n", dis_buf);
6812 } else {
6813 DIP("frstor %s\n", dis_buf);
6815 break;
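                  /* The two image sizes reflect the FSAVE memory formats:
                     with a 66 prefix the 16-bit-operand layout is used
                     (14-byte environment + 80 bytes of register data
                     = 94 bytes), otherwise the 32-bit-operand layout
                     (28 + 80 = 108 bytes).  The FNSAVE case below makes
                     the same distinction. */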
6818 case 6: { /* FNSAVE m94/m108 */
6819 IRDirty *d;
6820 if ( have66(pfx) ) {
6821 /* Uses dirty helper:
6822 void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
6823 HWord ) */
6824 d = unsafeIRDirty_0_N (
6825 0/*regparms*/,
6826 "amd64g_dirtyhelper_FNSAVES",
6827 &amd64g_dirtyhelper_FNSAVES,
6828 mkIRExprVec_1( mkexpr(addr) )
6830 d->mSize = 94;
6831 } else {
6832 /* Uses dirty helper:
6833 void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
6834 HWord ) */
6835 d = unsafeIRDirty_0_N (
6836 0/*regparms*/,
6837 "amd64g_dirtyhelper_FNSAVE",
6838 &amd64g_dirtyhelper_FNSAVE,
6839 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
6841 d->mSize = 108;
6844 /* declare we're writing memory */
6845 d->mFx = Ifx_Write;
6846 d->mAddr = mkexpr(addr);
6847 /* d->mSize set above */
6849 /* declare we're reading guest state */
6850 d->nFxState = 5;
6851 vex_bzero(&d->fxState, sizeof(d->fxState));
6853 d->fxState[0].fx = Ifx_Read;
6854 d->fxState[0].offset = OFFB_FTOP;
6855 d->fxState[0].size = sizeof(UInt);
6857 d->fxState[1].fx = Ifx_Read;
6858 d->fxState[1].offset = OFFB_FPREGS;
6859 d->fxState[1].size = 8 * sizeof(ULong);
6861 d->fxState[2].fx = Ifx_Read;
6862 d->fxState[2].offset = OFFB_FPTAGS;
6863 d->fxState[2].size = 8 * sizeof(UChar);
6865 d->fxState[3].fx = Ifx_Read;
6866 d->fxState[3].offset = OFFB_FPROUND;
6867 d->fxState[3].size = sizeof(ULong);
6869 d->fxState[4].fx = Ifx_Read;
6870 d->fxState[4].offset = OFFB_FC3210;
6871 d->fxState[4].size = sizeof(ULong);
6873 stmt( IRStmt_Dirty(d) );
6875 if ( have66(pfx) ) {
6876 DIP("fnsaves %s\n", dis_buf);
6877 } else {
6878 DIP("fnsave %s\n", dis_buf);
6880 break;
6883 case 7: { /* FNSTSW m16 */
6884 IRExpr* sw = get_FPU_sw();
6885 vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
6886 storeLE( mkexpr(addr), sw );
6887 DIP("fnstsw %s\n", dis_buf);
6888 break;
6891 default:
6892 vex_printf("unhandled opc_aux = 0x%2x\n",
6893 (UInt)gregLO3ofRM(modrm));
6894 vex_printf("first_opcode == 0xDD\n");
6895 goto decode_fail;
6897 } else {
6898 delta++;
6899 switch (modrm) {
6901 case 0xC0 ... 0xC7: /* FFREE %st(?) */
6902 r_dst = (UInt)modrm - 0xC0;
6903 DIP("ffree %%st(%u)\n", r_dst);
6904 put_ST_TAG ( r_dst, mkU8(0) );
6905 break;
6907 case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
6908 r_dst = (UInt)modrm - 0xD0;
6909 DIP("fst %%st(0),%%st(%u)\n", r_dst);
6910                  /* P4 manual says: "If the destination operand is a
6911                     non-empty register, the invalid-operation exception
6912                     is not generated."  Hence put_ST_UNCHECKED. */
6913 put_ST_UNCHECKED(r_dst, get_ST(0));
6914 break;
6916 case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
6917 r_dst = (UInt)modrm - 0xD8;
6918 DIP("fstp %%st(0),%%st(%u)\n", r_dst);
6919                  /* P4 manual says: "If the destination operand is a
6920                     non-empty register, the invalid-operation exception
6921                     is not generated."  Hence put_ST_UNCHECKED. */
6922 put_ST_UNCHECKED(r_dst, get_ST(0));
6923 fp_pop();
6924 break;
6926 case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
6927 r_dst = (UInt)modrm - 0xE0;
6928 DIP("fucom %%st(0),%%st(%u)\n", r_dst);
6929 /* This forces C1 to zero, which isn't right. */
6930 put_C3210(
6931 unop(Iop_32Uto64,
6932 binop( Iop_And32,
6933 binop(Iop_Shl32,
6934 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6935 mkU8(8)),
6936 mkU32(0x4500)
6937 )));
6938 break;
6940 case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
6941 r_dst = (UInt)modrm - 0xE8;
6942 DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
6943 /* This forces C1 to zero, which isn't right. */
6944 put_C3210(
6945 unop(Iop_32Uto64,
6946 binop( Iop_And32,
6947 binop(Iop_Shl32,
6948 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6949 mkU8(8)),
6950 mkU32(0x4500)
6951 )));
6952 fp_pop();
6953 break;
6955 default:
6956 goto decode_fail;
6961 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
6962 else
6963 if (first_opcode == 0xDE) {
6965 if (modrm < 0xC0) {
6967 /* bits 5,4,3 are an opcode extension, and the modRM also
6968 specifies an address. */
6969 IROp fop;
6970 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6971 delta += len;
6973 switch (gregLO3ofRM(modrm)) {
6975 case 0: /* FIADD m16int */ /* ST(0) += m16int */
6976 DIP("fiaddw %s\n", dis_buf);
6977 fop = Iop_AddF64;
6978 goto do_fop_m16;
6980 case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
6981 DIP("fimulw %s\n", dis_buf);
6982 fop = Iop_MulF64;
6983 goto do_fop_m16;
6985 case 4: /* FISUB m16int */ /* ST(0) -= m16int */
6986 DIP("fisubw %s\n", dis_buf);
6987 fop = Iop_SubF64;
6988 goto do_fop_m16;
6990 case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
6991 DIP("fisubrw %s\n", dis_buf);
6992 fop = Iop_SubF64;
6993 goto do_foprev_m16;
6995 case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
6996                  DIP("fidivw %s\n", dis_buf);
6997 fop = Iop_DivF64;
6998 goto do_fop_m16;
7000 case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
7001 DIP("fidivrw %s\n", dis_buf);
7002 fop = Iop_DivF64;
7003 goto do_foprev_m16;
7005 do_fop_m16:
7006 put_ST_UNCHECKED(0,
7007 triop(fop,
7008 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7009 get_ST(0),
7010 unop(Iop_I32StoF64,
7011 unop(Iop_16Sto32,
7012 loadLE(Ity_I16, mkexpr(addr))))));
7013 break;
7015 do_foprev_m16:
7016 put_ST_UNCHECKED(0,
7017 triop(fop,
7018 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
7019 unop(Iop_I32StoF64,
7020 unop(Iop_16Sto32,
7021 loadLE(Ity_I16, mkexpr(addr)))),
7022 get_ST(0)));
7023 break;
7025 default:
7026 vex_printf("unhandled opc_aux = 0x%2x\n",
7027 (UInt)gregLO3ofRM(modrm));
7028 vex_printf("first_opcode == 0xDE\n");
7029 goto decode_fail;
7032 } else {
7034 delta++;
7035 switch (modrm) {
7037 case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
7038 fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
7039 break;
7041 case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
7042 fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
7043 break;
7045 case 0xD9: /* FCOMPP %st(0),%st(1) */
7046 DIP("fcompp %%st(0),%%st(1)\n");
7047 /* This forces C1 to zero, which isn't right. */
7048 put_C3210(
7049 unop(Iop_32Uto64,
7050 binop( Iop_And32,
7051 binop(Iop_Shl32,
7052 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
7053 mkU8(8)),
7054 mkU32(0x4500)
7055 )));
7056 fp_pop();
7057 fp_pop();
7058 break;
7060 case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
7061 fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, True );
7062 break;
7064 case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
7065 fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, True );
7066 break;
7068 case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
7069 fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
7070 break;
7072 case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
7073 fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
7074 break;
7076 default:
7077 goto decode_fail;
7083 /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
7084 else
7085 if (first_opcode == 0xDF) {
7087 if (modrm < 0xC0) {
7089 /* bits 5,4,3 are an opcode extension, and the modRM also
7090 specifies an address. */
7091 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7092 delta += len;
7094 switch (gregLO3ofRM(modrm)) {
7096 case 0: /* FILD m16int */
7097 DIP("fildw %s\n", dis_buf);
7098 fp_push();
7099 put_ST(0, unop(Iop_I32StoF64,
7100 unop(Iop_16Sto32,
7101 loadLE(Ity_I16, mkexpr(addr)))));
7102 break;
7104 case 1: /* FISTTPS m16 (SSE3) */
7105 DIP("fisttps %s\n", dis_buf);
7106 storeLE( mkexpr(addr),
7107 x87ishly_qnarrow_32_to_16(
7108 binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
7109 fp_pop();
7110 break;
7112 case 2: /* FIST m16 */
7113 DIP("fists %s\n", dis_buf);
7114 storeLE( mkexpr(addr),
7115 x87ishly_qnarrow_32_to_16(
7116 binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
7117 break;
7119 case 3: /* FISTP m16 */
7120 DIP("fistps %s\n", dis_buf);
7121 storeLE( mkexpr(addr),
7122 x87ishly_qnarrow_32_to_16(
7123 binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
7124 fp_pop();
7125 break;
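                  /* x87ishly_qnarrow_32_to_16 (defined earlier) implements
                     the x87 rule for 16-bit integer stores: results inside
                     [-32768, 32767] are narrowed normally, anything else is
                     stored as the "integer indefinite" value 0x8000 --
                     hence the range test against 65536 and the 0x8000
                     fallback in its definition. */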
7127 case 5: /* FILD m64 */
7128 DIP("fildll %s\n", dis_buf);
7129 fp_push();
7130 put_ST(0, binop(Iop_I64StoF64,
7131 get_roundingmode(),
7132 loadLE(Ity_I64, mkexpr(addr))));
7133 break;
7135 case 7: /* FISTP m64 */
7136 DIP("fistpll %s\n", dis_buf);
7137 storeLE( mkexpr(addr),
7138 binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
7139 fp_pop();
7140 break;
7142 default:
7143 vex_printf("unhandled opc_aux = 0x%2x\n",
7144 (UInt)gregLO3ofRM(modrm));
7145 vex_printf("first_opcode == 0xDF\n");
7146 goto decode_fail;
7149 } else {
7151 delta++;
7152 switch (modrm) {
7154 case 0xC0: /* FFREEP %st(0) */
7155 DIP("ffreep %%st(%d)\n", 0);
7156 put_ST_TAG ( 0, mkU8(0) );
7157 fp_pop();
7158 break;
7160 case 0xE0: /* FNSTSW %ax */
7161 DIP("fnstsw %%ax\n");
7162 /* Invent a plausible-looking FPU status word value and
7163 dump it in %AX:
7164 ((ftop & 7) << 11) | (c3210 & 0x4700)
7166 putIRegRAX(
7168 unop(Iop_32to16,
7169 binop(Iop_Or32,
7170 binop(Iop_Shl32,
7171 binop(Iop_And32, get_ftop(), mkU32(7)),
7172 mkU8(11)),
7173 binop(Iop_And32,
7174 unop(Iop_64to32, get_C3210()),
7175 mkU32(0x4700))
7176 )));
7177 break;
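                  /* Worked example of the synthesised status word: with
                     ftop == 5 and C3210 == 0x4000 (C3 set), the expression
                     above gives
                        ((5 & 7) << 11) | (0x4000 & 0x4700)
                           = 0x2800 | 0x4000 = 0x6800
                     i.e. TOP=5 in bits 13:11 and C3 in bit 14, with the
                     exception flags all zero -- hence "plausible-looking"
                     rather than exact. */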
7179 case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
7180 fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
7181 break;
7183 case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
7184 /* not really right since COMIP != UCOMIP */
7185 fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
7186 break;
7188 default:
7189 goto decode_fail;
7195 else
7196 goto decode_fail;
7198 *decode_ok = True;
7199 return delta;
7201 decode_fail:
7202 *decode_ok = False;
7203 return delta;
7207 /*------------------------------------------------------------*/
7208 /*--- ---*/
7209 /*--- MMX INSTRUCTIONS ---*/
7210 /*--- ---*/
7211 /*------------------------------------------------------------*/
7213 /* Effect of MMX insns on x87 FPU state (table 11-2 of
7214 IA32 arch manual, volume 3):
7216 Read from, or write to MMX register (viz, any insn except EMMS):
7217 * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
7218 * FP stack pointer set to zero
7220 EMMS:
7221 * All tags set to Invalid (empty) -- FPTAGS[i] := zero
7222 * FP stack pointer set to zero
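/* In guest-state terms the preambles below therefore amount to, roughly,
      ftop = 0;
      for (i = 0; i < 8; i++) fptags[i] = 1;    -- do_MMX_preamble
   and the same loop with fptags[i] = 0 for do_EMMS_preamble, expressed as
   a Put of FTOP plus eight PutI statements into the FPTAGS array. */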
7225 static void do_MMX_preamble ( void )
7227 Int i;
7228 IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
7229 IRExpr* zero = mkU32(0);
7230 IRExpr* tag1 = mkU8(1);
7231 put_ftop(zero);
7232 for (i = 0; i < 8; i++)
7233 stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
7236 static void do_EMMS_preamble ( void )
7238 Int i;
7239 IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
7240 IRExpr* zero = mkU32(0);
7241 IRExpr* tag0 = mkU8(0);
7242 put_ftop(zero);
7243 for (i = 0; i < 8; i++)
7244 stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
7248 static IRExpr* getMMXReg ( UInt archreg )
7250 vassert(archreg < 8);
7251 return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
7255 static void putMMXReg ( UInt archreg, IRExpr* e )
7257 vassert(archreg < 8);
7258 vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
7259 stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
7263 /* Helper for non-shift MMX insns. Note this is incomplete in the
7264 sense that it does not first call do_MMX_preamble() -- that is the
7265 responsibility of its caller. */
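/* A note on the invG/eLeft flags used below: invG complements the G
   (destination) operand before the binop, which is how PANDN (opcode 0xDF)
   gets its  dst = ~dst & src  semantics out of a plain Iop_And64; eLeft
   swaps the operand order so that, for the pack/unpack/interleave family,
   the E operand becomes the left argument of the IROp. */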
7267 static
7268 ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
7269 Prefix pfx,
7270 Long delta,
7271 UChar opc,
7272 const HChar* name,
7273 Bool show_granularity )
7275 HChar dis_buf[50];
7276 UChar modrm = getUChar(delta);
7277 Bool isReg = epartIsReg(modrm);
7278 IRExpr* argL = NULL;
7279 IRExpr* argR = NULL;
7280 IRExpr* argG = NULL;
7281 IRExpr* argE = NULL;
7282 IRTemp res = newTemp(Ity_I64);
7284 Bool invG = False;
7285 IROp op = Iop_INVALID;
7286 void* hAddr = NULL;
7287 const HChar* hName = NULL;
7288 Bool eLeft = False;
7290 # define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
7292 switch (opc) {
7293 /* Original MMX ones */
7294 case 0xFC: op = Iop_Add8x8; break;
7295 case 0xFD: op = Iop_Add16x4; break;
7296 case 0xFE: op = Iop_Add32x2; break;
7298 case 0xEC: op = Iop_QAdd8Sx8; break;
7299 case 0xED: op = Iop_QAdd16Sx4; break;
7301 case 0xDC: op = Iop_QAdd8Ux8; break;
7302 case 0xDD: op = Iop_QAdd16Ux4; break;
7304 case 0xF8: op = Iop_Sub8x8; break;
7305 case 0xF9: op = Iop_Sub16x4; break;
7306 case 0xFA: op = Iop_Sub32x2; break;
7308 case 0xE8: op = Iop_QSub8Sx8; break;
7309 case 0xE9: op = Iop_QSub16Sx4; break;
7311 case 0xD8: op = Iop_QSub8Ux8; break;
7312 case 0xD9: op = Iop_QSub16Ux4; break;
7314 case 0xE5: op = Iop_MulHi16Sx4; break;
7315 case 0xD5: op = Iop_Mul16x4; break;
7316 case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
7318 case 0x74: op = Iop_CmpEQ8x8; break;
7319 case 0x75: op = Iop_CmpEQ16x4; break;
7320 case 0x76: op = Iop_CmpEQ32x2; break;
7322 case 0x64: op = Iop_CmpGT8Sx8; break;
7323 case 0x65: op = Iop_CmpGT16Sx4; break;
7324 case 0x66: op = Iop_CmpGT32Sx2; break;
7326 case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
7327 case 0x63: op = Iop_QNarrowBin16Sto8Sx8; eLeft = True; break;
7328 case 0x67: op = Iop_QNarrowBin16Sto8Ux8; eLeft = True; break;
7330 case 0x68: op = Iop_InterleaveHI8x8; eLeft = True; break;
7331 case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
7332 case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
7334 case 0x60: op = Iop_InterleaveLO8x8; eLeft = True; break;
7335 case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
7336 case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
7338 case 0xDB: op = Iop_And64; break;
7339 case 0xDF: op = Iop_And64; invG = True; break;
7340 case 0xEB: op = Iop_Or64; break;
7341 case 0xEF: /* Possibly do better here if argL and argR are the
7342 same reg */
7343 op = Iop_Xor64; break;
7345 /* Introduced in SSE1 */
7346 case 0xE0: op = Iop_Avg8Ux8; break;
7347 case 0xE3: op = Iop_Avg16Ux4; break;
7348 case 0xEE: op = Iop_Max16Sx4; break;
7349 case 0xDE: op = Iop_Max8Ux8; break;
7350 case 0xEA: op = Iop_Min16Sx4; break;
7351 case 0xDA: op = Iop_Min8Ux8; break;
7352 case 0xE4: op = Iop_MulHi16Ux4; break;
7353 case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
7355 /* Introduced in SSE2 */
7356 case 0xD4: op = Iop_Add64; break;
7357 case 0xFB: op = Iop_Sub64; break;
7359 default:
7360 vex_printf("\n0x%x\n", (UInt)opc);
7361 vpanic("dis_MMXop_regmem_to_reg");
7364 # undef XXX
7366 argG = getMMXReg(gregLO3ofRM(modrm));
7367 if (invG)
7368 argG = unop(Iop_Not64, argG);
7370 if (isReg) {
7371 delta++;
7372 argE = getMMXReg(eregLO3ofRM(modrm));
7373 } else {
7374 Int len;
7375 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7376 delta += len;
7377 argE = loadLE(Ity_I64, mkexpr(addr));
7380 if (eLeft) {
7381 argL = argE;
7382 argR = argG;
7383 } else {
7384 argL = argG;
7385 argR = argE;
7388 if (op != Iop_INVALID) {
7389 vassert(hName == NULL);
7390 vassert(hAddr == NULL);
7391 assign(res, binop(op, argL, argR));
7392 } else {
7393 vassert(hName != NULL);
7394 vassert(hAddr != NULL);
7395 assign( res,
7396 mkIRExprCCall(
7397 Ity_I64,
7398 0/*regparms*/, hName, hAddr,
7399 mkIRExprVec_2( argL, argR )
7404 putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
7406 DIP("%s%s %s, %s\n",
7407 name, show_granularity ? nameMMXGran(opc & 3) : "",
7408 ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
7409 nameMMXReg(gregLO3ofRM(modrm)) );
7411 return delta;
7415 /* Vector by scalar shift of G by the amount specified at the bottom
7416 of E. This is a straight copy of dis_SSE_shiftG_byE. */
7418 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
7419 Prefix pfx, Long delta,
7420 const HChar* opname, IROp op )
7422 HChar dis_buf[50];
7423 Int alen, size;
7424 IRTemp addr;
7425 Bool shl, shr, sar;
7426 UChar rm = getUChar(delta);
7427 IRTemp g0 = newTemp(Ity_I64);
7428 IRTemp g1 = newTemp(Ity_I64);
7429 IRTemp amt = newTemp(Ity_I64);
7430 IRTemp amt8 = newTemp(Ity_I8);
7432 if (epartIsReg(rm)) {
7433 assign( amt, getMMXReg(eregLO3ofRM(rm)) );
7434 DIP("%s %s,%s\n", opname,
7435 nameMMXReg(eregLO3ofRM(rm)),
7436 nameMMXReg(gregLO3ofRM(rm)) );
7437 delta++;
7438 } else {
7439 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
7440 assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
7441 DIP("%s %s,%s\n", opname,
7442 dis_buf,
7443 nameMMXReg(gregLO3ofRM(rm)) );
7444 delta += alen;
7446 assign( g0, getMMXReg(gregLO3ofRM(rm)) );
7447 assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
7449 shl = shr = sar = False;
7450 size = 0;
7451 switch (op) {
7452       case Iop_ShlN16x4: shl = True; size = 16; break;
7453 case Iop_ShlN32x2: shl = True; size = 32; break;
7454 case Iop_Shl64: shl = True; size = 64; break;
7455 case Iop_ShrN16x4: shr = True; size = 16; break;
7456 case Iop_ShrN32x2: shr = True; size = 32; break;
7457 case Iop_Shr64: shr = True; size = 64; break;
7458 case Iop_SarN16x4: sar = True; size = 16; break;
7459 case Iop_SarN32x2: sar = True; size = 32; break;
7460 default: vassert(0);
7463 if (shl || shr) {
7464 assign(
7466 IRExpr_ITE(
7467 binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7468 binop(op, mkexpr(g0), mkexpr(amt8)),
7469 mkU64(0)
7472 } else
7473 if (sar) {
7474 assign(
7476 IRExpr_ITE(
7477 binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7478 binop(op, mkexpr(g0), mkexpr(amt8)),
7479 binop(op, mkexpr(g0), mkU8(size-1))
7482 } else {
7483 vassert(0);
7486 putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
7487 return delta;
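/* Worked example of the out-of-range handling above: MMX shift-by-register
   counts are not masked, so e.g. PSRLW by 100 must produce all-zero lanes
   (the amt < size test fails and mkU64(0) is selected), while PSRAW by 100
   must behave like a shift by 15, replicating each lane's sign bit (the
   sar branch substitutes size-1). */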
7491 /* Vector by scalar shift of E by an immediate byte. This is a
7492 straight copy of dis_SSE_shiftE_imm. */
7494 static
7495 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
7497 Bool shl, shr, sar;
7498 UChar rm = getUChar(delta);
7499 IRTemp e0 = newTemp(Ity_I64);
7500 IRTemp e1 = newTemp(Ity_I64);
7501 UChar amt, size;
7502 vassert(epartIsReg(rm));
7503 vassert(gregLO3ofRM(rm) == 2
7504 || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
7505 amt = getUChar(delta+1);
7506 delta += 2;
7507 DIP("%s $%d,%s\n", opname,
7508 (Int)amt,
7509 nameMMXReg(eregLO3ofRM(rm)) );
7511 assign( e0, getMMXReg(eregLO3ofRM(rm)) );
7513 shl = shr = sar = False;
7514 size = 0;
7515 switch (op) {
7516 case Iop_ShlN16x4: shl = True; size = 16; break;
7517 case Iop_ShlN32x2: shl = True; size = 32; break;
7518 case Iop_Shl64: shl = True; size = 64; break;
7519 case Iop_SarN16x4: sar = True; size = 16; break;
7520 case Iop_SarN32x2: sar = True; size = 32; break;
7521 case Iop_ShrN16x4: shr = True; size = 16; break;
7522 case Iop_ShrN32x2: shr = True; size = 32; break;
7523 case Iop_Shr64: shr = True; size = 64; break;
7524 default: vassert(0);
7527 if (shl || shr) {
7528 assign( e1, amt >= size
7529 ? mkU64(0)
7530 : binop(op, mkexpr(e0), mkU8(amt))
7532 } else
7533 if (sar) {
7534 assign( e1, amt >= size
7535 ? binop(op, mkexpr(e0), mkU8(size-1))
7536 : binop(op, mkexpr(e0), mkU8(amt))
7538 } else {
7539 vassert(0);
7542 putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
7543 return delta;
7547 /* Completely handle all MMX instructions except emms. */
7549 static
7550 ULong dis_MMX ( Bool* decode_ok,
7551 const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
7553 Int len;
7554 UChar modrm;
7555 HChar dis_buf[50];
7556 UChar opc = getUChar(delta);
7557 delta++;
7559 /* dis_MMX handles all insns except emms. */
7560 do_MMX_preamble();
7562 switch (opc) {
7564 case 0x6E:
7565 if (sz == 4) {
7566 /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
7567 modrm = getUChar(delta);
7568 if (epartIsReg(modrm)) {
7569 delta++;
7570 putMMXReg(
7571 gregLO3ofRM(modrm),
7572 binop( Iop_32HLto64,
7573 mkU32(0),
7574 getIReg32(eregOfRexRM(pfx,modrm)) ) );
7575 DIP("movd %s, %s\n",
7576 nameIReg32(eregOfRexRM(pfx,modrm)),
7577 nameMMXReg(gregLO3ofRM(modrm)));
7578 } else {
7579 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7580 delta += len;
7581 putMMXReg(
7582 gregLO3ofRM(modrm),
7583 binop( Iop_32HLto64,
7584 mkU32(0),
7585 loadLE(Ity_I32, mkexpr(addr)) ) );
7586 DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7589 else
7590 if (sz == 8) {
7591 /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
7592 modrm = getUChar(delta);
7593 if (epartIsReg(modrm)) {
7594 delta++;
7595 putMMXReg( gregLO3ofRM(modrm),
7596 getIReg64(eregOfRexRM(pfx,modrm)) );
7597 DIP("movd %s, %s\n",
7598 nameIReg64(eregOfRexRM(pfx,modrm)),
7599 nameMMXReg(gregLO3ofRM(modrm)));
7600 } else {
7601 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7602 delta += len;
7603 putMMXReg( gregLO3ofRM(modrm),
7604 loadLE(Ity_I64, mkexpr(addr)) );
7605 DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7608 else {
7609 goto mmx_decode_failure;
7611 break;
7613 case 0x7E:
7614 if (sz == 4) {
7615 /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
7616 modrm = getUChar(delta);
7617 if (epartIsReg(modrm)) {
7618 delta++;
7619 putIReg32( eregOfRexRM(pfx,modrm),
7620 unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7621 DIP("movd %s, %s\n",
7622 nameMMXReg(gregLO3ofRM(modrm)),
7623 nameIReg32(eregOfRexRM(pfx,modrm)));
7624 } else {
7625 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7626 delta += len;
7627 storeLE( mkexpr(addr),
7628 unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7629 DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7632 else
7633 if (sz == 8) {
7634 /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
7635 modrm = getUChar(delta);
7636 if (epartIsReg(modrm)) {
7637 delta++;
7638 putIReg64( eregOfRexRM(pfx,modrm),
7639 getMMXReg(gregLO3ofRM(modrm)) );
7640 DIP("movd %s, %s\n",
7641 nameMMXReg(gregLO3ofRM(modrm)),
7642 nameIReg64(eregOfRexRM(pfx,modrm)));
7643 } else {
7644 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7645 delta += len;
7646 storeLE( mkexpr(addr),
7647 getMMXReg(gregLO3ofRM(modrm)) );
7648 DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7650 } else {
7651 goto mmx_decode_failure;
7653 break;
7655 case 0x6F:
7656 /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
7657 if (sz != 4
7658 && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7659 goto mmx_decode_failure;
7660 modrm = getUChar(delta);
7661 if (epartIsReg(modrm)) {
7662 delta++;
7663 putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
7664 DIP("movq %s, %s\n",
7665 nameMMXReg(eregLO3ofRM(modrm)),
7666 nameMMXReg(gregLO3ofRM(modrm)));
7667 } else {
7668 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7669 delta += len;
7670 putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
7671 DIP("movq %s, %s\n",
7672 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7674 break;
7676 case 0x7F:
7677 /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
7678 if (sz != 4
7679 && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7680 goto mmx_decode_failure;
7681 modrm = getUChar(delta);
7682 if (epartIsReg(modrm)) {
7683 delta++;
7684 putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
7685 DIP("movq %s, %s\n",
7686 nameMMXReg(gregLO3ofRM(modrm)),
7687 nameMMXReg(eregLO3ofRM(modrm)));
7688 } else {
7689 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7690 delta += len;
7691 storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
7692 DIP("mov(nt)q %s, %s\n",
7693 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7695 break;
7697 case 0xFC:
7698 case 0xFD:
7699 case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
7700 if (sz != 4)
7701 goto mmx_decode_failure;
7702 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
7703 break;
7705 case 0xEC:
7706 case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
7707 if (sz != 4
7708 && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7709 goto mmx_decode_failure;
7710 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
7711 break;
7713 case 0xDC:
7714 case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7715 if (sz != 4)
7716 goto mmx_decode_failure;
7717 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
7718 break;
7720 case 0xF8:
7721 case 0xF9:
7722 case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
7723 if (sz != 4)
7724 goto mmx_decode_failure;
7725 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
7726 break;
7728 case 0xE8:
7729 case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
7730 if (sz != 4)
7731 goto mmx_decode_failure;
7732 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
7733 break;
7735 case 0xD8:
7736 case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7737 if (sz != 4)
7738 goto mmx_decode_failure;
7739 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
7740 break;
7742 case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
7743 if (sz != 4)
7744 goto mmx_decode_failure;
7745 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
7746 break;
7748 case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
7749 if (sz != 4)
7750 goto mmx_decode_failure;
7751 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
7752 break;
7754 case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
7755 vassert(sz == 4);
7756 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
7757 break;
7759 case 0x74:
7760 case 0x75:
7761 case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
7762 if (sz != 4)
7763 goto mmx_decode_failure;
7764 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
7765 break;
7767 case 0x64:
7768 case 0x65:
7769 case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
7770 if (sz != 4)
7771 goto mmx_decode_failure;
7772 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
7773 break;
7775 case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
7776 if (sz != 4)
7777 goto mmx_decode_failure;
7778 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
7779 break;
7781 case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
7782 if (sz != 4)
7783 goto mmx_decode_failure;
7784 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
7785 break;
7787 case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
7788 if (sz != 4)
7789 goto mmx_decode_failure;
7790 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
7791 break;
7793 case 0x68:
7794 case 0x69:
7795 case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
7796 if (sz != 4
7797 && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7798 goto mmx_decode_failure;
7799 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
7800 break;
7802 case 0x60:
7803 case 0x61:
7804 case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
7805 if (sz != 4
7806 && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7807 goto mmx_decode_failure;
7808 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
7809 break;
7811 case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
7812 if (sz != 4)
7813 goto mmx_decode_failure;
7814 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
7815 break;
7817 case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
7818 if (sz != 4)
7819 goto mmx_decode_failure;
7820 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
7821 break;
7823 case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
7824 if (sz != 4)
7825 goto mmx_decode_failure;
7826 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
7827 break;
7829 case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
7830 if (sz != 4)
7831 goto mmx_decode_failure;
7832 delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
7833 break;
7835 # define SHIFT_BY_REG(_name,_op) \
7836 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
7837 break;
7839 /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
7840 case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
7841 case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
7842 case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
7844 /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
7845 case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
7846 case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
7847 case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
7849 /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
7850 case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
7851 case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
7853 # undef SHIFT_BY_REG
7855 case 0x71:
7856 case 0x72:
7857 case 0x73: {
7858 /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
7859 UChar byte2, subopc;
7860 if (sz != 4)
7861 goto mmx_decode_failure;
7862 byte2 = getUChar(delta); /* amode / sub-opcode */
7863 subopc = toUChar( (byte2 >> 3) & 7 );
7865 # define SHIFT_BY_IMM(_name,_op) \
7866 do { delta = dis_MMX_shiftE_imm(delta,_name,_op); \
7867 } while (0)
7869 if (subopc == 2 /*SRL*/ && opc == 0x71)
7870 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
7871 else if (subopc == 2 /*SRL*/ && opc == 0x72)
7872 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
7873 else if (subopc == 2 /*SRL*/ && opc == 0x73)
7874 SHIFT_BY_IMM("psrlq", Iop_Shr64);
7876 else if (subopc == 4 /*SAR*/ && opc == 0x71)
7877 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
7878 else if (subopc == 4 /*SAR*/ && opc == 0x72)
7879 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
7881 else if (subopc == 6 /*SHL*/ && opc == 0x71)
7882 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
7883 else if (subopc == 6 /*SHL*/ && opc == 0x72)
7884 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
7885 else if (subopc == 6 /*SHL*/ && opc == 0x73)
7886 SHIFT_BY_IMM("psllq", Iop_Shl64);
7888 else goto mmx_decode_failure;
7890 # undef SHIFT_BY_IMM
7891 break;
7894 case 0xF7: {
7895 IRTemp addr = newTemp(Ity_I64);
7896 IRTemp regD = newTemp(Ity_I64);
7897 IRTemp regM = newTemp(Ity_I64);
7898 IRTemp mask = newTemp(Ity_I64);
7899 IRTemp olddata = newTemp(Ity_I64);
7900 IRTemp newdata = newTemp(Ity_I64);
7902 modrm = getUChar(delta);
7903 if (sz != 4 || (!epartIsReg(modrm)))
7904 goto mmx_decode_failure;
7905 delta++;
7907 assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
7908 assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
7909 assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
7910 assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
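         /* Note: Iop_SarN8x8 with shift amount 7 replicates each byte's
            top (sign) bit across the whole byte, so each lane of 'mask'
            becomes 0x00 or 0xFF according to the top bit of the
            corresponding byte of regM -- exactly the per-byte select
            that MASKMOVQ needs when merging regD into the old data. */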
7911 assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
7912 assign( newdata,
7913 binop(Iop_Or64,
7914 binop(Iop_And64,
7915 mkexpr(regD),
7916 mkexpr(mask) ),
7917 binop(Iop_And64,
7918 mkexpr(olddata),
7919 unop(Iop_Not64, mkexpr(mask)))) );
7920 storeLE( mkexpr(addr), mkexpr(newdata) );
7921 DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
7922 nameMMXReg( gregLO3ofRM(modrm) ) );
7923 break;
7926 /* --- MMX decode failure --- */
7927 default:
7928 mmx_decode_failure:
7929 *decode_ok = False;
7930 return delta; /* ignored */
7934 *decode_ok = True;
7935 return delta;
7939 /*------------------------------------------------------------*/
7940 /*--- More misc arithmetic and other obscure insns. ---*/
7941 /*------------------------------------------------------------*/
7943 /* Generate base << amt with vacated places filled with stuff
7944 from xtra. amt guaranteed in 0 .. 63. */
7945 static
7946 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
7948 /* if amt == 0
7949 then base
7950 else (base << amt) | (xtra >>u (64-amt))
7952 return
7953 IRExpr_ITE(
7954 binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7955 binop(Iop_Or64,
7956 binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
7957 binop(Iop_Shr64, mkexpr(xtra),
7958 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7960 mkexpr(base)
7964 /* Generate base >>u amt with vacated places filled with stuff
7965 from xtra. amt guaranteed in 0 .. 63. */
7966 static
7967 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
7969 /* if amt == 0
7970 then base
7971 else (base >>u amt) | (xtra << (64-amt))
7973 return
7974 IRExpr_ITE(
7975 binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7976 binop(Iop_Or64,
7977 binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
7978 binop(Iop_Shl64, mkexpr(xtra),
7979 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7981 mkexpr(base)
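/* Illustrative example (not from the original source): with
   base = 0xFFFF000000000000, xtra = 0x8000000000000001 and amt = 4,
   shiftL64_with_extras produces
      (base << 4) | (xtra >>u 60) = 0xFFF0000000000008.
   The ITE guard exists because, for amt == 0, the "xtra" term would
   need a shift by 64, which is not a defined shift amount; and in any
   case the correct result for amt == 0 is simply base, unchanged. */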
7985 /* Double length left and right shifts. Apparently only required in
7986 v-size (no b- variant). */
7987 static
7988 ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
7989 Prefix pfx,
7990 Long delta, UChar modrm,
7991 Int sz,
7992 IRExpr* shift_amt,
7993 Bool amt_is_literal,
7994 const HChar* shift_amt_txt,
7995 Bool left_shift )
7997 /* shift_amt :: Ity_I8 is the amount to shift. shift_amt_txt is used
7998 for printing it. And delta on entry points at the modrm byte. */
7999 Int len;
8000 HChar dis_buf[50];
8002 IRType ty = szToITy(sz);
8003 IRTemp gsrc = newTemp(ty);
8004 IRTemp esrc = newTemp(ty);
8005 IRTemp addr = IRTemp_INVALID;
8006 IRTemp tmpSH = newTemp(Ity_I8);
8007 IRTemp tmpSS = newTemp(Ity_I8);
8008 IRTemp tmp64 = IRTemp_INVALID;
8009 IRTemp res64 = IRTemp_INVALID;
8010 IRTemp rss64 = IRTemp_INVALID;
8011 IRTemp resTy = IRTemp_INVALID;
8012 IRTemp rssTy = IRTemp_INVALID;
8013 Int mask = sz==8 ? 63 : 31;
8015 vassert(sz == 2 || sz == 4 || sz == 8);
8017 /* The E-part is the destination; this is shifted. The G-part
8018 supplies bits to be shifted into the E-part, but is not
8019 changed.
8021 If shifting left, form a double-length word with E at the top
8022 and G at the bottom, and shift this left. The result is then in
8023 the high part.
8025 If shifting right, form a double-length word with G at the top
8026 and E at the bottom, and shift this right. The result is then
8027 at the bottom. */
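/* Worked example (illustrative only), for the 32-bit left-shift case
   handled below: "shldl $8, %ebx, %eax" with %eax (E) = 0x11223344 and
   %ebx (G) = 0xAABBCCDD forms tmp64 = 0x11223344AABBCCDD, shifts it
   left by 8 to get 0x223344AABBCCDD00, and keeps the high 32 bits,
   giving %eax = 0x223344AA -- i.e. E shifted left, with G's top bits
   shifted in from the right. */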
8029 /* Fetch the operands. */
8031 assign( gsrc, getIRegG(sz, pfx, modrm) );
8033 if (epartIsReg(modrm)) {
8034 delta++;
8035 assign( esrc, getIRegE(sz, pfx, modrm) );
8036 DIP("sh%cd%c %s, %s, %s\n",
8037 ( left_shift ? 'l' : 'r' ), nameISize(sz),
8038 shift_amt_txt,
8039 nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
8040 } else {
8041 addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
8042 /* # bytes following amode */
8043 amt_is_literal ? 1 : 0 );
8044 delta += len;
8045 assign( esrc, loadLE(ty, mkexpr(addr)) );
8046 DIP("sh%cd%c %s, %s, %s\n",
8047 ( left_shift ? 'l' : 'r' ), nameISize(sz),
8048 shift_amt_txt,
8049 nameIRegG(sz, pfx, modrm), dis_buf);
8052 /* Calculate the masked shift amount (tmpSH), the masked subshift
8053 amount (tmpSS), the shifted value (res64) and the subshifted
8054 value (rss64). */
8056 assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
8057 assign( tmpSS, binop(Iop_And8,
8058 binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
8059 mkU8(mask)));
8061 tmp64 = newTemp(Ity_I64);
8062 res64 = newTemp(Ity_I64);
8063 rss64 = newTemp(Ity_I64);
8065 if (sz == 2 || sz == 4) {
8067 /* G is xtra; E is data */
8068 /* what a freaking nightmare: */
8069 if (sz == 4 && left_shift) {
8070 assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
8071 assign( res64,
8072 binop(Iop_Shr64,
8073 binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
8074 mkU8(32)) );
8075 assign( rss64,
8076 binop(Iop_Shr64,
8077 binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
8078 mkU8(32)) );
8080 else
8081 if (sz == 4 && !left_shift) {
8082 assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
8083 assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
8084 assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
8086 else
8087 if (sz == 2 && left_shift) {
8088 assign( tmp64,
8089 binop(Iop_32HLto64,
8090 binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
8091 binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
8093 /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
8094 assign( res64,
8095 binop(Iop_Shr64,
8096 binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
8097 mkU8(48)) );
8098 /* subshift formed by shifting [esrc'0000'0000'0000] */
8099 assign( rss64,
8100 binop(Iop_Shr64,
8101 binop(Iop_Shl64,
8102 binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
8103 mkU8(48)),
8104 mkexpr(tmpSS)),
8105 mkU8(48)) );
8107 else
8108 if (sz == 2 && !left_shift) {
8109 assign( tmp64,
8110 binop(Iop_32HLto64,
8111 binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
8112 binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
8114 /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
8115 assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
8116 /* subshift formed by shifting [0000'0000'0000'esrc] */
8117 assign( rss64, binop(Iop_Shr64,
8118 unop(Iop_16Uto64, mkexpr(esrc)),
8119 mkexpr(tmpSS)) );
8122 } else {
8124 vassert(sz == 8);
8125 if (left_shift) {
8126 assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
8127 assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
8128 } else {
8129 assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
8130 assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
8135 resTy = newTemp(ty);
8136 rssTy = newTemp(ty);
8137 assign( resTy, narrowTo(ty, mkexpr(res64)) );
8138 assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
8140 /* Put result back and write the flags thunk. */
8141 setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
8142 resTy, rssTy, ty, tmpSH );
8144 if (epartIsReg(modrm)) {
8145 putIRegE(sz, pfx, modrm, mkexpr(resTy));
8146 } else {
8147 storeLE( mkexpr(addr), mkexpr(resTy) );
8150 if (amt_is_literal) delta++;
8151 return delta;
8155 /* Handle BT/BTS/BTR/BTC Gv, Ev. Apparently b-size is not
8156 required. */
8158 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
8160 static const HChar* nameBtOp ( BtOp op )
8162 switch (op) {
8163 case BtOpNone: return "";
8164 case BtOpSet: return "s";
8165 case BtOpReset: return "r";
8166 case BtOpComp: return "c";
8167 default: vpanic("nameBtOp(amd64)");
8172 static
8173 ULong dis_bt_G_E ( const VexAbiInfo* vbi,
8174 Prefix pfx, Int sz, Long delta, BtOp op,
8175 /*OUT*/Bool* decode_OK )
8177 HChar dis_buf[50];
8178 UChar modrm;
8179 Int len;
8180 IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
8181 t_addr1, t_rsp, t_mask, t_new;
8183 vassert(sz == 2 || sz == 4 || sz == 8);
8185 t_fetched = t_bitno0 = t_bitno1 = t_bitno2
8186 = t_addr0 = t_addr1 = t_rsp
8187 = t_mask = t_new = IRTemp_INVALID;
8189 t_fetched = newTemp(Ity_I8);
8190 t_new = newTemp(Ity_I8);
8191 t_bitno0 = newTemp(Ity_I64);
8192 t_bitno1 = newTemp(Ity_I64);
8193 t_bitno2 = newTemp(Ity_I8);
8194 t_addr1 = newTemp(Ity_I64);
8195 modrm = getUChar(delta);
8197 *decode_OK = True;
8198 if (epartIsReg(modrm)) {
8199 /* F2 and F3 are never acceptable. */
8200 if (haveF2orF3(pfx)) {
8201 *decode_OK = False;
8202 return delta;
8204 } else {
8205 /* F2 or F3 (but not both) are allowed, provided LOCK is also
8206 present, and only for the BTC/BTS/BTR cases (not BT). */
8207 if (haveF2orF3(pfx)) {
8208 if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
8209 *decode_OK = False;
8210 return delta;
8215 assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
8217 if (epartIsReg(modrm)) {
8218 delta++;
8219 /* Get it onto the client's stack. Oh, this is a horrible
8220 kludge. See https://bugs.kde.org/show_bug.cgi?id=245925.
8221 Because of the ELF ABI stack redzone, there may be live data
8222 up to 128 bytes below %RSP. So we can't just push it on the
8223 stack, else we may wind up trashing live data, and causing
8224 impossible-to-find simulation errors. (Yes, this did
8225 happen.) So we need to drop RSP by at least 128 before
8226 pushing it. That unfortunately means hitting Memcheck's
8227 fast-case painting code. Ideally we should drop more than
8228 128, to reduce the chances of breaking buggy programs that
8229 have live data below -128(%RSP). Memcheck fast-cases moves
8230 of 288 bytes due to the need to handle ppc64-linux quickly,
8231 so let's use 288. Of course the real fix is to get rid of
8232 this kludge entirely. */
8233 t_rsp = newTemp(Ity_I64);
8234 t_addr0 = newTemp(Ity_I64);
8236 vassert(vbi->guest_stack_redzone_size == 128);
8237 assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
8238 putIReg64(R_RSP, mkexpr(t_rsp));
8240 storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
8242 /* Make t_addr0 point at it. */
8243 assign( t_addr0, mkexpr(t_rsp) );
8245 /* Mask out upper bits of the shift amount, since we're doing a
8246 reg. */
8247 assign( t_bitno1, binop(Iop_And64,
8248 mkexpr(t_bitno0),
8249 mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
8251 } else {
8252 t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
8253 delta += len;
8254 assign( t_bitno1, mkexpr(t_bitno0) );
8257 /* At this point: t_addr0 is the address being operated on. If it
8258 was a reg, we will have pushed it onto the client's stack.
8259 t_bitno1 is the bit number, suitably masked in the case of a
8260 reg. */
8262 /* Now the main sequence. */
8263 assign( t_addr1,
8264 binop(Iop_Add64,
8265 mkexpr(t_addr0),
8266 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
8268 /* t_addr1 now holds effective address */
8270 assign( t_bitno2,
8271 unop(Iop_64to8,
8272 binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
8274 /* t_bitno2 contains offset of bit within byte */
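   /* Example (illustrative): BT with bit offset 37 against a memory
      operand ends up accessing the byte at t_addr0 + (37 >> 3) =
      t_addr0 + 4, and tests bit 37 & 7 = 5 within that byte, i.e.
      mask 0x20.  For the register form the offset has already been
      masked to 0 .. 8*sz-1 above, so it never reaches outside the
      register image that was placed on the stack. */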
8276 if (op != BtOpNone) {
8277 t_mask = newTemp(Ity_I8);
8278 assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
8281 /* t_mask is now a suitable byte mask */
8283 assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
8285 if (op != BtOpNone) {
8286 switch (op) {
8287 case BtOpSet:
8288 assign( t_new,
8289 binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
8290 break;
8291 case BtOpComp:
8292 assign( t_new,
8293 binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
8294 break;
8295 case BtOpReset:
8296 assign( t_new,
8297 binop(Iop_And8, mkexpr(t_fetched),
8298 unop(Iop_Not8, mkexpr(t_mask))) );
8299 break;
8300 default:
8301 vpanic("dis_bt_G_E(amd64)");
8303 if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
8304 casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
8305 mkexpr(t_new)/*new*/,
8306 guest_RIP_curr_instr );
8307 } else {
8308 storeLE( mkexpr(t_addr1), mkexpr(t_new) );
8312 /* Side effect done; now get selected bit into Carry flag. The Intel docs
8313 (as of 2015, at least) say that C holds the result, Z is unchanged, and
8314 O,S,A and P are undefined. However, on Skylake it appears that O,S,A,P
8315 are also unchanged, so let's do that. */
8316 const ULong maskC = AMD64G_CC_MASK_C;
8317 const ULong maskOSZAP = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S
8318 | AMD64G_CC_MASK_Z | AMD64G_CC_MASK_A
8319 | AMD64G_CC_MASK_P;
8321 IRTemp old_rflags = newTemp(Ity_I64);
8322 assign(old_rflags, mk_amd64g_calculate_rflags_all());
8324 IRTemp new_rflags = newTemp(Ity_I64);
8325 assign(new_rflags,
8326 binop(Iop_Or64,
8327 binop(Iop_And64, mkexpr(old_rflags), mkU64(maskOSZAP)),
8328 binop(Iop_And64,
8329 binop(Iop_Shr64,
8330 unop(Iop_8Uto64, mkexpr(t_fetched)),
8331 mkexpr(t_bitno2)),
8332 mkU64(maskC))));
8334 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
8335 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8336 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
8337 /* Set NDEP even though it isn't used. This makes redundant-PUT
8338 elimination of previous stores to this field work better. */
8339 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8341 /* Move reg operand from stack back to reg */
8342 if (epartIsReg(modrm)) {
8343 /* t_rsp still points at it. */
8344 /* only write the reg if actually modifying it; doing otherwise
8345 zeroes the top half erroneously when doing btl due to
8346 the standard zero-extend rule */
8347 if (op != BtOpNone)
8348 putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
8349 putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
8352 DIP("bt%s%c %s, %s\n",
8353 nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
8354 ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
8356 return delta;
8361 /* Handle BSF/BSR. Only v-size seems necessary. */
8362 static
8363 ULong dis_bs_E_G ( const VexAbiInfo* vbi,
8364 Prefix pfx, Int sz, Long delta, Bool fwds )
8366 Bool isReg;
8367 UChar modrm;
8368 HChar dis_buf[50];
8370 IRType ty = szToITy(sz);
8371 IRTemp src = newTemp(ty);
8372 IRTemp dst = newTemp(ty);
8373 IRTemp src64 = newTemp(Ity_I64);
8374 IRTemp dst64 = newTemp(Ity_I64);
8375 IRTemp srcB = newTemp(Ity_I1);
8377 vassert(sz == 8 || sz == 4 || sz == 2);
8379 modrm = getUChar(delta);
8380 isReg = epartIsReg(modrm);
8381 if (isReg) {
8382 delta++;
8383 assign( src, getIRegE(sz, pfx, modrm) );
8384 } else {
8385 Int len;
8386 IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
8387 delta += len;
8388 assign( src, loadLE(ty, mkexpr(addr)) );
8391 DIP("bs%c%c %s, %s\n",
8392 fwds ? 'f' : 'r', nameISize(sz),
8393 ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
8394 nameIRegG(sz, pfx, modrm));
8396 /* First, widen src to 64 bits if it is not already. */
8397 assign( src64, widenUto64(mkexpr(src)) );
8399 /* Generate a bool expression which is zero iff the original is
8400 zero, and nonzero otherwise. Ask for a CmpNE version which, if
8401 instrumented by Memcheck, is instrumented expensively, since
8402 this may be used on the output of a preceding movmskb insn,
8403 which has been known to be partially defined, and in need of
8404 careful handling. */
8405 assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );
8407 /* Flags: Z is 1 iff source value is zero. All others
8408 are undefined -- we force them to zero. */
8409 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
8410 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8411 stmt( IRStmt_Put(
8412 OFFB_CC_DEP1,
8413 IRExpr_ITE( mkexpr(srcB),
8414 /* src!=0 */
8415 mkU64(0),
8416 /* src==0 */
8417 mkU64(AMD64G_CC_MASK_Z)
8420 /* Set NDEP even though it isn't used. This makes redundant-PUT
8421 elimination of previous stores to this field work better. */
8422 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8424 /* Result: if the source value is zero, we can't use
8425 Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
8426 But anyway, amd64 semantics say the result is undefined in
8427 such situations. Hence handle the zero case specially. */
8429 /* Bleh. What we compute:
8431 bsf64: if src == 0 then {dst is unchanged}
8432 else Ctz64(src)
8434 bsr64: if src == 0 then {dst is unchanged}
8435 else 63 - Clz64(src)
8437 bsf32: if src == 0 then {dst is unchanged}
8438 else Ctz64(32Uto64(src))
8440 bsr32: if src == 0 then {dst is unchanged}
8441 else 63 - Clz64(32Uto64(src))
8443 bsf16: if src == 0 then {dst is unchanged}
8444 else Ctz64(32Uto64(16Uto32(src)))
8446 bsr16: if src == 0 then {dst is unchanged}
8447 else 63 - Clz64(32Uto64(16Uto32(src)))
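      Worked example (illustrative): for src = 0x00F0 (bits 4..7 set),
      bsf gives Ctz64(0xF0) = 4 and bsr gives 63 - Clz64(0xF0)
      = 63 - 56 = 7, i.e. the lowest and highest set bit positions
      respectively.  For src = 0 the guarded ITE below leaves the
      destination register unchanged.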
8450 /* The main computation, guarding against zero. */
8451 assign( dst64,
8452 IRExpr_ITE(
8453 mkexpr(srcB),
8454 /* src != 0 */
8455 fwds ? unop(Iop_Ctz64, mkexpr(src64))
8456 : binop(Iop_Sub64,
8457 mkU64(63),
8458 unop(Iop_Clz64, mkexpr(src64))),
8459 /* src == 0 -- leave dst unchanged */
8460 widenUto64( getIRegG( sz, pfx, modrm ) )
8464 if (sz == 2)
8465 assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
8466 else
8467 if (sz == 4)
8468 assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
8469 else
8470 assign( dst, mkexpr(dst64) );
8472 /* dump result back */
8473 putIRegG( sz, pfx, modrm, mkexpr(dst) );
8475 return delta;
8479 /* swap rAX with the reg specified by reg and REX.B */
8480 static
8481 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
8483 IRType ty = szToITy(sz);
8484 IRTemp t1 = newTemp(ty);
8485 IRTemp t2 = newTemp(ty);
8486 vassert(sz == 2 || sz == 4 || sz == 8);
8487 vassert(regLo3 < 8);
8488 if (sz == 8) {
8489 assign( t1, getIReg64(R_RAX) );
8490 assign( t2, getIRegRexB(8, pfx, regLo3) );
8491 putIReg64( R_RAX, mkexpr(t2) );
8492 putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
8493 } else if (sz == 4) {
8494 assign( t1, getIReg32(R_RAX) );
8495 assign( t2, getIRegRexB(4, pfx, regLo3) );
8496 putIReg32( R_RAX, mkexpr(t2) );
8497 putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
8498 } else {
8499 assign( t1, getIReg16(R_RAX) );
8500 assign( t2, getIRegRexB(2, pfx, regLo3) );
8501 putIReg16( R_RAX, mkexpr(t2) );
8502 putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
8504 DIP("xchg%c %s, %s\n",
8505 nameISize(sz), nameIRegRAX(sz),
8506 nameIRegRexB(sz,pfx, regLo3));
8510 static
8511 void codegen_SAHF ( void )
8513 /* Set the flags to:
8514 (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
8515 -- retain the old O flag
8516 | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8517 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
8519 ULong mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8520 |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8521 IRTemp oldflags = newTemp(Ity_I64);
8522 assign( oldflags, mk_amd64g_calculate_rflags_all() );
8523 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
8524 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8525 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8526 stmt( IRStmt_Put( OFFB_CC_DEP1,
8527 binop(Iop_Or64,
8528 binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
8529 binop(Iop_And64,
8530 binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
8531 mkU64(mask_SZACP))
8537 static
8538 void codegen_LAHF ( void )
8540 /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
8541 IRExpr* rax_with_hole;
8542 IRExpr* new_byte;
8543 IRExpr* new_rax;
8544 ULong mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8545 |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8547 IRTemp flags = newTemp(Ity_I64);
8548 assign( flags, mk_amd64g_calculate_rflags_all() );
8550 rax_with_hole
8551 = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
8552 new_byte
8553 = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
8554 mkU64(1<<1));
8555 new_rax
8556 = binop(Iop_Or64, rax_with_hole,
8557 binop(Iop_Shl64, new_byte, mkU8(8)));
8558 putIReg64(R_RAX, new_rax);
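/* Illustrative note: the AMD64G_CC_MASK_{C,P,A,Z,S} constants are the
   rflags bit positions (bits 0, 2, 4, 6, 7 respectively), so the masked
   flags value already has the SZAPC bits in the right places for AH.
   The mkU64(1<<1) term sets bit 1, the always-one reserved rflags bit,
   giving AH = SF:ZF:0:AF:0:PF:1:CF as the comment above states.  For
   example, with only Z and C set, AH becomes 0x43. */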
8562 static
8563 ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
8564 const VexAbiInfo* vbi,
8565 Prefix pfx,
8566 Int size,
8567 Long delta0 )
8569 HChar dis_buf[50];
8570 Int len;
8572 IRType ty = szToITy(size);
8573 IRTemp acc = newTemp(ty);
8574 IRTemp src = newTemp(ty);
8575 IRTemp dest = newTemp(ty);
8576 IRTemp dest2 = newTemp(ty);
8577 IRTemp acc2 = newTemp(ty);
8578 IRTemp cond = newTemp(Ity_I1);
8579 IRTemp addr = IRTemp_INVALID;
8580 UChar rm = getUChar(delta0);
8582 /* There are 3 cases to consider:
8584 reg-reg: ignore any lock prefix, generate sequence based
8585 on ITE
8587 reg-mem, not locked: ignore any lock prefix, generate sequence
8588 based on ITE
8590 reg-mem, locked: use IRCAS
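      In all three cases the operation implemented is the usual CMPXCHG
      semantics (sketch in C-like pseudocode, for illustration):

         if (rAX == *dst) { ZF = 1; *dst = src; }
         else             { ZF = 0; rAX = *dst; }

      with the other arithmetic flags set as for "cmp rAX, *dst"; the
      three cases differ only in whether the destination is a register
      or memory, and whether the update must be atomic (IRCAS).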
8593 /* Decide whether F2 or F3 are acceptable. Never for register
8594 case, but for the memory case, one or the other is OK provided
8595 LOCK is also present. */
8596 if (epartIsReg(rm)) {
8597 if (haveF2orF3(pfx)) {
8598 *ok = False;
8599 return delta0;
8601 } else {
8602 if (haveF2orF3(pfx)) {
8603 if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
8604 *ok = False;
8605 return delta0;
8610 if (epartIsReg(rm)) {
8611 /* case 1 */
8612 assign( dest, getIRegE(size, pfx, rm) );
8613 delta0++;
8614 assign( src, getIRegG(size, pfx, rm) );
8615 assign( acc, getIRegRAX(size) );
8616 setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8617 assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8618 assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8619 assign( acc2, IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8620 putIRegRAX(size, mkexpr(acc2));
8621 putIRegE(size, pfx, rm, mkexpr(dest2));
8622 DIP("cmpxchg%c %s,%s\n", nameISize(size),
8623 nameIRegG(size,pfx,rm),
8624 nameIRegE(size,pfx,rm) );
8626 else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8627 /* case 2 */
8628 addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8629 assign( dest, loadLE(ty, mkexpr(addr)) );
8630 delta0 += len;
8631 assign( src, getIRegG(size, pfx, rm) );
8632 assign( acc, getIRegRAX(size) );
8633 setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8634 assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8635 assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8636 assign( acc2, IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8637 putIRegRAX(size, mkexpr(acc2));
8638 storeLE( mkexpr(addr), mkexpr(dest2) );
8639 DIP("cmpxchg%c %s,%s\n", nameISize(size),
8640 nameIRegG(size,pfx,rm), dis_buf);
8642 else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8643 /* case 3 */
8644 /* src is new value. acc is expected value. dest is old value.
8645 Compute success from the output of the IRCAS, and steer the
8646 new value for RAX accordingly: in case of success, RAX is
8647 unchanged. */
8648 addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8649 delta0 += len;
8650 assign( src, getIRegG(size, pfx, rm) );
8651 assign( acc, getIRegRAX(size) );
8652 stmt( IRStmt_CAS(
8653 mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
8654 NULL, mkexpr(acc), NULL, mkexpr(src) )
8656 setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8657 assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8658 assign( acc2, IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8659 putIRegRAX(size, mkexpr(acc2));
8660 DIP("cmpxchg%c %s,%s\n", nameISize(size),
8661 nameIRegG(size,pfx,rm), dis_buf);
8663 else vassert(0);
8665 *ok = True;
8666 return delta0;
8670 /* Handle conditional move instructions of the form
8671 cmovcc E(reg-or-mem), G(reg)
8673 E(src) is reg-or-mem
8674 G(dst) is reg.
8676 If E is reg, --> GET %E, tmps
8677 GET %G, tmpd
8678 CMOVcc tmps, tmpd
8679 PUT tmpd, %G
8681 If E is mem --> (getAddr E) -> tmpa
8682 LD (tmpa), tmps
8683 GET %G, tmpd
8684 CMOVcc tmps, tmpd
8685 PUT tmpd, %G
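   In C-like terms (illustration only) the effect is simply
      G = cond ? E : G;
   Note that for 32-bit operand size the destination is rewritten (and
   hence zero-extended to 64 bits) even when the condition is false,
   which matches how CMOVcc behaves on real hardware.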
8687 static
8688 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
8689 Prefix pfx,
8690 Int sz,
8691 AMD64Condcode cond,
8692 Long delta0 )
8694 UChar rm = getUChar(delta0);
8695 HChar dis_buf[50];
8696 Int len;
8698 IRType ty = szToITy(sz);
8699 IRTemp tmps = newTemp(ty);
8700 IRTemp tmpd = newTemp(ty);
8702 if (epartIsReg(rm)) {
8703 assign( tmps, getIRegE(sz, pfx, rm) );
8704 assign( tmpd, getIRegG(sz, pfx, rm) );
8706 putIRegG( sz, pfx, rm,
8707 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8708 mkexpr(tmps),
8709 mkexpr(tmpd) )
8711 DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8712 nameIRegE(sz,pfx,rm),
8713 nameIRegG(sz,pfx,rm));
8714 return 1+delta0;
8717 /* E refers to memory */
8719 IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8720 assign( tmps, loadLE(ty, mkexpr(addr)) );
8721 assign( tmpd, getIRegG(sz, pfx, rm) );
8723 putIRegG( sz, pfx, rm,
8724 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8725 mkexpr(tmps),
8726 mkexpr(tmpd) )
8729 DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8730 dis_buf,
8731 nameIRegG(sz,pfx,rm));
8732 return len+delta0;
8737 static
8738 ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
8739 const VexAbiInfo* vbi,
8740 Prefix pfx, Int sz, Long delta0 )
8742 Int len;
8743 UChar rm = getUChar(delta0);
8744 HChar dis_buf[50];
8746 IRType ty = szToITy(sz);
8747 IRTemp tmpd = newTemp(ty);
8748 IRTemp tmpt0 = newTemp(ty);
8749 IRTemp tmpt1 = newTemp(ty);
8751 /* There are 3 cases to consider:
8753 reg-reg: ignore any lock prefix,
8754 generate 'naive' (non-atomic) sequence
8756 reg-mem, not locked: ignore any lock prefix, generate 'naive'
8757 (non-atomic) sequence
8759 reg-mem, locked: use IRCAS
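      In all three cases the operation is the usual XADD semantics
      (sketch, for illustration):

         tmp = *dst + src;  src(G) = *dst;  *dst = tmp;

      with the flags set from the addition; the cases differ only in
      where the destination lives and whether the memory update is made
      atomic with casLE.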
8762 if (epartIsReg(rm)) {
8763 /* case 1 */
8764 assign( tmpd, getIRegE(sz, pfx, rm) );
8765 assign( tmpt0, getIRegG(sz, pfx, rm) );
8766 assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8767 mkexpr(tmpd), mkexpr(tmpt0)) );
8768 setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8769 putIRegG(sz, pfx, rm, mkexpr(tmpd));
8770 putIRegE(sz, pfx, rm, mkexpr(tmpt1));
8771 DIP("xadd%c %s, %s\n",
8772 nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
8773 *decode_ok = True;
8774 return 1+delta0;
8776 else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8777 /* case 2 */
8778 IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8779 assign( tmpd, loadLE(ty, mkexpr(addr)) );
8780 assign( tmpt0, getIRegG(sz, pfx, rm) );
8781 assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8782 mkexpr(tmpd), mkexpr(tmpt0)) );
8783 setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8784 storeLE( mkexpr(addr), mkexpr(tmpt1) );
8785 putIRegG(sz, pfx, rm, mkexpr(tmpd));
8786 DIP("xadd%c %s, %s\n",
8787 nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8788 *decode_ok = True;
8789 return len+delta0;
8791 else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8792 /* case 3 */
8793 IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8794 assign( tmpd, loadLE(ty, mkexpr(addr)) );
8795 assign( tmpt0, getIRegG(sz, pfx, rm) );
8796 assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8797 mkexpr(tmpd), mkexpr(tmpt0)) );
8798 casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
8799 mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
8800 setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8801 putIRegG(sz, pfx, rm, mkexpr(tmpd));
8802 DIP("xadd%c %s, %s\n",
8803 nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8804 *decode_ok = True;
8805 return len+delta0;
8807 /*UNREACHED*/
8808 vassert(0);
8811 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
8812 //..
8813 //.. static
8814 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
8815 //.. {
8816 //.. Int len;
8817 //.. IRTemp addr;
8818 //.. UChar rm = getUChar(delta0);
8819 //.. HChar dis_buf[50];
8820 //..
8821 //.. if (epartIsReg(rm)) {
8822 //.. putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
8823 //.. DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
8824 //.. return 1+delta0;
8825 //.. } else {
8826 //.. addr = disAMode ( &len, sorb, delta0, dis_buf );
8827 //.. putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
8828 //.. DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
8829 //.. return len+delta0;
8830 //.. }
8831 //.. }
8832 //..
8833 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem). If
8834 //.. dst is ireg and sz==4, zero out top half of it. */
8835 //..
8836 //.. static
8837 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
8838 //.. Int sz,
8839 //.. UInt delta0 )
8840 //.. {
8841 //.. Int len;
8842 //.. IRTemp addr;
8843 //.. UChar rm = getUChar(delta0);
8844 //.. HChar dis_buf[50];
8845 //..
8846 //.. vassert(sz == 2 || sz == 4);
8847 //..
8848 //.. if (epartIsReg(rm)) {
8849 //.. if (sz == 4)
8850 //.. putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
8851 //.. else
8852 //.. putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
8853 //..
8854 //.. DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
8855 //.. return 1+delta0;
8856 //.. } else {
8857 //.. addr = disAMode ( &len, sorb, delta0, dis_buf );
8858 //.. storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
8859 //.. DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
8860 //.. return len+delta0;
8861 //.. }
8862 //.. }
8864 /* Handle move instructions of the form
8865 mov S, E meaning
8866 mov sreg, reg-or-mem
8867 Is passed a ptr to the modRM byte, and the data size. Returns
8868 the address advanced completely over this instruction.
8870 VEX does not currently simulate segment registers on AMD64 which means that
8871 instead of moving the value of a segment register, zero is moved to the
8872 destination. The zero value represents a null (unused) selector. This is
8873 not correct (especially for the %cs, %fs and %gs registers) but it seems to
8874 provide a sufficient simulation for currently seen programs that use this
8875 instruction. If some program actually decides to use the obtained segment
8876 selector for something meaningful then the zero value should be a clear
8877 indicator that there is some problem.
8879 S(src) is sreg.
8880 E(dst) is reg-or-mem
8882 If E is reg, --> PUT $0, %E
8884 If E is mem, --> (getAddr E) -> tmpa
8885 ST $0, (tmpa)
8887 static
8888 ULong dis_mov_S_E ( const VexAbiInfo* vbi,
8889 Prefix pfx,
8890 Int size,
8891 Long delta0 )
8893 Int len;
8894 UChar rm = getUChar(delta0);
8895 HChar dis_buf[50];
8897 if (epartIsReg(rm)) {
8898 putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
8899 DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
8900 nameIRegE(size, pfx, rm));
8901 return 1+delta0;
8904 /* E refers to memory */
8906 IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
8907 storeLE(mkexpr(addr), mkU16(0));
8908 DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
8909 dis_buf);
8910 return len+delta0;
8914 //.. static
8915 //.. void dis_push_segreg ( UInt sreg, Int sz )
8916 //.. {
8917 //.. IRTemp t1 = newTemp(Ity_I16);
8918 //.. IRTemp ta = newTemp(Ity_I32);
8919 //.. vassert(sz == 2 || sz == 4);
8920 //..
8921 //.. assign( t1, getSReg(sreg) );
8922 //.. assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
8923 //.. putIReg(4, R_ESP, mkexpr(ta));
8924 //.. storeLE( mkexpr(ta), mkexpr(t1) );
8925 //..
8926 //.. DIP("pushw %s\n", nameSReg(sreg));
8927 //.. }
8928 //..
8929 //.. static
8930 //.. void dis_pop_segreg ( UInt sreg, Int sz )
8931 //.. {
8932 //.. IRTemp t1 = newTemp(Ity_I16);
8933 //.. IRTemp ta = newTemp(Ity_I32);
8934 //.. vassert(sz == 2 || sz == 4);
8935 //..
8936 //.. assign( ta, getIReg(4, R_ESP) );
8937 //.. assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
8938 //..
8939 //.. putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
8940 //.. putSReg( sreg, mkexpr(t1) );
8941 //.. DIP("pop %s\n", nameSReg(sreg));
8942 //.. }
8944 static
8945 void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
8947 IRTemp t1 = newTemp(Ity_I64);
8948 IRTemp t2 = newTemp(Ity_I64);
8949 IRTemp t3 = newTemp(Ity_I64);
8950 assign(t1, getIReg64(R_RSP));
8951 assign(t2, loadLE(Ity_I64,mkexpr(t1)));
8952 assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
8953 putIReg64(R_RSP, mkexpr(t3));
8954 make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
8955 jmp_treg(dres, Ijk_Ret, t2);
8956 vassert(dres->whatNext == Dis_StopHere);
8960 /*------------------------------------------------------------*/
8961 /*--- SSE/SSE2/SSE3 helpers ---*/
8962 /*------------------------------------------------------------*/
8964 /* Indicates whether the op requires a rounding-mode argument. Note
8965 that this covers only vector floating point arithmetic ops, and
8966 omits the scalar ones that need rounding modes. Note also that
8967 inconsistencies here will get picked up later by the IR sanity
8968 checker, so this isn't correctness-critical. */
8969 static Bool requiresRMode ( IROp op )
8971 switch (op) {
8972 /* 128 bit ops */
8973 case Iop_Add32Fx4: case Iop_Sub32Fx4:
8974 case Iop_Mul32Fx4: case Iop_Div32Fx4:
8975 case Iop_Add64Fx2: case Iop_Sub64Fx2:
8976 case Iop_Mul64Fx2: case Iop_Div64Fx2:
8977 /* 256 bit ops */
8978 case Iop_Add32Fx8: case Iop_Sub32Fx8:
8979 case Iop_Mul32Fx8: case Iop_Div32Fx8:
8980 case Iop_Add64Fx4: case Iop_Sub64Fx4:
8981 case Iop_Mul64Fx4: case Iop_Div64Fx4:
8982 return True;
8983 default:
8984 break;
8986 return False;
8990 /* Worker function; do not call directly.
8991 Handles full width G = G `op` E and G = (not G) `op` E.
8994 static ULong dis_SSE_E_to_G_all_wrk (
8995 const VexAbiInfo* vbi,
8996 Prefix pfx, Long delta,
8997 const HChar* opname, IROp op,
8998 Bool invertG
9001 HChar dis_buf[50];
9002 Int alen;
9003 IRTemp addr;
9004 UChar rm = getUChar(delta);
9005 Bool needsRMode = requiresRMode(op);
9006 IRExpr* gpart
9007 = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
9008 : getXMMReg(gregOfRexRM(pfx,rm));
9009 if (epartIsReg(rm)) {
9010 putXMMReg(
9011 gregOfRexRM(pfx,rm),
9012 needsRMode
9013 ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
9014 gpart,
9015 getXMMReg(eregOfRexRM(pfx,rm)))
9016 : binop(op, gpart,
9017 getXMMReg(eregOfRexRM(pfx,rm)))
9019 DIP("%s %s,%s\n", opname,
9020 nameXMMReg(eregOfRexRM(pfx,rm)),
9021 nameXMMReg(gregOfRexRM(pfx,rm)) );
9022 return delta+1;
9023 } else {
9024 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9025 putXMMReg(
9026 gregOfRexRM(pfx,rm),
9027 needsRMode
9028 ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
9029 gpart,
9030 loadLE(Ity_V128, mkexpr(addr)))
9031 : binop(op, gpart,
9032 loadLE(Ity_V128, mkexpr(addr)))
9034 DIP("%s %s,%s\n", opname,
9035 dis_buf,
9036 nameXMMReg(gregOfRexRM(pfx,rm)) );
9037 return delta+alen;
9042 /* All lanes SSE binary operation, G = G `op` E. */
9044 static
9045 ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
9046 Prefix pfx, Long delta,
9047 const HChar* opname, IROp op )
9049 return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
9052 /* All lanes SSE binary operation, G = (not G) `op` E. */
9054 static
9055 ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
9056 Prefix pfx, Long delta,
9057 const HChar* opname, IROp op )
9059 return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
9063 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
9065 static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
9066 Prefix pfx, Long delta,
9067 const HChar* opname, IROp op )
9069 HChar dis_buf[50];
9070 Int alen;
9071 IRTemp addr;
9072 UChar rm = getUChar(delta);
9073 IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
9074 if (epartIsReg(rm)) {
9075 putXMMReg( gregOfRexRM(pfx,rm),
9076 binop(op, gpart,
9077 getXMMReg(eregOfRexRM(pfx,rm))) );
9078 DIP("%s %s,%s\n", opname,
9079 nameXMMReg(eregOfRexRM(pfx,rm)),
9080 nameXMMReg(gregOfRexRM(pfx,rm)) );
9081 return delta+1;
9082 } else {
9083 /* We can only do a 32-bit memory read, so the upper 3/4 of the
9084 E operand needs to be made simply of zeroes. */
9085 IRTemp epart = newTemp(Ity_V128);
9086 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9087 assign( epart, unop( Iop_32UtoV128,
9088 loadLE(Ity_I32, mkexpr(addr))) );
9089 putXMMReg( gregOfRexRM(pfx,rm),
9090 binop(op, gpart, mkexpr(epart)) );
9091 DIP("%s %s,%s\n", opname,
9092 dis_buf,
9093 nameXMMReg(gregOfRexRM(pfx,rm)) );
9094 return delta+alen;
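/* Illustrative note: the "lo32" ops passed here (e.g. Iop_Add32F0x4,
   as used for ADDSS-style instructions) combine only lane 0 of the two
   operands and pass lanes 1..3 of the first (G) operand through
   unchanged, which is why zero-filling the upper 96 bits of a memory
   E operand above is harmless. */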
9099 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
9101 static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
9102 Prefix pfx, Long delta,
9103 const HChar* opname, IROp op )
9105 HChar dis_buf[50];
9106 Int alen;
9107 IRTemp addr;
9108 UChar rm = getUChar(delta);
9109 IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
9110 if (epartIsReg(rm)) {
9111 putXMMReg( gregOfRexRM(pfx,rm),
9112 binop(op, gpart,
9113 getXMMReg(eregOfRexRM(pfx,rm))) );
9114 DIP("%s %s,%s\n", opname,
9115 nameXMMReg(eregOfRexRM(pfx,rm)),
9116 nameXMMReg(gregOfRexRM(pfx,rm)) );
9117 return delta+1;
9118 } else {
9119 /* We can only do a 64-bit memory read, so the upper half of the
9120 E operand needs to be made simply of zeroes. */
9121 IRTemp epart = newTemp(Ity_V128);
9122 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9123 assign( epart, unop( Iop_64UtoV128,
9124 loadLE(Ity_I64, mkexpr(addr))) );
9125 putXMMReg( gregOfRexRM(pfx,rm),
9126 binop(op, gpart, mkexpr(epart)) );
9127 DIP("%s %s,%s\n", opname,
9128 dis_buf,
9129 nameXMMReg(gregOfRexRM(pfx,rm)) );
9130 return delta+alen;
9135 /* All lanes unary SSE operation, G = op(E). */
9137 static ULong dis_SSE_E_to_G_unary_all (
9138 const VexAbiInfo* vbi,
9139 Prefix pfx, Long delta,
9140 const HChar* opname, IROp op
9143 HChar dis_buf[50];
9144 Int alen;
9145 IRTemp addr;
9146 UChar rm = getUChar(delta);
9147 // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
9148 // up in the usual way.
9149 Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
9150 if (epartIsReg(rm)) {
9151 IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
9152 /* XXXROUNDINGFIXME */
9153 IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
9154 : unop(op, src);
9155 putXMMReg( gregOfRexRM(pfx,rm), res );
9156 DIP("%s %s,%s\n", opname,
9157 nameXMMReg(eregOfRexRM(pfx,rm)),
9158 nameXMMReg(gregOfRexRM(pfx,rm)) );
9159 return delta+1;
9160 } else {
9161 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9162 IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
9163 /* XXXROUNDINGFIXME */
9164 IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
9165 : unop(op, src);
9166 putXMMReg( gregOfRexRM(pfx,rm), res );
9167 DIP("%s %s,%s\n", opname,
9168 dis_buf,
9169 nameXMMReg(gregOfRexRM(pfx,rm)) );
9170 return delta+alen;
9175 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
9177 static ULong dis_SSE_E_to_G_unary_lo32 (
9178 const VexAbiInfo* vbi,
9179 Prefix pfx, Long delta,
9180 const HChar* opname, IROp op
9183 /* First we need to get the old G value and patch the low 32 bits
9184 of the E operand into it. Then apply op and write back to G. */
9185 HChar dis_buf[50];
9186 Int alen;
9187 IRTemp addr;
9188 UChar rm = getUChar(delta);
9189 IRTemp oldG0 = newTemp(Ity_V128);
9190 IRTemp oldG1 = newTemp(Ity_V128);
9192 assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
9194 if (epartIsReg(rm)) {
9195 assign( oldG1,
9196 binop( Iop_SetV128lo32,
9197 mkexpr(oldG0),
9198 getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
9199 putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9200 DIP("%s %s,%s\n", opname,
9201 nameXMMReg(eregOfRexRM(pfx,rm)),
9202 nameXMMReg(gregOfRexRM(pfx,rm)) );
9203 return delta+1;
9204 } else {
9205 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9206 assign( oldG1,
9207 binop( Iop_SetV128lo32,
9208 mkexpr(oldG0),
9209 loadLE(Ity_I32, mkexpr(addr)) ));
9210 putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9211 DIP("%s %s,%s\n", opname,
9212 dis_buf,
9213 nameXMMReg(gregOfRexRM(pfx,rm)) );
9214 return delta+alen;
9219 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
9221 static ULong dis_SSE_E_to_G_unary_lo64 (
9222 const VexAbiInfo* vbi,
9223 Prefix pfx, Long delta,
9224 const HChar* opname, IROp op
9227 /* First we need to get the old G value and patch the low 64 bits
9228 of the E operand into it. Then apply op and write back to G. */
9229 HChar dis_buf[50];
9230 Int alen;
9231 IRTemp addr;
9232 UChar rm = getUChar(delta);
9233 IRTemp oldG0 = newTemp(Ity_V128);
9234 IRTemp oldG1 = newTemp(Ity_V128);
9236 assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
9238 if (epartIsReg(rm)) {
9239 assign( oldG1,
9240 binop( Iop_SetV128lo64,
9241 mkexpr(oldG0),
9242 getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
9243 putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9244 DIP("%s %s,%s\n", opname,
9245 nameXMMReg(eregOfRexRM(pfx,rm)),
9246 nameXMMReg(gregOfRexRM(pfx,rm)) );
9247 return delta+1;
9248 } else {
9249 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9250 assign( oldG1,
9251 binop( Iop_SetV128lo64,
9252 mkexpr(oldG0),
9253 loadLE(Ity_I64, mkexpr(addr)) ));
9254 putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9255 DIP("%s %s,%s\n", opname,
9256 dis_buf,
9257 nameXMMReg(gregOfRexRM(pfx,rm)) );
9258 return delta+alen;
9263 /* SSE integer binary operation:
9264 G = G `op` E (eLeft == False)
9265 G = E `op` G (eLeft == True)
9267 static ULong dis_SSEint_E_to_G(
9268 const VexAbiInfo* vbi,
9269 Prefix pfx, Long delta,
9270 const HChar* opname, IROp op,
9271 Bool eLeft
9274 HChar dis_buf[50];
9275 Int alen;
9276 IRTemp addr;
9277 UChar rm = getUChar(delta);
9278 IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
9279 IRExpr* epart = NULL;
9280 if (epartIsReg(rm)) {
9281 epart = getXMMReg(eregOfRexRM(pfx,rm));
9282 DIP("%s %s,%s\n", opname,
9283 nameXMMReg(eregOfRexRM(pfx,rm)),
9284 nameXMMReg(gregOfRexRM(pfx,rm)) );
9285 delta += 1;
9286 } else {
9287 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9288 epart = loadLE(Ity_V128, mkexpr(addr));
9289 DIP("%s %s,%s\n", opname,
9290 dis_buf,
9291 nameXMMReg(gregOfRexRM(pfx,rm)) );
9292 delta += alen;
9294 putXMMReg( gregOfRexRM(pfx,rm),
9295 eLeft ? binop(op, epart, gpart)
9296 : binop(op, gpart, epart) );
9297 return delta;
9301 /* Helper for doing SSE FP comparisons. False return ==> unhandled.
9302 This is all a bit of a kludge in that it ignores the subtleties of
9303 ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
9304 spec. The meaning of the outputs is as follows:
9306 preZeroP: the active lanes of both incoming arguments should be set to zero
9307 before performing the operation. IOW the actual args are to be ignored
9308 and instead zero bits are to be used. This is a bit strange but is needed
9309 to make the constant-false/true variants (FALSE_OQ, TRUE_UQ, FALSE_OS,
9310 TRUE_US) work.
9312 preSwapP: the args should be swapped before performing the operation. Note
9313 that zeroing arg input sections (per preZeroP) and swapping them (per
9314 preSwapP) are allowed to happen in either order; the result is the same.
9316 opP: this returns the actual comparison op to perform.
9318 postNotP: if true, the result(ing vector) of the comparison operation should
9319 be bitwise-not-ed. Note that only the lanes of the output actually
9320 computed by opP should be not-ed.
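   Example (illustrative): imm8 = 0xD (GE_OS) comes out as
   preSwap = True, op = CmpLE, postNot = False, i.e. GE(a,b) is
   computed as LE(b,a); while imm8 = 0x5 (NLT_US) comes out as
   op = CmpLT, postNot = True, i.e. NOT(LT(a,b)).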
9322 static Bool findSSECmpOp ( /*OUT*/Bool* preZeroP,
9323 /*OUT*/Bool* preSwapP,
9324 /*OUT*/IROp* opP,
9325 /*OUT*/Bool* postNotP,
9326 UInt imm8, Bool all_lanes, Int sz )
9328 vassert(*preZeroP == False);
9329 vassert(*preSwapP == False);
9330 vassert(*opP == Iop_INVALID);
9331 vassert(*postNotP == False);
9333 if (imm8 >= 32) return False;
9335 /* First, compute a (preZero, preSwap, op, postNot) quad from
9336 the supplied imm8. */
9337 Bool preZero = False;
9338 Bool preSwap = False;
9339 IROp op = Iop_INVALID;
9340 Bool postNot = False;
9342 # define XXX(_preZero, _preSwap, _op, _postNot) \
9343 { preZero = _preZero; preSwap = _preSwap; op = _op; postNot = _postNot; }
9344 // If you add a case here, add a corresponding test for both VCMPSD_128
9345 // and VCMPSS_128 in avx-1.c.
9346 // Cases 0xA and above are
9347 // "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
9348 switch (imm8) {
9349 // "O" = ordered, "U" = unordered
9350 // "Q" = non-signalling (quiet), "S" = signalling
9352 // replace active arg lanes in operands with zero
9353 // |
9354 // | swap operands before applying the cmp op?
9355 // | |
9356 // | | cmp op invert active lanes after?
9357 // | | | |
9358 // v v v v
9359 case 0x0: XXX(False, False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
9360 case 0x8: XXX(False, False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
9361 case 0x10: XXX(False, False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
9362 case 0x18: XXX(False, False, Iop_CmpEQ32Fx4, False); break; // EQ_US
9364 case 0x1: XXX(False, False, Iop_CmpLT32Fx4, False); break; // LT_OS
9365 case 0x11: XXX(False, False, Iop_CmpLT32Fx4, False); break; // LT_OQ
9367 case 0x2: XXX(False, False, Iop_CmpLE32Fx4, False); break; // LE_OS
9368 case 0x12: XXX(False, False, Iop_CmpLE32Fx4, False); break; // LE_OQ
9370 case 0x3: XXX(False, False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
9371 case 0x13: XXX(False, False, Iop_CmpUN32Fx4, False); break; // UNORD_S
9373 // 0xC: this isn't really right because it returns all-1s when
9374 // either operand is a NaN, and it should return all-0s.
9375 case 0x4: XXX(False, False, Iop_CmpEQ32Fx4, True); break; // NEQ_UQ
9376 case 0xC: XXX(False, False, Iop_CmpEQ32Fx4, True); break; // NEQ_OQ
9377 case 0x14: XXX(False, False, Iop_CmpEQ32Fx4, True); break; // NEQ_US
9378 case 0x1C: XXX(False, False, Iop_CmpEQ32Fx4, True); break; // NEQ_OS
9380 case 0x5: XXX(False, False, Iop_CmpLT32Fx4, True); break; // NLT_US
9381 case 0x15: XXX(False, False, Iop_CmpLT32Fx4, True); break; // NLT_UQ
9383 case 0x6: XXX(False, False, Iop_CmpLE32Fx4, True); break; // NLE_US
9384 case 0x16: XXX(False, False, Iop_CmpLE32Fx4, True); break; // NLE_UQ
9386 case 0x7: XXX(False, False, Iop_CmpUN32Fx4, True); break; // ORD_Q
9387 case 0x17: XXX(False, False, Iop_CmpUN32Fx4, True); break; // ORD_S
9389 case 0x9: XXX(False, True, Iop_CmpLE32Fx4, True); break; // NGE_US
9390 case 0x19: XXX(False, True, Iop_CmpLE32Fx4, True); break; // NGE_UQ
9392 case 0xA: XXX(False, True, Iop_CmpLT32Fx4, True); break; // NGT_US
9393 case 0x1A: XXX(False, True, Iop_CmpLT32Fx4, True); break; // NGT_UQ
9395 case 0xD: XXX(False, True, Iop_CmpLE32Fx4, False); break; // GE_OS
9396 case 0x1D: XXX(False, True, Iop_CmpLE32Fx4, False); break; // GE_OQ
9398 case 0xE: XXX(False, True, Iop_CmpLT32Fx4, False); break; // GT_OS
9399 case 0x1E: XXX(False, True, Iop_CmpLT32Fx4, False); break; // GT_OQ
9400 // Constant-value-result ops
9401 case 0xB: XXX(True, False, Iop_CmpEQ32Fx4, True); break; // FALSE_OQ
9402 case 0xF: XXX(True, False, Iop_CmpEQ32Fx4, False); break; // TRUE_UQ
9403 case 0x1B: XXX(True, False, Iop_CmpEQ32Fx4, True); break; // FALSE_OS
9404 case 0x1F: XXX(True, False, Iop_CmpEQ32Fx4, False); break; // TRUE_US
9405 /* Don't forget to add test cases to VCMPSS_128_<imm8> in
9406 avx-1.c if new cases turn up. */
9407 default: break;
9409 # undef XXX
9410 if (op == Iop_INVALID) return False;
9412 /* Now convert the op into one with the same arithmetic but that is
9413 correct for the width and laneage requirements. */
9415 /**/ if (sz == 4 && all_lanes) {
9416 switch (op) {
9417 case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
9418 case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
9419 case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
9420 case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
9421 default: vassert(0);
9424 else if (sz == 4 && !all_lanes) {
9425 switch (op) {
9426 case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
9427 case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
9428 case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
9429 case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
9430 default: vassert(0);
9433 else if (sz == 8 && all_lanes) {
9434 switch (op) {
9435 case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
9436 case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
9437 case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
9438 case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
9439 default: vassert(0);
9442 else if (sz == 8 && !all_lanes) {
9443 switch (op) {
9444 case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
9445 case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
9446 case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
9447 case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
9448 default: vassert(0);
9451 else {
9452 vpanic("findSSECmpOp(amd64,guest)");
9455 if (preZero) {
9456 // In this case, preSwap is irrelevant, but assert anyway.
9457 vassert(preSwap == False);
9459 *preZeroP = preZero; *preSwapP = preSwap; *opP = op; *postNotP = postNot;
9460 return True;
9464 /* Handles SSE 32F/64F comparisons. It can fail, in which case it
9465 returns the original delta to indicate failure. */
9467 static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
9468 Prefix pfx, Long delta,
9469 const HChar* opname, Bool all_lanes, Int sz )
9471 Long delta0 = delta;
9472 HChar dis_buf[50];
9473 Int alen;
9474 UInt imm8;
9475 IRTemp addr;
9476 Bool preZero = False;
9477 Bool preSwap = False;
9478 IROp op = Iop_INVALID;
9479 Bool postNot = False;
9480 IRTemp plain = newTemp(Ity_V128);
9481 UChar rm = getUChar(delta);
9482 UShort mask = 0;
9483 vassert(sz == 4 || sz == 8);
9484 if (epartIsReg(rm)) {
9485 imm8 = getUChar(delta+1);
9486 if (imm8 >= 8) return delta0; /* FAIL */
9487 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot,
9488 imm8, all_lanes, sz);
9489 if (!ok) return delta0; /* FAIL */
9490 vassert(!preZero); /* never needed for imm8 < 8 */
9491 vassert(!preSwap); /* never needed for imm8 < 8 */
9492 assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
9493 getXMMReg(eregOfRexRM(pfx,rm))) );
9494 delta += 2;
9495 DIP("%s $%u,%s,%s\n", opname,
9496 imm8,
9497 nameXMMReg(eregOfRexRM(pfx,rm)),
9498 nameXMMReg(gregOfRexRM(pfx,rm)) );
9499 } else {
9500 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
9501 imm8 = getUChar(delta+alen);
9502 if (imm8 >= 8) return delta0; /* FAIL */
9503 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot,
9504 imm8, all_lanes, sz);
9505 if (!ok) return delta0; /* FAIL */
9506 vassert(!preZero); /* never needed for imm8 < 8 */
9507 vassert(!preSwap); /* never needed for imm8 < 8 */
9508 assign( plain,
9509 binop(
9511 getXMMReg(gregOfRexRM(pfx,rm)),
9512 all_lanes
9513 ? loadLE(Ity_V128, mkexpr(addr))
9514 : sz == 8
9515 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
9516 : /*sz==4*/
9517 unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
9520 delta += alen+1;
9521 DIP("%s $%u,%s,%s\n", opname,
9522 imm8,
9523 dis_buf,
9524 nameXMMReg(gregOfRexRM(pfx,rm)) );
9527 if (postNot && all_lanes) {
9528 putXMMReg( gregOfRexRM(pfx,rm),
9529 unop(Iop_NotV128, mkexpr(plain)) );
9531 else
9532 if (postNot && !all_lanes) {
9533 mask = toUShort(sz==4 ? 0x000F : 0x00FF);
9534 putXMMReg( gregOfRexRM(pfx,rm),
9535 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
9537 else {
9538 putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
9541 return delta;
9545 /* Vector by scalar shift of G by the amount specified at the bottom
9546 of E. */
9548 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
9549 Prefix pfx, Long delta,
9550 const HChar* opname, IROp op )
9552 HChar dis_buf[50];
9553 Int alen, size;
9554 IRTemp addr;
9555 Bool shl, shr, sar;
9556 UChar rm = getUChar(delta);
9557 IRTemp g0 = newTemp(Ity_V128);
9558 IRTemp g1 = newTemp(Ity_V128);
9559 IRTemp amt = newTemp(Ity_I64);
9560 IRTemp amt8 = newTemp(Ity_I8);
9561 if (epartIsReg(rm)) {
9562 assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
9563 DIP("%s %s,%s\n", opname,
9564 nameXMMReg(eregOfRexRM(pfx,rm)),
9565 nameXMMReg(gregOfRexRM(pfx,rm)) );
9566 delta++;
9567 } else {
9568 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9569 assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
9570 DIP("%s %s,%s\n", opname,
9571 dis_buf,
9572 nameXMMReg(gregOfRexRM(pfx,rm)) );
9573 delta += alen;
9575 assign( g0, getXMMReg(gregOfRexRM(pfx,rm)) );
9576 assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
9578 shl = shr = sar = False;
9579 size = 0;
9580 switch (op) {
9581 case Iop_ShlN16x8: shl = True; size = 32; break;
9582 case Iop_ShlN32x4: shl = True; size = 32; break;
9583 case Iop_ShlN64x2: shl = True; size = 64; break;
9584 case Iop_SarN16x8: sar = True; size = 16; break;
9585 case Iop_SarN32x4: sar = True; size = 32; break;
9586 case Iop_ShrN16x8: shr = True; size = 16; break;
9587 case Iop_ShrN32x4: shr = True; size = 32; break;
9588 case Iop_ShrN64x2: shr = True; size = 64; break;
9589 default: vassert(0);
9592 if (shl || shr) {
9593 assign(
9595 IRExpr_ITE(
9596 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
9597 binop(op, mkexpr(g0), mkexpr(amt8)),
9598 mkV128(0x0000)
9601 } else
9602 if (sar) {
9603 assign(
9605 IRExpr_ITE(
9606 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
9607 binop(op, mkexpr(g0), mkexpr(amt8)),
9608 binop(op, mkexpr(g0), mkU8(size-1))
9611 } else {
9612 vassert(0);
9615 putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
9616 return delta;
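/* Illustrative note on the out-of-range cases above: if the 64-bit
   shift amount is >= the lane width, a logical shift (e.g. PSRLW with
   amount 20, lane width 16) yields an all-zero result, whereas an
   arithmetic shift (PSRAW) is clamped to lane_width - 1, so that each
   lane ends up filled with its sign bit. */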
9620 /* Vector by scalar shift of E by an immediate byte. */
9622 static
9623 ULong dis_SSE_shiftE_imm ( Prefix pfx,
9624 Long delta, const HChar* opname, IROp op )
9626 Bool shl, shr, sar;
9627 UChar rm = getUChar(delta);
9628 IRTemp e0 = newTemp(Ity_V128);
9629 IRTemp e1 = newTemp(Ity_V128);
9630 UChar amt, size;
9631 vassert(epartIsReg(rm));
9632 vassert(gregLO3ofRM(rm) == 2
9633 || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
9634 amt = getUChar(delta+1);
9635 delta += 2;
9636 DIP("%s $%d,%s\n", opname,
9637 (Int)amt,
9638 nameXMMReg(eregOfRexRM(pfx,rm)) );
9639 assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
9641 shl = shr = sar = False;
9642 size = 0;
9643 switch (op) {
9644 case Iop_ShlN16x8: shl = True; size = 16; break;
9645 case Iop_ShlN32x4: shl = True; size = 32; break;
9646 case Iop_ShlN64x2: shl = True; size = 64; break;
9647 case Iop_SarN16x8: sar = True; size = 16; break;
9648 case Iop_SarN32x4: sar = True; size = 32; break;
9649 case Iop_ShrN16x8: shr = True; size = 16; break;
9650 case Iop_ShrN32x4: shr = True; size = 32; break;
9651 case Iop_ShrN64x2: shr = True; size = 64; break;
9652 default: vassert(0);
9655 if (shl || shr) {
9656 assign( e1, amt >= size
9657 ? mkV128(0x0000)
9658 : binop(op, mkexpr(e0), mkU8(amt))
9660 } else
9661 if (sar) {
9662 assign( e1, amt >= size
9663 ? binop(op, mkexpr(e0), mkU8(size-1))
9664 : binop(op, mkexpr(e0), mkU8(amt))
9666 } else {
9667 vassert(0);
9670 putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
9671 return delta;
9675 /* Get the current SSE rounding mode. */
9677 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
9679 return
9680 unop( Iop_64to32,
9681 binop( Iop_And64,
9682 IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
9683 mkU64(3) ));
9686 static void put_sse_roundingmode ( IRExpr* sseround )
9688 vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
9689 stmt( IRStmt_Put( OFFB_SSEROUND,
9690 unop(Iop_32Uto64,sseround) ) );
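/* Note (assumption, for orientation): the two-bit value kept in the
   SSEROUND guest state field follows IR's IRRoundingMode encoding
   (0 = nearest, 1 = towards -inf, 2 = towards +inf, 3 = towards zero);
   conveniently this coincides with the ordering of the hardware
   MXCSR.RC field, so masking with 3 above is all that is needed. */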
9693 /* Break a V128-bit value up into four 32-bit ints. */
9695 static void breakupV128to32s ( IRTemp t128,
9696 /*OUTs*/
9697 IRTemp* t3, IRTemp* t2,
9698 IRTemp* t1, IRTemp* t0 )
9700 IRTemp hi64 = newTemp(Ity_I64);
9701 IRTemp lo64 = newTemp(Ity_I64);
9702 assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
9703 assign( lo64, unop(Iop_V128to64, mkexpr(t128)) );
9705 vassert(t0 && *t0 == IRTemp_INVALID);
9706 vassert(t1 && *t1 == IRTemp_INVALID);
9707 vassert(t2 && *t2 == IRTemp_INVALID);
9708 vassert(t3 && *t3 == IRTemp_INVALID);
9710 *t0 = newTemp(Ity_I32);
9711 *t1 = newTemp(Ity_I32);
9712 *t2 = newTemp(Ity_I32);
9713 *t3 = newTemp(Ity_I32);
9714 assign( *t0, unop(Iop_64to32, mkexpr(lo64)) );
9715 assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
9716 assign( *t2, unop(Iop_64to32, mkexpr(hi64)) );
9717 assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
9720 /* Construct a V128-bit value from four 32-bit ints. */
9722 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
9723 IRTemp t1, IRTemp t0 )
9725 return
9726 binop( Iop_64HLtoV128,
9727 binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
9728 binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
9732 /* Break a 64-bit value up into four 16-bit ints. */
9734 static void breakup64to16s ( IRTemp t64,
9735 /*OUTs*/
9736 IRTemp* t3, IRTemp* t2,
9737 IRTemp* t1, IRTemp* t0 )
9739 IRTemp hi32 = newTemp(Ity_I32);
9740 IRTemp lo32 = newTemp(Ity_I32);
9741 assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
9742 assign( lo32, unop(Iop_64to32, mkexpr(t64)) );
9744 vassert(t0 && *t0 == IRTemp_INVALID);
9745 vassert(t1 && *t1 == IRTemp_INVALID);
9746 vassert(t2 && *t2 == IRTemp_INVALID);
9747 vassert(t3 && *t3 == IRTemp_INVALID);
9749 *t0 = newTemp(Ity_I16);
9750 *t1 = newTemp(Ity_I16);
9751 *t2 = newTemp(Ity_I16);
9752 *t3 = newTemp(Ity_I16);
9753 assign( *t0, unop(Iop_32to16, mkexpr(lo32)) );
9754 assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
9755 assign( *t2, unop(Iop_32to16, mkexpr(hi32)) );
9756 assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
9759 /* Construct a 64-bit value from four 16-bit ints. */
9761 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
9762 IRTemp t1, IRTemp t0 )
9764 return
9765 binop( Iop_32HLto64,
9766 binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
9767 binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
9771 /* Break a V256-bit value up into four 64-bit ints. */
9773 static void breakupV256to64s ( IRTemp t256,
9774 /*OUTs*/
9775 IRTemp* t3, IRTemp* t2,
9776 IRTemp* t1, IRTemp* t0 )
9778 vassert(t0 && *t0 == IRTemp_INVALID);
9779 vassert(t1 && *t1 == IRTemp_INVALID);
9780 vassert(t2 && *t2 == IRTemp_INVALID);
9781 vassert(t3 && *t3 == IRTemp_INVALID);
9782 *t0 = newTemp(Ity_I64);
9783 *t1 = newTemp(Ity_I64);
9784 *t2 = newTemp(Ity_I64);
9785 *t3 = newTemp(Ity_I64);
9786 assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
9787 assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
9788 assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
9789 assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
9792 /* Break a V256-bit value up into two V128s. */
9794 static void breakupV256toV128s ( IRTemp t256,
9795 /*OUTs*/
9796 IRTemp* t1, IRTemp* t0 )
9798 vassert(t0 && *t0 == IRTemp_INVALID);
9799 vassert(t1 && *t1 == IRTemp_INVALID);
9800 *t0 = newTemp(Ity_V128);
9801 *t1 = newTemp(Ity_V128);
9802 assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
9803 assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
9806 /* Break a V256-bit value up into eight 32-bit ints. */
9808 static void breakupV256to32s ( IRTemp t256,
9809 /*OUTs*/
9810 IRTemp* t7, IRTemp* t6,
9811 IRTemp* t5, IRTemp* t4,
9812 IRTemp* t3, IRTemp* t2,
9813 IRTemp* t1, IRTemp* t0 )
9815 IRTemp t128_1 = IRTemp_INVALID;
9816 IRTemp t128_0 = IRTemp_INVALID;
9817 breakupV256toV128s( t256, &t128_1, &t128_0 );
9818 breakupV128to32s( t128_1, t7, t6, t5, t4 );
9819 breakupV128to32s( t128_0, t3, t2, t1, t0 );
9822 /* Break a V128-bit value up into two 64-bit ints. */
9824 static void breakupV128to64s ( IRTemp t128,
9825 /*OUTs*/
9826 IRTemp* t1, IRTemp* t0 )
9828 vassert(t0 && *t0 == IRTemp_INVALID);
9829 vassert(t1 && *t1 == IRTemp_INVALID);
9830 *t0 = newTemp(Ity_I64);
9831 *t1 = newTemp(Ity_I64);
9832 assign( *t0, unop(Iop_V128to64, mkexpr(t128)) );
9833 assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
9836 /* Construct a V256-bit value from eight 32-bit ints. */
9838 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
9839 IRTemp t5, IRTemp t4,
9840 IRTemp t3, IRTemp t2,
9841 IRTemp t1, IRTemp t0 )
9843 return
9844 binop( Iop_V128HLtoV256,
9845 binop( Iop_64HLtoV128,
9846 binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
9847 binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
9848 binop( Iop_64HLtoV128,
9849 binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
9850 binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
9854 /* Construct a V256-bit value from four 64-bit ints. */
9856 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
9857 IRTemp t1, IRTemp t0 )
9859 return
9860 binop( Iop_V128HLtoV256,
9861 binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
9862 binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
9866 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
9867 values (aa,bb), computes, for each of the 4 16-bit lanes:
9869 (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
9871 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
9873 IRTemp aa = newTemp(Ity_I64);
9874 IRTemp bb = newTemp(Ity_I64);
9875 IRTemp aahi32s = newTemp(Ity_I64);
9876 IRTemp aalo32s = newTemp(Ity_I64);
9877 IRTemp bbhi32s = newTemp(Ity_I64);
9878 IRTemp bblo32s = newTemp(Ity_I64);
9879 IRTemp rHi = newTemp(Ity_I64);
9880 IRTemp rLo = newTemp(Ity_I64);
9881 IRTemp one32x2 = newTemp(Ity_I64);
9882 assign(aa, aax);
9883 assign(bb, bbx);
9884 assign( aahi32s,
9885 binop(Iop_SarN32x2,
9886 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
9887 mkU8(16) ));
9888 assign( aalo32s,
9889 binop(Iop_SarN32x2,
9890 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
9891 mkU8(16) ));
9892 assign( bbhi32s,
9893 binop(Iop_SarN32x2,
9894 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
9895 mkU8(16) ));
9896 assign( bblo32s,
9897 binop(Iop_SarN32x2,
9898 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
9899 mkU8(16) ));
9900 assign(one32x2, mkU64( (1ULL << 32) + 1 ));
9901 assign(
9902 rHi,
9903 binop(
9904 Iop_ShrN32x2,
9905 binop(
9906 Iop_Add32x2,
9907 binop(
9908 Iop_ShrN32x2,
9909 binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
9910 mkU8(14)
9912 mkexpr(one32x2)
9914 mkU8(1)
9917 assign(
9918 rLo,
9919 binop(
9920 Iop_ShrN32x2,
9921 binop(
9922 Iop_Add32x2,
9923 binop(
9924 Iop_ShrN32x2,
9925 binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
9926 mkU8(14)
9928 mkexpr(one32x2)
9930 mkU8(1)
9933 return
9934 binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
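/* Worked example for one lane: aa_lane = bb_lane = 0x4000 (16384).
   The widened product is 0x10000000; >>u 14 gives 16384, +1 gives
   16385, and the final >>u 1 gives 0x2000 (8192) -- roughly
   0.5 * 0.5 in Q15 with round-to-nearest, as expected. */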
9937 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit
9938 values (aa,bb), computes, for each lane:
9940 if aa_lane < 0 then - bb_lane
9941 else if aa_lane > 0 then bb_lane
9942 else 0
9944 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
9946 IRTemp aa = newTemp(Ity_I64);
9947 IRTemp bb = newTemp(Ity_I64);
9948 IRTemp zero = newTemp(Ity_I64);
9949 IRTemp bbNeg = newTemp(Ity_I64);
9950 IRTemp negMask = newTemp(Ity_I64);
9951 IRTemp posMask = newTemp(Ity_I64);
9952 IROp opSub = Iop_INVALID;
9953 IROp opCmpGTS = Iop_INVALID;
9955 switch (laneszB) {
9956 case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break;
9957 case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
9958 case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
9959 default: vassert(0);
9962 assign( aa, aax );
9963 assign( bb, bbx );
9964 assign( zero, mkU64(0) );
9965 assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) );
9966 assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
9967 assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) );
9969 return
9970 binop(Iop_Or64,
9971 binop(Iop_And64, mkexpr(bb), mkexpr(posMask)),
9972 binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
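/* For example, with laneszB == 2: an aa lane of -5 and a bb lane of 7
   selects the negated bb value, giving -7; an aa lane of 0 gives 0
   (neither mask is set); and a positive aa lane passes bb through
   unchanged. */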
9977 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit
9978 value aa, computes, for each lane
9980 if aa < 0 then -aa else aa
9982 Note that the result is interpreted as unsigned, so that the
9983 absolute value of the most negative signed input can be
9984 represented.
9986 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
9988 IRTemp res = newTemp(Ity_I64);
9989 IRTemp zero = newTemp(Ity_I64);
9990 IRTemp aaNeg = newTemp(Ity_I64);
9991 IRTemp negMask = newTemp(Ity_I64);
9992 IRTemp posMask = newTemp(Ity_I64);
9993 IROp opSub = Iop_INVALID;
9994 IROp opSarN = Iop_INVALID;
9996 switch (laneszB) {
9997 case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break;
9998 case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
9999 case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
10000 default: vassert(0);
10003 assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
10004 assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
10005 assign( zero, mkU64(0) );
10006 assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) );
10007 assign( res,
10008 binop(Iop_Or64,
10009 binop(Iop_And64, mkexpr(aa), mkexpr(posMask)),
10010 binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
10011 return res;
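/* For example, with laneszB == 1 and a lane value of 0x80 (-128):
   negMask for that lane is all ones, aaNeg is 0 - 0x80 = 0x80, so the
   result lane is 0x80, i.e. 128 when read as unsigned -- the case the
   comment above is careful about. */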
10014 /* XMM version of math_PABS_MMX. */
10015 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
10017 IRTemp res = newTemp(Ity_V128);
10018 IRTemp aaHi = newTemp(Ity_I64);
10019 IRTemp aaLo = newTemp(Ity_I64);
10020 assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
10021 assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
10022 assign(res, binop(Iop_64HLtoV128,
10023 mkexpr(math_PABS_MMX(aaHi, laneszB)),
10024 mkexpr(math_PABS_MMX(aaLo, laneszB))));
10025 return res;
10028 /* Specialisations of math_PABS_XMM, since there's no easy way to do
10029 partial applications in C :-( */
10030 static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
10031 return math_PABS_XMM(aa, 4);
10034 static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
10035 return math_PABS_XMM(aa, 2);
10038 static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
10039 return math_PABS_XMM(aa, 1);
10042 /* YMM version of math_PABS_XMM. */
10043 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
10045 IRTemp res = newTemp(Ity_V256);
10046 IRTemp aaHi = IRTemp_INVALID;
10047 IRTemp aaLo = IRTemp_INVALID;
10048 breakupV256toV128s(aa, &aaHi, &aaLo);
10049 assign(res, binop(Iop_V128HLtoV256,
10050 mkexpr(math_PABS_XMM(aaHi, laneszB)),
10051 mkexpr(math_PABS_XMM(aaLo, laneszB))));
10052 return res;
10055 static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
10056 return math_PABS_YMM(aa, 4);
10059 static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
10060 return math_PABS_YMM(aa, 2);
10063 static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
10064 return math_PABS_YMM(aa, 1);
10067 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
10068 IRTemp lo64, Long byteShift )
10070 vassert(byteShift >= 1 && byteShift <= 7);
10071 return
10072 binop(Iop_Or64,
10073 binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
10074 binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
10078 static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
10080 IRTemp res = newTemp(Ity_V128);
10081 IRTemp sHi = newTemp(Ity_I64);
10082 IRTemp sLo = newTemp(Ity_I64);
10083 IRTemp dHi = newTemp(Ity_I64);
10084 IRTemp dLo = newTemp(Ity_I64);
10085 IRTemp rHi = newTemp(Ity_I64);
10086 IRTemp rLo = newTemp(Ity_I64);
10088 assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
10089 assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
10090 assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
10091 assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
10093 if (imm8 == 0) {
10094 assign( rHi, mkexpr(sHi) );
10095 assign( rLo, mkexpr(sLo) );
10097 else if (imm8 >= 1 && imm8 <= 7) {
10098 assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
10099 assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
10101 else if (imm8 == 8) {
10102 assign( rHi, mkexpr(dLo) );
10103 assign( rLo, mkexpr(sHi) );
10105 else if (imm8 >= 9 && imm8 <= 15) {
10106 assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
10107 assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
10109 else if (imm8 == 16) {
10110 assign( rHi, mkexpr(dHi) );
10111 assign( rLo, mkexpr(dLo) );
10113 else if (imm8 >= 17 && imm8 <= 23) {
10114 assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
10115 assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
10117 else if (imm8 == 24) {
10118 assign( rHi, mkU64(0) );
10119 assign( rLo, mkexpr(dHi) );
10121 else if (imm8 >= 25 && imm8 <= 31) {
10122 assign( rHi, mkU64(0) );
10123 assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
10125 else if (imm8 >= 32 && imm8 <= 255) {
10126 assign( rHi, mkU64(0) );
10127 assign( rLo, mkU64(0) );
10129 else
10130 vassert(0);
10132 assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
10133 return res;
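/* For example, imm8 == 8 yields dLo:sHi -- the concatenation d:s
   shifted right by 8 bytes and truncated to 128 bits -- which should
   be the intended PALIGNR semantics for every imm8 value handled
   above. */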
10136 /* Generate a SIGSEGV (or, on FreeBSD, a SIGBUS) followed by a restart
10137 of the current instruction if effective_addr is not aligned to the
10138 boundary implied by 'mask' (16, 32 or 64 bytes via the wrappers below).
10139 This is required behaviour for some SSE3 instructions and all 128-bit
10140 SSSE3 instructions. This assumes that guest_RIP_curr_instr is set correctly! */
10141 static
10142 void gen_SIGNAL_if_not_XX_aligned ( const VexAbiInfo* vbi,
10143 IRTemp effective_addr, ULong mask )
10145 stmt(
10146 IRStmt_Exit(
10147 binop(Iop_CmpNE64,
10148 binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
10149 mkU64(0)),
10150 vbi->guest_amd64_sigbus_on_misalign ? Ijk_SigBUS : Ijk_SigSEGV,
10151 IRConst_U64(guest_RIP_curr_instr),
10152 OFFB_RIP
10157 static void gen_SIGNAL_if_not_16_aligned ( const VexAbiInfo* vbi,
10158 IRTemp effective_addr ) {
10159 gen_SIGNAL_if_not_XX_aligned(vbi, effective_addr, 16-1);
10162 static void gen_SIGNAL_if_not_32_aligned ( const VexAbiInfo* vbi,
10163 IRTemp effective_addr ) {
10164 gen_SIGNAL_if_not_XX_aligned(vbi, effective_addr, 32-1);
10167 static void gen_SIGNAL_if_not_64_aligned ( const VexAbiInfo* vbi,
10168 IRTemp effective_addr ) {
10169 gen_SIGNAL_if_not_XX_aligned(vbi, effective_addr, 64-1);
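/* For example, the 16-byte case passes mask = 0xF, so any effective
   address with a non-zero low nibble takes the side exit and the
   guest receives SIGSEGV (or SIGBUS where so configured), restarting
   at the current instruction. */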
10173 /* Helper for deciding whether a given insn (starting at the opcode
10174 byte) may validly be used with a LOCK prefix. The following insns
10175 may be used with LOCK when their destination operand is in memory.
10176 AFAICS this is exactly the same for both 32-bit and 64-bit mode.
10178 ADD 80 /0, 81 /0, 82 /0, 83 /0, 00, 01
10179 OR 80 /1, 81 /1, 82 /x, 83 /1, 08, 09
10180 ADC 80 /2, 81 /2, 82 /2, 83 /2, 10, 11
10181 SBB 80 /3, 81 /3, 82 /x, 83 /3, 18, 19
10182 AND 80 /4, 81 /4, 82 /x, 83 /4, 20, 21
10183 SUB 80 /5, 81 /5, 82 /x, 83 /5, 28, 29
10184 XOR 80 /6, 81 /6, 82 /x, 83 /6, 30, 31
10186 DEC FE /1, FF /1
10187 INC FE /0, FF /0
10189 NEG F6 /3, F7 /3
10190 NOT F6 /2, F7 /2
10192 XCHG 86, 87
10194 BTC 0F BB, 0F BA /7
10195 BTR 0F B3, 0F BA /6
10196 BTS 0F AB, 0F BA /5
10198 CMPXCHG 0F B0, 0F B1
10199 CMPXCHG8B 0F C7 /1
10201 XADD 0F C0, 0F C1
10203 ------------------------------
10205 80 /0 = addb $imm8, rm8
10206 81 /0 = addl $imm32, rm32 and addw $imm16, rm16
10207 82 /0 = addb $imm8, rm8
10208 83 /0 = addl $simm8, rm32 and addw $simm8, rm16
10210 00 = addb r8, rm8
10211 01 = addl r32, rm32 and addw r16, rm16
10213 Same for ADD OR ADC SBB AND SUB XOR
10215 FE /1 = dec rm8
10216 FF /1 = dec rm32 and dec rm16
10218 FE /0 = inc rm8
10219 FF /0 = inc rm32 and inc rm16
10221 F6 /3 = neg rm8
10222 F7 /3 = neg rm32 and neg rm16
10224 F6 /2 = not rm8
10225 F7 /2 = not rm32 and not rm16
10227 0F BB = btcw r16, rm16 and btcl r32, rm32
10228 0F BA /7 = btcw $imm8, rm16 and btcl $imm8, rm32
10230 Same for BTS, BTR
10232 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
10234 switch (opc[0]) {
10235 case 0x00: case 0x01: case 0x08: case 0x09:
10236 case 0x10: case 0x11: case 0x18: case 0x19:
10237 case 0x20: case 0x21: case 0x28: case 0x29:
10238 case 0x30: case 0x31:
10239 if (!epartIsReg(opc[1]))
10240 return True;
10241 break;
10243 case 0x80: case 0x81: case 0x82: case 0x83:
10244 if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
10245 && !epartIsReg(opc[1]))
10246 return True;
10247 break;
10249 case 0xFE: case 0xFF:
10250 if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
10251 && !epartIsReg(opc[1]))
10252 return True;
10253 break;
10255 case 0xF6: case 0xF7:
10256 if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
10257 && !epartIsReg(opc[1]))
10258 return True;
10259 break;
10261 case 0x86: case 0x87:
10262 if (!epartIsReg(opc[1]))
10263 return True;
10264 break;
10266 case 0x0F: {
10267 switch (opc[1]) {
10268 case 0xBB: case 0xB3: case 0xAB:
10269 if (!epartIsReg(opc[2]))
10270 return True;
10271 break;
10272 case 0xBA:
10273 if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
10274 && !epartIsReg(opc[2]))
10275 return True;
10276 break;
10277 case 0xB0: case 0xB1:
10278 if (!epartIsReg(opc[2]))
10279 return True;
10280 break;
10281 case 0xC7:
10282 if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
10283 return True;
10284 break;
10285 case 0xC0: case 0xC1:
10286 if (!epartIsReg(opc[2]))
10287 return True;
10288 break;
10289 default:
10290 break;
10291 } /* switch (opc[1]) */
10292 break;
10295 default:
10296 break;
10297 } /* switch (opc[0]) */
10299 return False;
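/* For example, "lock addl $1, (%rax)" decodes as opcode 0x83 /0 with
   a memory E operand, so this returns True; "lock addl %ebx, %ecx"
   has a register E operand and returns False, matching the #UD the
   hardware should raise for LOCK on a register destination. */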
10303 /*------------------------------------------------------------*/
10304 /*--- ---*/
10305 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2 ---*/
10306 /*--- ---*/
10307 /*------------------------------------------------------------*/
10309 static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
10310 Long delta, Bool isAvx, UChar opc )
10312 vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
10313 Int alen = 0;
10314 HChar dis_buf[50];
10315 IRTemp argL = newTemp(Ity_F64);
10316 IRTemp argR = newTemp(Ity_F64);
10317 UChar modrm = getUChar(delta);
10318 IRTemp addr = IRTemp_INVALID;
10319 if (epartIsReg(modrm)) {
10320 assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
10321 0/*lowest lane*/ ) );
10322 delta += 1;
10323 DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
10324 opc==0x2E ? "u" : "",
10325 nameXMMReg(eregOfRexRM(pfx,modrm)),
10326 nameXMMReg(gregOfRexRM(pfx,modrm)) );
10327 } else {
10328 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10329 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
10330 delta += alen;
10331 DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
10332 opc==0x2E ? "u" : "",
10333 dis_buf,
10334 nameXMMReg(gregOfRexRM(pfx,modrm)) );
10336 assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
10337 0/*lowest lane*/ ) );
10339 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
10340 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
10341 stmt( IRStmt_Put(
10342 OFFB_CC_DEP1,
10343 binop( Iop_And64,
10344 unop( Iop_32Uto64,
10345 binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
10346 mkU64(0x45)
10347 )));
10348 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
10349 return delta;
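/* Note on the 0x45 mask: it keeps bits 0, 2 and 6, which are the CF,
   PF and ZF positions in rflags.  Iop_CmpF64's result encoding
   appears to be chosen so that this masking directly yields the
   architected COMISD/UCOMISD flag pattern; e.g. an unordered compare
   produces 0x45 and hence sets all three. */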
10353 static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
10354 Long delta, Bool isAvx, UChar opc )
10356 vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
10357 Int alen = 0;
10358 HChar dis_buf[50];
10359 IRTemp argL = newTemp(Ity_F32);
10360 IRTemp argR = newTemp(Ity_F32);
10361 UChar modrm = getUChar(delta);
10362 IRTemp addr = IRTemp_INVALID;
10363 if (epartIsReg(modrm)) {
10364 assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
10365 0/*lowest lane*/ ) );
10366 delta += 1;
10367 DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
10368 opc==0x2E ? "u" : "",
10369 nameXMMReg(eregOfRexRM(pfx,modrm)),
10370 nameXMMReg(gregOfRexRM(pfx,modrm)) );
10371 } else {
10372 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10373 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
10374 delta += alen;
10375 DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
10376 opc==0x2E ? "u" : "",
10377 dis_buf,
10378 nameXMMReg(gregOfRexRM(pfx,modrm)) );
10380 assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
10381 0/*lowest lane*/ ) );
10383 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
10384 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
10385 stmt( IRStmt_Put(
10386 OFFB_CC_DEP1,
10387 binop( Iop_And64,
10388 unop( Iop_32Uto64,
10389 binop(Iop_CmpF64,
10390 unop(Iop_F32toF64,mkexpr(argL)),
10391 unop(Iop_F32toF64,mkexpr(argR)))),
10392 mkU64(0x45)
10393 )));
10394 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
10395 return delta;
10399 static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
10400 Long delta, Bool writesYmm )
10402 Int order;
10403 Int alen = 0;
10404 HChar dis_buf[50];
10405 IRTemp sV = newTemp(Ity_V128);
10406 UChar modrm = getUChar(delta);
10407 const HChar* strV = writesYmm ? "v" : "";
10408 IRTemp addr = IRTemp_INVALID;
10409 if (epartIsReg(modrm)) {
10410 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
10411 order = (Int)getUChar(delta+1);
10412 delta += 1+1;
10413 DIP("%spshufd $%d,%s,%s\n", strV, order,
10414 nameXMMReg(eregOfRexRM(pfx,modrm)),
10415 nameXMMReg(gregOfRexRM(pfx,modrm)));
10416 } else {
10417 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
10418 1/*byte after the amode*/ );
10419 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10420 order = (Int)getUChar(delta+alen);
10421 delta += alen+1;
10422 DIP("%spshufd $%d,%s,%s\n", strV, order,
10423 dis_buf,
10424 nameXMMReg(gregOfRexRM(pfx,modrm)));
10427 IRTemp s3, s2, s1, s0;
10428 s3 = s2 = s1 = s0 = IRTemp_INVALID;
10429 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10431 # define SEL(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
10432 IRTemp dV = newTemp(Ity_V128);
10433 assign(dV,
10434 mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
10435 SEL((order>>2)&3), SEL((order>>0)&3) )
10437 # undef SEL
10439 (writesYmm ? putYMMRegLoAndZU : putXMMReg)
10440 (gregOfRexRM(pfx,modrm), mkexpr(dV));
10441 return delta;
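/* For example, "pshufd $0x1b, %xmm1, %xmm0": order = 0x1b = 00 01 10
   11, so dst lane 0 takes src lane 3, lane 1 takes lane 2, and so on
   -- the four 32-bit lanes end up reversed. */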
10445 static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
10447 Int order;
10448 Int alen = 0;
10449 HChar dis_buf[50];
10450 IRTemp sV = newTemp(Ity_V256);
10451 UChar modrm = getUChar(delta);
10452 IRTemp addr = IRTemp_INVALID;
10453 UInt rG = gregOfRexRM(pfx,modrm);
10454 if (epartIsReg(modrm)) {
10455 UInt rE = eregOfRexRM(pfx,modrm);
10456 assign( sV, getYMMReg(rE) );
10457 order = (Int)getUChar(delta+1);
10458 delta += 1+1;
10459 DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
10460 } else {
10461 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
10462 1/*byte after the amode*/ );
10463 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
10464 order = (Int)getUChar(delta+alen);
10465 delta += alen+1;
10466 DIP("vpshufd $%d,%s,%s\n", order, dis_buf, nameYMMReg(rG));
10469 IRTemp s[8];
10470 s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
10471 breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
10472 &s[3], &s[2], &s[1], &s[0] );
10474 putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
10475 s[4 + ((order>>4)&3)],
10476 s[4 + ((order>>2)&3)],
10477 s[4 + ((order>>0)&3)],
10478 s[0 + ((order>>6)&3)],
10479 s[0 + ((order>>4)&3)],
10480 s[0 + ((order>>2)&3)],
10481 s[0 + ((order>>0)&3)] ) );
10482 return delta;
10486 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
10488 IRTemp dV = newTemp(Ity_V128);
10489 IRTemp hi64 = newTemp(Ity_I64);
10490 IRTemp lo64 = newTemp(Ity_I64);
10491 IRTemp hi64r = newTemp(Ity_I64);
10492 IRTemp lo64r = newTemp(Ity_I64);
10494 vassert(imm >= 0 && imm <= 255);
10495 if (imm >= 16) {
10496 assign(dV, mkV128(0x0000));
10497 return dV;
10500 assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
10501 assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
10503 if (imm == 0) {
10504 assign( lo64r, mkexpr(lo64) );
10505 assign( hi64r, mkexpr(hi64) );
10507 else
10508 if (imm == 8) {
10509 assign( hi64r, mkU64(0) );
10510 assign( lo64r, mkexpr(hi64) );
10512 else
10513 if (imm > 8) {
10514 assign( hi64r, mkU64(0) );
10515 assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
10516 } else {
10517 assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
10518 assign( lo64r,
10519 binop( Iop_Or64,
10520 binop(Iop_Shr64, mkexpr(lo64),
10521 mkU8(8 * imm)),
10522 binop(Iop_Shl64, mkexpr(hi64),
10523 mkU8(8 * (8 - imm)) )
10528 assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
10529 return dV;
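/* For example, imm == 3 shifts the whole 128-bit value right by 24
   bits: hi64r = hi64 >> 24 and lo64r = (lo64 >> 24) | (hi64 << 40),
   so three zero bytes appear at the top, as expected for PSRLDQ. */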
10533 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
10535 IRTemp dV = newTemp(Ity_V128);
10536 IRTemp hi64 = newTemp(Ity_I64);
10537 IRTemp lo64 = newTemp(Ity_I64);
10538 IRTemp hi64r = newTemp(Ity_I64);
10539 IRTemp lo64r = newTemp(Ity_I64);
10541 vassert(imm >= 0 && imm <= 255);
10542 if (imm >= 16) {
10543 assign(dV, mkV128(0x0000));
10544 return dV;
10547 assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
10548 assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
10550 if (imm == 0) {
10551 assign( lo64r, mkexpr(lo64) );
10552 assign( hi64r, mkexpr(hi64) );
10554 else
10555 if (imm == 8) {
10556 assign( lo64r, mkU64(0) );
10557 assign( hi64r, mkexpr(lo64) );
10559 else
10560 if (imm > 8) {
10561 assign( lo64r, mkU64(0) );
10562 assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
10563 } else {
10564 assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
10565 assign( hi64r,
10566 binop( Iop_Or64,
10567 binop(Iop_Shl64, mkexpr(hi64),
10568 mkU8(8 * imm)),
10569 binop(Iop_Shr64, mkexpr(lo64),
10570 mkU8(8 * (8 - imm)) )
10575 assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
10576 return dV;
10580 static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
10581 Long delta, Bool isAvx, UChar opc, Int sz )
10583 vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
10584 HChar dis_buf[50];
10585 Int alen = 0;
10586 UChar modrm = getUChar(delta);
10587 IRTemp addr = IRTemp_INVALID;
10588 IRTemp rmode = newTemp(Ity_I32);
10589 IRTemp f64lo = newTemp(Ity_F64);
10590 Bool r2zero = toBool(opc == 0x2C);
10592 if (epartIsReg(modrm)) {
10593 delta += 1;
10594 assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
10595 DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10596 nameXMMReg(eregOfRexRM(pfx,modrm)),
10597 nameIReg(sz, gregOfRexRM(pfx,modrm),
10598 False));
10599 } else {
10600 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10601 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10602 delta += alen;
10603 DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10604 dis_buf,
10605 nameIReg(sz, gregOfRexRM(pfx,modrm),
10606 False));
10609 if (r2zero) {
10610 assign( rmode, mkU32((UInt)Irrm_ZERO) );
10611 } else {
10612 assign( rmode, get_sse_roundingmode() );
10615 if (sz == 4) {
10616 putIReg32( gregOfRexRM(pfx,modrm),
10617 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
10618 } else {
10619 vassert(sz == 8);
10620 putIReg64( gregOfRexRM(pfx,modrm),
10621 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
10624 return delta;
10628 static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
10629 Long delta, Bool isAvx, UChar opc, Int sz )
10631 vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
10632 HChar dis_buf[50];
10633 Int alen = 0;
10634 UChar modrm = getUChar(delta);
10635 IRTemp addr = IRTemp_INVALID;
10636 IRTemp rmode = newTemp(Ity_I32);
10637 IRTemp f32lo = newTemp(Ity_F32);
10638 Bool r2zero = toBool(opc == 0x2C);
10640 if (epartIsReg(modrm)) {
10641 delta += 1;
10642 assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
10643 DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10644 nameXMMReg(eregOfRexRM(pfx,modrm)),
10645 nameIReg(sz, gregOfRexRM(pfx,modrm),
10646 False));
10647 } else {
10648 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10649 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
10650 delta += alen;
10651 DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10652 dis_buf,
10653 nameIReg(sz, gregOfRexRM(pfx,modrm),
10654 False));
10657 if (r2zero) {
10658 assign( rmode, mkU32((UInt)Irrm_ZERO) );
10659 } else {
10660 assign( rmode, get_sse_roundingmode() );
10663 if (sz == 4) {
10664 putIReg32( gregOfRexRM(pfx,modrm),
10665 binop( Iop_F64toI32S,
10666 mkexpr(rmode),
10667 unop(Iop_F32toF64, mkexpr(f32lo))) );
10668 } else {
10669 vassert(sz == 8);
10670 putIReg64( gregOfRexRM(pfx,modrm),
10671 binop( Iop_F64toI64S,
10672 mkexpr(rmode),
10673 unop(Iop_F32toF64, mkexpr(f32lo))) );
10676 return delta;
10680 static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
10681 Long delta, Bool isAvx )
10683 IRTemp addr = IRTemp_INVALID;
10684 Int alen = 0;
10685 HChar dis_buf[50];
10686 IRTemp f32lo = newTemp(Ity_F32);
10687 IRTemp f32hi = newTemp(Ity_F32);
10688 UChar modrm = getUChar(delta);
10689 UInt rG = gregOfRexRM(pfx,modrm);
10690 if (epartIsReg(modrm)) {
10691 UInt rE = eregOfRexRM(pfx,modrm);
10692 assign( f32lo, getXMMRegLane32F(rE, 0) );
10693 assign( f32hi, getXMMRegLane32F(rE, 1) );
10694 delta += 1;
10695 DIP("%scvtps2pd %s,%s\n",
10696 isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
10697 } else {
10698 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10699 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
10700 assign( f32hi, loadLE(Ity_F32,
10701 binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
10702 delta += alen;
10703 DIP("%scvtps2pd %s,%s\n",
10704 isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
10707 putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
10708 putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
10709 if (isAvx)
10710 putYMMRegLane128( rG, 1, mkV128(0));
10711 return delta;
10715 static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
10716 Long delta )
10718 IRTemp addr = IRTemp_INVALID;
10719 Int alen = 0;
10720 HChar dis_buf[50];
10721 IRTemp f32_0 = newTemp(Ity_F32);
10722 IRTemp f32_1 = newTemp(Ity_F32);
10723 IRTemp f32_2 = newTemp(Ity_F32);
10724 IRTemp f32_3 = newTemp(Ity_F32);
10725 UChar modrm = getUChar(delta);
10726 UInt rG = gregOfRexRM(pfx,modrm);
10727 if (epartIsReg(modrm)) {
10728 UInt rE = eregOfRexRM(pfx,modrm);
10729 assign( f32_0, getXMMRegLane32F(rE, 0) );
10730 assign( f32_1, getXMMRegLane32F(rE, 1) );
10731 assign( f32_2, getXMMRegLane32F(rE, 2) );
10732 assign( f32_3, getXMMRegLane32F(rE, 3) );
10733 delta += 1;
10734 DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
10735 } else {
10736 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10737 assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
10738 assign( f32_1, loadLE(Ity_F32,
10739 binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
10740 assign( f32_2, loadLE(Ity_F32,
10741 binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
10742 assign( f32_3, loadLE(Ity_F32,
10743 binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
10744 delta += alen;
10745 DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
10748 putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
10749 putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
10750 putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
10751 putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
10752 return delta;
10756 static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
10757 Long delta, Bool isAvx )
10759 IRTemp addr = IRTemp_INVALID;
10760 Int alen = 0;
10761 HChar dis_buf[50];
10762 UChar modrm = getUChar(delta);
10763 UInt rG = gregOfRexRM(pfx,modrm);
10764 IRTemp argV = newTemp(Ity_V128);
10765 IRTemp rmode = newTemp(Ity_I32);
10766 if (epartIsReg(modrm)) {
10767 UInt rE = eregOfRexRM(pfx,modrm);
10768 assign( argV, getXMMReg(rE) );
10769 delta += 1;
10770 DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
10771 nameXMMReg(rE), nameXMMReg(rG));
10772 } else {
10773 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10774 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10775 delta += alen;
10776 DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
10777 dis_buf, nameXMMReg(rG) );
10780 assign( rmode, get_sse_roundingmode() );
10781 IRTemp t0 = newTemp(Ity_F64);
10782 IRTemp t1 = newTemp(Ity_F64);
10783 assign( t0, unop(Iop_ReinterpI64asF64,
10784 unop(Iop_V128to64, mkexpr(argV))) );
10785 assign( t1, unop(Iop_ReinterpI64asF64,
10786 unop(Iop_V128HIto64, mkexpr(argV))) );
10788 # define CVT(_t) binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
10789 putXMMRegLane32( rG, 3, mkU32(0) );
10790 putXMMRegLane32( rG, 2, mkU32(0) );
10791 putXMMRegLane32F( rG, 1, CVT(t1) );
10792 putXMMRegLane32F( rG, 0, CVT(t0) );
10793 # undef CVT
10794 if (isAvx)
10795 putYMMRegLane128( rG, 1, mkV128(0) );
10797 return delta;
10801 static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
10802 Long delta, Bool isAvx, Bool r2zero )
10804 IRTemp addr = IRTemp_INVALID;
10805 Int alen = 0;
10806 HChar dis_buf[50];
10807 UChar modrm = getUChar(delta);
10808 IRTemp argV = newTemp(Ity_V128);
10809 IRTemp rmode = newTemp(Ity_I32);
10810 UInt rG = gregOfRexRM(pfx,modrm);
10812 if (epartIsReg(modrm)) {
10813 UInt rE = eregOfRexRM(pfx,modrm);
10814 assign( argV, getXMMReg(rE) );
10815 delta += 1;
10816 DIP("%scvt%sps2dq %s,%s\n",
10817 isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
10818 } else {
10819 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10820 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10821 delta += alen;
10822 DIP("%scvt%sps2dq %s,%s\n",
10823 isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10826 assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
10827 : get_sse_roundingmode() );
10828 putXMMReg( rG, binop(Iop_F32toI32Sx4, mkexpr(rmode), mkexpr(argV)) );
10829 if (isAvx)
10830 putYMMRegLane128( rG, 1, mkV128(0) );
10832 return delta;
10836 static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
10837 Long delta, Bool r2zero )
10839 IRTemp addr = IRTemp_INVALID;
10840 Int alen = 0;
10841 HChar dis_buf[50];
10842 UChar modrm = getUChar(delta);
10843 IRTemp argV = newTemp(Ity_V256);
10844 IRTemp rmode = newTemp(Ity_I32);
10845 UInt rG = gregOfRexRM(pfx,modrm);
10847 if (epartIsReg(modrm)) {
10848 UInt rE = eregOfRexRM(pfx,modrm);
10849 assign( argV, getYMMReg(rE) );
10850 delta += 1;
10851 DIP("vcvt%sps2dq %s,%s\n",
10852 r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
10853 } else {
10854 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10855 assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10856 delta += alen;
10857 DIP("vcvt%sps2dq %s,%s\n",
10858 r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
10861 assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
10862 : get_sse_roundingmode() );
10863 putYMMReg( rG, binop(Iop_F32toI32Sx8, mkexpr(rmode), mkexpr(argV)) );
10864 return delta;
10868 static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
10869 Long delta, Bool isAvx, Bool r2zero )
10871 IRTemp addr = IRTemp_INVALID;
10872 Int alen = 0;
10873 HChar dis_buf[50];
10874 UChar modrm = getUChar(delta);
10875 IRTemp argV = newTemp(Ity_V128);
10876 IRTemp rmode = newTemp(Ity_I32);
10877 UInt rG = gregOfRexRM(pfx,modrm);
10878 IRTemp t0, t1;
10880 if (epartIsReg(modrm)) {
10881 UInt rE = eregOfRexRM(pfx,modrm);
10882 assign( argV, getXMMReg(rE) );
10883 delta += 1;
10884 DIP("%scvt%spd2dq %s,%s\n",
10885 isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
10886 } else {
10887 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10888 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10889 delta += alen;
10890 DIP("%scvt%spd2dqx %s,%s\n",
10891 isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10894 if (r2zero) {
10895 assign(rmode, mkU32((UInt)Irrm_ZERO) );
10896 } else {
10897 assign( rmode, get_sse_roundingmode() );
10900 t0 = newTemp(Ity_F64);
10901 t1 = newTemp(Ity_F64);
10902 assign( t0, unop(Iop_ReinterpI64asF64,
10903 unop(Iop_V128to64, mkexpr(argV))) );
10904 assign( t1, unop(Iop_ReinterpI64asF64,
10905 unop(Iop_V128HIto64, mkexpr(argV))) );
10907 # define CVT(_t) binop( Iop_F64toI32S, \
10908 mkexpr(rmode), \
10909 mkexpr(_t) )
10911 putXMMRegLane32( rG, 3, mkU32(0) );
10912 putXMMRegLane32( rG, 2, mkU32(0) );
10913 putXMMRegLane32( rG, 1, CVT(t1) );
10914 putXMMRegLane32( rG, 0, CVT(t0) );
10915 # undef CVT
10916 if (isAvx)
10917 putYMMRegLane128( rG, 1, mkV128(0) );
10919 return delta;
10923 static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
10924 Long delta, Bool r2zero )
10926 IRTemp addr = IRTemp_INVALID;
10927 Int alen = 0;
10928 HChar dis_buf[50];
10929 UChar modrm = getUChar(delta);
10930 IRTemp argV = newTemp(Ity_V256);
10931 IRTemp rmode = newTemp(Ity_I32);
10932 UInt rG = gregOfRexRM(pfx,modrm);
10933 IRTemp t0, t1, t2, t3;
10935 if (epartIsReg(modrm)) {
10936 UInt rE = eregOfRexRM(pfx,modrm);
10937 assign( argV, getYMMReg(rE) );
10938 delta += 1;
10939 DIP("vcvt%spd2dq %s,%s\n",
10940 r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
10941 } else {
10942 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10943 assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10944 delta += alen;
10945 DIP("vcvt%spd2dqy %s,%s\n",
10946 r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10949 if (r2zero) {
10950 assign(rmode, mkU32((UInt)Irrm_ZERO) );
10951 } else {
10952 assign( rmode, get_sse_roundingmode() );
10955 t0 = IRTemp_INVALID;
10956 t1 = IRTemp_INVALID;
10957 t2 = IRTemp_INVALID;
10958 t3 = IRTemp_INVALID;
10959 breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
10961 # define CVT(_t) binop( Iop_F64toI32S, \
10962 mkexpr(rmode), \
10963 unop( Iop_ReinterpI64asF64, \
10964 mkexpr(_t) ) )
10966 putXMMRegLane32( rG, 3, CVT(t3) );
10967 putXMMRegLane32( rG, 2, CVT(t2) );
10968 putXMMRegLane32( rG, 1, CVT(t1) );
10969 putXMMRegLane32( rG, 0, CVT(t0) );
10970 # undef CVT
10971 putYMMRegLane128( rG, 1, mkV128(0) );
10973 return delta;
10977 static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
10978 Long delta, Bool isAvx )
10980 IRTemp addr = IRTemp_INVALID;
10981 Int alen = 0;
10982 HChar dis_buf[50];
10983 UChar modrm = getUChar(delta);
10984 IRTemp argV = newTemp(Ity_V128);
10985 IRTemp rmode = newTemp(Ity_I32);
10986 UInt rG = gregOfRexRM(pfx,modrm);
10988 if (epartIsReg(modrm)) {
10989 UInt rE = eregOfRexRM(pfx,modrm);
10990 assign( argV, getXMMReg(rE) );
10991 delta += 1;
10992 DIP("%scvtdq2ps %s,%s\n",
10993 isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
10994 } else {
10995 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10996 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10997 delta += alen;
10998 DIP("%scvtdq2ps %s,%s\n",
10999 isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
11002 assign( rmode, get_sse_roundingmode() );
11003 putXMMReg(rG, binop(Iop_I32StoF32x4, mkexpr(rmode), mkexpr(argV)));
11005 if (isAvx)
11006 putYMMRegLane128( rG, 1, mkV128(0) );
11008 return delta;
11011 static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
11012 Long delta )
11014 IRTemp addr = IRTemp_INVALID;
11015 Int alen = 0;
11016 HChar dis_buf[50];
11017 UChar modrm = getUChar(delta);
11018 IRTemp argV = newTemp(Ity_V256);
11019 IRTemp rmode = newTemp(Ity_I32);
11020 UInt rG = gregOfRexRM(pfx,modrm);
11022 if (epartIsReg(modrm)) {
11023 UInt rE = eregOfRexRM(pfx,modrm);
11024 assign( argV, getYMMReg(rE) );
11025 delta += 1;
11026 DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
11027 } else {
11028 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11029 assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
11030 delta += alen;
11031 DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
11034 assign( rmode, get_sse_roundingmode() );
11035 putYMMReg(rG, binop(Iop_I32StoF32x8, mkexpr(rmode), mkexpr(argV)));
11037 return delta;
11041 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
11042 Long delta, Bool isAvx )
11044 UChar modrm = getUChar(delta);
11045 vassert(epartIsReg(modrm)); /* ensured by caller */
11046 UInt rE = eregOfRexRM(pfx,modrm);
11047 UInt rG = gregOfRexRM(pfx,modrm);
11048 IRTemp t0 = newTemp(Ity_V128);
11049 IRTemp t1 = newTemp(Ity_I32);
11050 assign(t0, getXMMReg(rE));
11051 assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
11052 putIReg32(rG, mkexpr(t1));
11053 DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
11054 nameIReg32(rG));
11055 delta += 1;
11056 return delta;
11060 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
11061 Long delta )
11063 UChar modrm = getUChar(delta);
11064 vassert(epartIsReg(modrm)); /* ensured by caller */
11065 UInt rE = eregOfRexRM(pfx,modrm);
11066 UInt rG = gregOfRexRM(pfx,modrm);
11067 IRTemp t0 = newTemp(Ity_V128);
11068 IRTemp t1 = newTemp(Ity_V128);
11069 IRTemp t2 = newTemp(Ity_I16);
11070 IRTemp t3 = newTemp(Ity_I16);
11071 assign(t0, getYMMRegLane128(rE, 0));
11072 assign(t1, getYMMRegLane128(rE, 1));
11073 assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
11074 assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
11075 putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
11076 DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11077 delta += 1;
11078 return delta;
11082 /* FIXME: why not just use InterleaveLO / InterleaveHI? I think the
11083 relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
11084 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
11085 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
11087 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11088 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11089 breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11090 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11091 IRTemp res = newTemp(Ity_V128);
11092 assign(res, xIsH ? mkV128from32s( s3, d3, s2, d2 )
11093 : mkV128from32s( s1, d1, s0, d0 ));
11094 return res;
11098 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
11099 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
11100 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
11102 IRTemp s1 = newTemp(Ity_I64);
11103 IRTemp s0 = newTemp(Ity_I64);
11104 IRTemp d1 = newTemp(Ity_I64);
11105 IRTemp d0 = newTemp(Ity_I64);
11106 assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11107 assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
11108 assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11109 assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
11110 IRTemp res = newTemp(Ity_V128);
11111 assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
11112 : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
11113 return res;
11117 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
11118 Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
11119 or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
11120 way. */
11121 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
11123 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11124 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11125 breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
11126 breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
11127 IRTemp res = newTemp(Ity_V256);
11128 assign(res, xIsH
11129 ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
11130 mkexpr(s1), mkexpr(d1))
11131 : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
11132 mkexpr(s0), mkexpr(d0)));
11133 return res;
11137 /* FIXME: this is really bad. Surely can do something better here?
11138 One observation is that the steering in the upper and lower 128 bit
11139 halves is the same as with math_UNPCKxPS_128, so we simply split
11140 into two halves, and use that. Consequently any improvement in
11141 math_UNPCKxPS_128 (probably, to use interleave-style primops)
11142 benefits this too. */
11143 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
11145 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11146 IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11147 breakupV256toV128s( sV, &sVhi, &sVlo );
11148 breakupV256toV128s( dV, &dVhi, &dVlo );
11149 IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
11150 IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
11151 IRTemp rV = newTemp(Ity_V256);
11152 assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11153 return rV;
11157 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11159 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11160 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11161 vassert(imm8 < 256);
11163 breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11164 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11166 # define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
11167 # define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11168 IRTemp res = newTemp(Ity_V128);
11169 assign(res,
11170 mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
11171 SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
11172 # undef SELD
11173 # undef SELS
11174 return res;
11178 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
11179 identically. Hence do the clueless thing and use math_SHUFPS_128
11180 twice. */
11181 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11183 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11184 IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11185 breakupV256toV128s( sV, &sVhi, &sVlo );
11186 breakupV256toV128s( dV, &dVhi, &dVlo );
11187 IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
11188 IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
11189 IRTemp rV = newTemp(Ity_V256);
11190 assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11191 return rV;
11195 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11197 IRTemp s1 = newTemp(Ity_I64);
11198 IRTemp s0 = newTemp(Ity_I64);
11199 IRTemp d1 = newTemp(Ity_I64);
11200 IRTemp d0 = newTemp(Ity_I64);
11202 assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11203 assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
11204 assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11205 assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
11207 # define SELD(n) mkexpr((n)==0 ? d0 : d1)
11208 # define SELS(n) mkexpr((n)==0 ? s0 : s1)
11210 IRTemp res = newTemp(Ity_V128);
11211 assign(res, binop( Iop_64HLtoV128,
11212 SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
11214 # undef SELD
11215 # undef SELS
11216 return res;
11220 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11222 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11223 IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11224 breakupV256toV128s( sV, &sVhi, &sVlo );
11225 breakupV256toV128s( dV, &dVhi, &dVlo );
11226 IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
11227 IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
11228 IRTemp rV = newTemp(Ity_V256);
11229 assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11230 return rV;
11234 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11236 UShort imm8_mask_16;
11237 IRTemp imm8_mask = newTemp(Ity_V128);
11239 switch( imm8 & 3 ) {
11240 case 0: imm8_mask_16 = 0x0000; break;
11241 case 1: imm8_mask_16 = 0x00FF; break;
11242 case 2: imm8_mask_16 = 0xFF00; break;
11243 case 3: imm8_mask_16 = 0xFFFF; break;
11244 default: vassert(0); break;
11246 assign( imm8_mask, mkV128( imm8_mask_16 ) );
11248 IRTemp res = newTemp(Ity_V128);
11249 assign ( res, binop( Iop_OrV128,
11250 binop( Iop_AndV128, mkexpr(sV),
11251 mkexpr(imm8_mask) ),
11252 binop( Iop_AndV128, mkexpr(dV),
11253 unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
11254 return res;
11258 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11260 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11261 IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11262 breakupV256toV128s( sV, &sVhi, &sVlo );
11263 breakupV256toV128s( dV, &dVhi, &dVlo );
11264 IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
11265 IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
11266 IRTemp rV = newTemp(Ity_V256);
11267 assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11268 return rV;
11272 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11274 UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
11275 0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
11276 0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
11277 0xFFFF };
11278 IRTemp imm8_mask = newTemp(Ity_V128);
11279 assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
11281 IRTemp res = newTemp(Ity_V128);
11282 assign ( res, binop( Iop_OrV128,
11283 binop( Iop_AndV128, mkexpr(sV),
11284 mkexpr(imm8_mask) ),
11285 binop( Iop_AndV128, mkexpr(dV),
11286 unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
11287 return res;
11291 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11293 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11294 IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11295 breakupV256toV128s( sV, &sVhi, &sVlo );
11296 breakupV256toV128s( dV, &dVhi, &dVlo );
11297 IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
11298 IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
11299 IRTemp rV = newTemp(Ity_V256);
11300 assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11301 return rV;
11305 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11307 /* Make imm16 be a 16-bit version of imm8, formed by duplicating each
11308 bit in imm8. */
11309 Int i;
11310 UShort imm16 = 0;
11311 for (i = 0; i < 8; i++) {
11312 if (imm8 & (1 << i))
11313 imm16 |= (3 << (2*i));
11315 IRTemp imm16_mask = newTemp(Ity_V128);
11316 assign( imm16_mask, mkV128( imm16 ));
11318 IRTemp res = newTemp(Ity_V128);
11319 assign ( res, binop( Iop_OrV128,
11320 binop( Iop_AndV128, mkexpr(sV),
11321 mkexpr(imm16_mask) ),
11322 binop( Iop_AndV128, mkexpr(dV),
11323 unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
11324 return res;
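/* For example, imm8 == 0xA5 expands to imm16 == 0xCC33; mkV128 then
   widens each of those 16 bits into a whole byte of the 128-bit mask,
   so every set bit of imm8 selects one 16-bit lane from sV and every
   clear bit keeps the corresponding lane of dV. */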
11328 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
11330 /* This is a really poor translation -- could be improved if
11331 performance critical */
11332 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11333 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11334 breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11335 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11336 IRTemp res = newTemp(Ity_V128);
11337 assign(res, binop(Iop_64HLtoV128,
11338 binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
11339 binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
11340 return res;
11344 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
11346 /* This is a really poor translation -- could be improved if
11347 performance critical */
11348 IRTemp sHi, sLo, dHi, dLo;
11349 sHi = sLo = dHi = dLo = IRTemp_INVALID;
11350 breakupV256toV128s( dV, &dHi, &dLo);
11351 breakupV256toV128s( sV, &sHi, &sLo);
11352 IRTemp res = newTemp(Ity_V256);
11353 assign(res, binop(Iop_V128HLtoV256,
11354 mkexpr(math_PMULUDQ_128(sHi, dHi)),
11355 mkexpr(math_PMULUDQ_128(sLo, dLo))));
11356 return res;
11360 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
11362 /* This is a really poor translation -- could be improved if
11363 performance critical */
11364 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11365 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11366 breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11367 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11368 IRTemp res = newTemp(Ity_V128);
11369 assign(res, binop(Iop_64HLtoV128,
11370 binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
11371 binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
11372 return res;
11376 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
11378 /* This is a really poor translation -- could be improved if
11379 performance critical */
11380 IRTemp sHi, sLo, dHi, dLo;
11381 sHi = sLo = dHi = dLo = IRTemp_INVALID;
11382 breakupV256toV128s( dV, &dHi, &dLo);
11383 breakupV256toV128s( sV, &sHi, &sLo);
11384 IRTemp res = newTemp(Ity_V256);
11385 assign(res, binop(Iop_V128HLtoV256,
11386 mkexpr(math_PMULDQ_128(sHi, dHi)),
11387 mkexpr(math_PMULDQ_128(sLo, dLo))));
11388 return res;
11392 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
11394 IRTemp sVhi, sVlo, dVhi, dVlo;
11395 IRTemp resHi = newTemp(Ity_I64);
11396 IRTemp resLo = newTemp(Ity_I64);
11397 sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
11398 breakupV128to64s( sV, &sVhi, &sVlo );
11399 breakupV128to64s( dV, &dVhi, &dVlo );
11400 assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11401 "amd64g_calculate_mmx_pmaddwd",
11402 &amd64g_calculate_mmx_pmaddwd,
11403 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
11404 assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11405 "amd64g_calculate_mmx_pmaddwd",
11406 &amd64g_calculate_mmx_pmaddwd,
11407 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
11408 IRTemp res = newTemp(Ity_V128);
11409 assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
11410 return res;
11414 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
11416 IRTemp sHi, sLo, dHi, dLo;
11417 sHi = sLo = dHi = dLo = IRTemp_INVALID;
11418 breakupV256toV128s( dV, &dHi, &dLo);
11419 breakupV256toV128s( sV, &sHi, &sLo);
11420 IRTemp res = newTemp(Ity_V256);
11421 assign(res, binop(Iop_V128HLtoV256,
11422 mkexpr(math_PMADDWD_128(dHi, sHi)),
11423 mkexpr(math_PMADDWD_128(dLo, sLo))));
11424 return res;
11428 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
11430 IRTemp addV = newTemp(Ity_V128);
11431 IRTemp subV = newTemp(Ity_V128);
11432 IRTemp a1 = newTemp(Ity_I64);
11433 IRTemp s0 = newTemp(Ity_I64);
11434 IRTemp rm = newTemp(Ity_I32);
11436 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11437 assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11438 assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11440 assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11441 assign( s0, unop(Iop_V128to64, mkexpr(subV) ));
11443 IRTemp res = newTemp(Ity_V128);
11444 assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11445 return res;
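/* The net effect: result = [ dHi + sHi : dLo - sLo ], i.e. subtract
   in the low lane and add in the high lane, which is the ADDSUBPD
   behaviour the name suggests. */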
11449 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
11451 IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11452 IRTemp addV = newTemp(Ity_V256);
11453 IRTemp subV = newTemp(Ity_V256);
11454 IRTemp rm = newTemp(Ity_I32);
11455 a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11457 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11458 assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11459 assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11461 breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
11462 breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
11464 IRTemp res = newTemp(Ity_V256);
11465 assign( res, mkV256from64s( a3, s2, a1, s0 ) );
11466 return res;
11470 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
11472 IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11473 IRTemp addV = newTemp(Ity_V128);
11474 IRTemp subV = newTemp(Ity_V128);
11475 IRTemp rm = newTemp(Ity_I32);
11476 a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11478 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11479 assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11480 assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11482 breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
11483 breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
11485 IRTemp res = newTemp(Ity_V128);
11486 assign( res, mkV128from32s( a3, s2, a1, s0 ) );
11487 return res;
11491 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
11493 IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
11494 IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
11495 IRTemp addV = newTemp(Ity_V256);
11496 IRTemp subV = newTemp(Ity_V256);
11497 IRTemp rm = newTemp(Ity_I32);
11498 a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
11499 s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11501 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11502 assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11503 assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11505 breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
11506 breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
11508 IRTemp res = newTemp(Ity_V256);
11509 assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
11510 return res;
11514 /* Handle 128 bit PSHUFLW and PSHUFHW. */
11515 static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
11516 Long delta, Bool isAvx, Bool xIsH )
11518 IRTemp addr = IRTemp_INVALID;
11519 Int alen = 0;
11520 HChar dis_buf[50];
11521 UChar modrm = getUChar(delta);
11522 UInt rG = gregOfRexRM(pfx,modrm);
11523 UInt imm8;
11524 IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
11525 s3 = s2 = s1 = s0 = IRTemp_INVALID;
11526 sV = newTemp(Ity_V128);
11527 dV = newTemp(Ity_V128);
11528 sVmut = newTemp(Ity_I64);
11529 dVmut = newTemp(Ity_I64);
11530 sVcon = newTemp(Ity_I64);
11531 if (epartIsReg(modrm)) {
11532 UInt rE = eregOfRexRM(pfx,modrm);
11533 assign( sV, getXMMReg(rE) );
11534 imm8 = (UInt)getUChar(delta+1);
11535 delta += 1+1;
11536 DIP("%spshuf%cw $%u,%s,%s\n",
11537 isAvx ? "v" : "", xIsH ? 'h' : 'l',
11538 imm8, nameXMMReg(rE), nameXMMReg(rG));
11539 } else {
11540 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11541 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11542 imm8 = (UInt)getUChar(delta+alen);
11543 delta += alen+1;
11544 DIP("%spshuf%cw $%u,%s,%s\n",
11545 isAvx ? "v" : "", xIsH ? 'h' : 'l',
11546 imm8, dis_buf, nameXMMReg(rG));
11549 /* Get the to-be-changed (mut) and unchanging (con) bits of the
11550 source. */
11551 assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64, mkexpr(sV)) );
11552 assign( sVcon, unop(xIsH ? Iop_V128to64 : Iop_V128HIto64, mkexpr(sV)) );
11554 breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
11555 # define SEL(n) \
11556 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11557 assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
11558 SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
11559 # undef SEL
11561 assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
11562 : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
11564 (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
11565 return delta;
11569 /* Handle 256 bit PSHUFLW and PSHUFHW. */
11570 static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
11571 Long delta, Bool xIsH )
11573 IRTemp addr = IRTemp_INVALID;
11574 Int alen = 0;
11575 HChar dis_buf[50];
11576 UChar modrm = getUChar(delta);
11577 UInt rG = gregOfRexRM(pfx,modrm);
11578 UInt imm8;
11579 IRTemp sV, s[8], sV64[4], dVhi, dVlo;
11580 sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
11581 s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
11582 sV = newTemp(Ity_V256);
11583 dVhi = newTemp(Ity_I64);
11584 dVlo = newTemp(Ity_I64);
11585 if (epartIsReg(modrm)) {
11586 UInt rE = eregOfRexRM(pfx,modrm);
11587 assign( sV, getYMMReg(rE) );
11588 imm8 = (UInt)getUChar(delta+1);
11589 delta += 1+1;
11590 DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11591 imm8, nameYMMReg(rE), nameYMMReg(rG));
11592 } else {
11593 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11594 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
11595 imm8 = (UInt)getUChar(delta+alen);
11596 delta += alen+1;
11597 DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11598 imm8, dis_buf, nameYMMReg(rG));
11601 breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
11602 breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
11603 breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );
11605 assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
11606 s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
11607 assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
11608 s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
11609 putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
11610 xIsH ? sV64[2] : dVhi,
11611 xIsH ? dVlo : sV64[1],
11612 xIsH ? sV64[0] : dVlo ) );
11613 return delta;
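/* Same selection as the 128-bit case, but the single imm8 is applied
   independently to the mutable quadword of each 128-bit half; the
   other quadwords (sV64[2] and sV64[0] for the 'h' form, sV64[3] and
   sV64[1] for the 'l' form) pass through unchanged. */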
11617 static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
11618 Long delta, Bool isAvx )
11620 Long deltaIN = delta;
11621 UChar modrm = getUChar(delta);
11622 UInt rG = gregOfRexRM(pfx,modrm);
11623 IRTemp sV = newTemp(Ity_V128);
11624 IRTemp d16 = newTemp(Ity_I16);
11625 UInt imm8;
11626 IRTemp s0, s1, s2, s3;
11627 if (epartIsReg(modrm)) {
11628 UInt rE = eregOfRexRM(pfx,modrm);
11629 assign(sV, getXMMReg(rE));
11630 imm8 = getUChar(delta+1) & 7;
11631 delta += 1+1;
11632 DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
11633 imm8, nameXMMReg(rE), nameIReg32(rG));
11634 } else {
11635 /* The memory case is disallowed, apparently. */
11636 return deltaIN; /* FAIL */
11638 s3 = s2 = s1 = s0 = IRTemp_INVALID;
11639 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11640 switch (imm8) {
11641 case 0: assign(d16, unop(Iop_32to16, mkexpr(s0))); break;
11642 case 1: assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
11643 case 2: assign(d16, unop(Iop_32to16, mkexpr(s1))); break;
11644 case 3: assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
11645 case 4: assign(d16, unop(Iop_32to16, mkexpr(s2))); break;
11646 case 5: assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
11647 case 6: assign(d16, unop(Iop_32to16, mkexpr(s3))); break;
11648 case 7: assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
11649 default: vassert(0);
11651 putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
11652 return delta;
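/* The switch above just extracts 16-bit lane imm8 (0 .. 7) of the
   source: the V128 is viewed as four 32-bit chunks, even imm8 values
   take the low half of chunk imm8/2 and odd values take the high
   half.  E.g. imm8 == 5 extracts bits 95:80, the upper 16 bits of s2,
   zero-extended into the 32-bit destination. */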
11656 static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
11657 Long delta, Bool isAvx )
11659 IRTemp addr = IRTemp_INVALID;
11660 Int alen = 0;
11661 HChar dis_buf[50];
11662 UChar modrm = getUChar(delta);
11663 IRTemp arg64 = newTemp(Ity_I64);
11664 UInt rG = gregOfRexRM(pfx,modrm);
11665 const HChar* mbV = isAvx ? "v" : "";
11666 if (epartIsReg(modrm)) {
11667 UInt rE = eregOfRexRM(pfx,modrm);
11668 assign( arg64, getXMMRegLane64(rE, 0) );
11669 delta += 1;
11670 DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
11671 } else {
11672 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11673 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11674 delta += alen;
11675 DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
11677 putXMMRegLane64F(
11678 rG, 0,
11679 unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
11681 putXMMRegLane64F(
11682 rG, 1,
11683 unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
11685 if (isAvx)
11686 putYMMRegLane128(rG, 1, mkV128(0));
11687 return delta;
11691 static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
11692 Long delta, Bool isAvx )
11694 IRTemp addr = IRTemp_INVALID;
11695 Int alen = 0;
11696 HChar dis_buf[50];
11697 UChar modrm = getUChar(delta);
11698 vassert(!epartIsReg(modrm)); /* ensured by caller */
11699 vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
11701 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11702 delta += alen;
11704 /* Fake up a native SSE mxcsr word. The only thing it depends on
11705 is SSEROUND[1:0], so call a clean helper to cook it up.
11707 /* ULong amd64g_create_mxcsr ( ULong sseround ) */
11708 DIP("%sstmxcsr %s\n", isAvx ? "v" : "", dis_buf);
11709 storeLE(
11710 mkexpr(addr),
11711 unop(Iop_64to32,
11712 mkIRExprCCall(
11713 Ity_I64, 0/*regp*/,
11714 "amd64g_create_mxcsr", &amd64g_create_mxcsr,
11715 mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
11719 return delta;
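/* Rounding control lives in MXCSR bits 14:13, so the helper only has
   to merge SSEROUND[1:0] into an otherwise fixed MXCSR image; nothing
   else can vary here, because the rounding mode is the only piece of
   %mxcsr tracked in the guest state. */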
11723 static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
11724 Long delta, Bool isAvx )
11726 IRTemp addr = IRTemp_INVALID;
11727 Int alen = 0;
11728 HChar dis_buf[50];
11729 UChar modrm = getUChar(delta);
11730 vassert(!epartIsReg(modrm)); /* ensured by caller */
11731 vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
11733 IRTemp t64 = newTemp(Ity_I64);
11734 IRTemp ew = newTemp(Ity_I32);
11736 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11737 delta += alen;
11738 DIP("%sldmxcsr %s\n", isAvx ? "v" : "", dis_buf);
11740 /* The only thing we observe in %mxcsr is the rounding mode.
11741 Therefore, pass the 32-bit value (SSE native-format control
11742 word) to a clean helper, getting back a 64-bit value, the
11743 lower half of which is the SSEROUND value to store, and the
11744 upper half of which is the emulation-warning token which may
11745 be generated.
11747 /* ULong amd64g_check_ldmxcsr ( ULong ); */
11748 assign( t64, mkIRExprCCall(
11749 Ity_I64, 0/*regparms*/,
11750 "amd64g_check_ldmxcsr",
11751 &amd64g_check_ldmxcsr,
11752 mkIRExprVec_1(
11753 unop(Iop_32Uto64,
11754 loadLE(Ity_I32, mkexpr(addr))
11760 put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
11761 assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
11762 put_emwarn( mkexpr(ew) );
11763 /* Finally, if an emulation warning was reported, side-exit to
11764 the next insn, reporting the warning, so that Valgrind's
11765 dispatcher sees the warning. */
11766 stmt(
11767 IRStmt_Exit(
11768 binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
11769 Ijk_EmWarn,
11770 IRConst_U64(guest_RIP_bbstart+delta),
11771 OFFB_RIP
11774 return delta;
11778 static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
11780 /* ------ rfbm[0] gates the x87 state ------ */
11782 /* Uses dirty helper:
11783 void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
11785 IRDirty* d0 = unsafeIRDirty_0_N (
11786 0/*regparms*/,
11787 "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
11788 &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
11789 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
11791 d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
11792 mkU64(1));
11794 /* Declare we're writing memory. Really, bytes 24 through 31
11795 (MXCSR and MXCSR_MASK) aren't written, but we can't express more
11796 than 1 memory area here, so just mark the whole thing as
11797 written. */
11798 d0->mFx = Ifx_Write;
11799 d0->mAddr = mkexpr(addr);
11800 d0->mSize = 160;
11802 /* declare we're reading guest state */
11803 d0->nFxState = 5;
11804 vex_bzero(&d0->fxState, sizeof(d0->fxState));
11806 d0->fxState[0].fx = Ifx_Read;
11807 d0->fxState[0].offset = OFFB_FTOP;
11808 d0->fxState[0].size = sizeof(UInt);
11810 d0->fxState[1].fx = Ifx_Read;
11811 d0->fxState[1].offset = OFFB_FPREGS;
11812 d0->fxState[1].size = 8 * sizeof(ULong);
11814 d0->fxState[2].fx = Ifx_Read;
11815 d0->fxState[2].offset = OFFB_FPTAGS;
11816 d0->fxState[2].size = 8 * sizeof(UChar);
11818 d0->fxState[3].fx = Ifx_Read;
11819 d0->fxState[3].offset = OFFB_FPROUND;
11820 d0->fxState[3].size = sizeof(ULong);
11822 d0->fxState[4].fx = Ifx_Read;
11823 d0->fxState[4].offset = OFFB_FC3210;
11824 d0->fxState[4].size = sizeof(ULong);
11826 stmt( IRStmt_Dirty(d0) );
11828 /* ------ rfbm[1] gates the SSE state ------ */
11830 IRTemp rfbm_1 = newTemp(Ity_I64);
11831 IRTemp rfbm_1or2 = newTemp(Ity_I64);
11832 assign(rfbm_1, binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
11833 assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
11835 IRExpr* guard_1 = binop(Iop_CmpEQ64, mkexpr(rfbm_1), mkU64(2));
11836 IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
11838 /* Uses dirty helper:
11839 void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
11840 ( VexGuestAMD64State*, ULong )
11841 This creates only MXCSR and MXCSR_MASK. We need to do this if
11842 either components 1 (SSE) or 2 (AVX) are requested. Hence the
11843 guard condition is a bit more complex.
11845 IRDirty* d1 = unsafeIRDirty_0_N (
11846 0/*regparms*/,
11847 "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
11848 &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
11849 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
11851 d1->guard = guard_1or2;
11853 /* Declare we're writing memory: MXCSR and MXCSR_MASK. Note that
11854 the code for rfbm[0] just above claims a write of 0 .. 159, so
11855 this duplicates it. But at least correctly connects 24 .. 31 to
11856 the MXCSR guest state representation (SSEROUND field). */
11857 d1->mFx = Ifx_Write;
11858 d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
11859 d1->mSize = 8;
11861 /* declare we're reading guest state */
11862 d1->nFxState = 1;
11863 vex_bzero(&d1->fxState, sizeof(d1->fxState));
11865 d1->fxState[0].fx = Ifx_Read;
11866 d1->fxState[0].offset = OFFB_SSEROUND;
11867 d1->fxState[0].size = sizeof(ULong);
11869 /* Call the helper. This creates MXCSR and MXCSR_MASK but nothing
11870 else. We do the actual register array, XMM[0..15], separately,
11871 in order that any undefinedness in the XMM registers is tracked
11872 separately by Memcheck and does not "infect" the in-memory
11873 shadow for the other parts of the image. */
11874 stmt( IRStmt_Dirty(d1) );
11876 /* And now the XMMs themselves. */
11877 UInt reg;
11878 for (reg = 0; reg < 16; reg++) {
11879 stmt( IRStmt_StoreG(
11880 Iend_LE,
11881 binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
11882 getXMMReg(reg),
11883 guard_1
11887 /* ------ rfbm[2] gates the AVX state ------ */
11888 /* Component 2 is just a bunch of register saves, so we'll do it
11889 inline, just to be simple and to be Memcheck friendly. */
11891 IRTemp rfbm_2 = newTemp(Ity_I64);
11892 assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
11894 IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
11896 for (reg = 0; reg < 16; reg++) {
11897 stmt( IRStmt_StoreG(
11898 Iend_LE,
11899 binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
11900 getYMMRegLane128(reg,1),
11901 guard_2
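/* Layout of the XSAVE area as used by the sequence above (byte
   offsets from |addr|, following the standard FXSAVE/XSAVE layout):
       0 ..  23   x87 control/status/tag words etc      (component 0)
      24 ..  31   MXCSR and MXCSR_MASK                   (components 1/2)
      32 .. 159   x87 register stack ST0..ST7            (component 0)
     160 .. 415   XMM0..XMM15, 16 bytes each             (component 1)
     512 .. 575   XSAVE header (XSTATE_BV at offset 512)
     576 .. 831   upper 128 bits of YMM0..YMM15          (component 2)
   Bytes 416 .. 511 of the legacy area are left untouched by this
   sequence. */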
11907 static Long dis_XSAVE ( const VexAbiInfo* vbi,
11908 Prefix pfx, Long delta, Int sz )
11910 /* Note that the presence or absence of REX.W (indicated here by
11911 |sz|) slightly affects the written format: whether the saved FPU
11912 IP and DP pointers are 64 or 32 bits. But the helper function
11913 we call simply writes zero bits in the relevant fields, which
11914 are 64 bits regardless of what REX.W is, and so it's good enough
11915 (iow, equally broken) in both cases. */
11916 IRTemp addr = IRTemp_INVALID;
11917 Int alen = 0;
11918 HChar dis_buf[50];
11919 UChar modrm = getUChar(delta);
11920 vassert(!epartIsReg(modrm)); /* ensured by caller */
11921 vassert(sz == 4 || sz == 8); /* ditto */
11923 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11924 delta += alen;
11925 gen_SIGNAL_if_not_64_aligned(vbi, addr);
11927 DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
11929 /* VEX's caller is assumed to have checked this. */
11930 const ULong aSSUMED_XCR0_VALUE = 7;
11932 IRTemp rfbm = newTemp(Ity_I64);
11933 assign(rfbm,
11934 binop(Iop_And64,
11935 binop(Iop_Or64,
11936 binop(Iop_Shl64,
11937 unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
11938 unop(Iop_32Uto64, getIRegRAX(4))),
11939 mkU64(aSSUMED_XCR0_VALUE)));
11941 gen_XSAVE_SEQUENCE(addr, rfbm);
11943 /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
11944 OR-ing the RFBM value into it. */
11945 IRTemp addr_plus_512 = newTemp(Ity_I64);
11946 assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
11947 storeLE( mkexpr(addr_plus_512),
11948 binop(Iop_Or8,
11949 unop(Iop_64to8, mkexpr(rfbm)),
11950 loadLE(Ity_I8, mkexpr(addr_plus_512))) );
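   /* For instance, executing XSAVE with EDX:EAX == 0:7 (against the
      assumed XCR0 of 7) gives rfbm == 7, so all three components --
      x87 (bit 0), SSE (bit 1) and AVX (bit 2) -- are written, and the
      OR above then sets the low three bits of XSTATE_BV at offset
      512. */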
11952 return delta;
11956 static Long dis_FXSAVE ( const VexAbiInfo* vbi,
11957 Prefix pfx, Long delta, Int sz )
11959 /* See comment in dis_XSAVE about the significance of REX.W. */
11960 IRTemp addr = IRTemp_INVALID;
11961 Int alen = 0;
11962 HChar dis_buf[50];
11963 UChar modrm = getUChar(delta);
11964 vassert(!epartIsReg(modrm)); /* ensured by caller */
11965 vassert(sz == 4 || sz == 8); /* ditto */
11967 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11968 delta += alen;
11969 gen_SIGNAL_if_not_16_aligned(vbi, addr);
11971 DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
11973 /* FXSAVE is just XSAVE with components 0 and 1 selected. Set rfbm
11974 to 0b011, generate the XSAVE sequence accordingly, and let iropt
11975 fold out the unused (AVX) parts. */
11976 IRTemp rfbm = newTemp(Ity_I64);
11977 assign(rfbm, mkU64(3));
11978 gen_XSAVE_SEQUENCE(addr, rfbm);
11980 return delta;
11984 static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
11986 /* ------ rfbm[0] gates the x87 state ------ */
11988 /* If rfbm[0] == 1, we have to write the x87 state. If
11989 xstate_bv[0] == 1, we will read it from the memory image, else
11990 we'll set it to initial values. Doing this with a helper
11991 function and getting the definedness flow annotations correct is
11992 too difficult, so generate stupid but simple code: first set the
11993 registers to initial values, regardless of xstate_bv[0]. Then,
11994 conditionally restore from the memory image. */
11996 IRTemp rfbm_0 = newTemp(Ity_I64);
11997 IRTemp xstate_bv_0 = newTemp(Ity_I64);
11998 IRTemp restore_0 = newTemp(Ity_I64);
11999 assign(rfbm_0, binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
12000 assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
12001 assign(restore_0, binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
12003 gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
12005 /* Uses dirty helper:
12006 void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
12008 IRDirty* d0 = unsafeIRDirty_0_N (
12009 0/*regparms*/,
12010 "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
12011 &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
12012 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
12014 d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
12016 /* Declare we're reading memory. Really, bytes 24 through 31
12017 (MXCSR and MXCSR_MASK) aren't read, but we can't express more
12018 than 1 memory area here, so just mark the whole thing as
12019 read. */
12020 d0->mFx = Ifx_Read;
12021 d0->mAddr = mkexpr(addr);
12022 d0->mSize = 160;
12024 /* declare we're writing guest state */
12025 d0->nFxState = 5;
12026 vex_bzero(&d0->fxState, sizeof(d0->fxState));
12028 d0->fxState[0].fx = Ifx_Write;
12029 d0->fxState[0].offset = OFFB_FTOP;
12030 d0->fxState[0].size = sizeof(UInt);
12032 d0->fxState[1].fx = Ifx_Write;
12033 d0->fxState[1].offset = OFFB_FPREGS;
12034 d0->fxState[1].size = 8 * sizeof(ULong);
12036 d0->fxState[2].fx = Ifx_Write;
12037 d0->fxState[2].offset = OFFB_FPTAGS;
12038 d0->fxState[2].size = 8 * sizeof(UChar);
12040 d0->fxState[3].fx = Ifx_Write;
12041 d0->fxState[3].offset = OFFB_FPROUND;
12042 d0->fxState[3].size = sizeof(ULong);
12044 d0->fxState[4].fx = Ifx_Write;
12045 d0->fxState[4].offset = OFFB_FC3210;
12046 d0->fxState[4].size = sizeof(ULong);
12048 stmt( IRStmt_Dirty(d0) );
12050 /* ------ rfbm[1] gates the SSE state ------ */
12052 /* Same scheme as component 0: first zero it out, and then possibly
12053 restore from the memory area. */
12054 IRTemp rfbm_1 = newTemp(Ity_I64);
12055 IRTemp xstate_bv_1 = newTemp(Ity_I64);
12056 IRTemp restore_1 = newTemp(Ity_I64);
12057 assign(rfbm_1, binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
12058 assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
12059 assign(restore_1, binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
12060 IRExpr* rfbm_1e = binop(Iop_CmpNE64, mkexpr(rfbm_1), mkU64(0));
12061 IRExpr* restore_1e = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
12063 IRTemp rfbm_1or2 = newTemp(Ity_I64);
12064 IRTemp xstate_bv_1or2 = newTemp(Ity_I64);
12065 IRTemp restore_1or2 = newTemp(Ity_I64);
12066 assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
12067 assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
12068 assign(restore_1or2, binop(Iop_And64, mkexpr(rfbm_1or2),
12069 mkexpr(xstate_bv_1or2)));
12070 IRExpr* rfbm_1or2e = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
12071 IRExpr* restore_1or2e = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
12073 /* The areas in question are: SSEROUND, and the XMM register array. */
12074 putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
12076 UInt reg;
12077 for (reg = 0; reg < 16; reg++) {
12078 putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
12081 /* And now possibly restore from MXCSR/MXCSR_MASK */
12082 /* Uses dirty helper:
12083 void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
12084 ( VexGuestAMD64State*, ULong )
12085 This restores from only MXCSR and MXCSR_MASK. We need to do
12086 this if either components 1 (SSE) or 2 (AVX) are requested.
12087 Hence the guard condition is a bit more complex.
12089 IRDirty* d1 = unsafeIRDirty_0_N (
12090 0/*regparms*/,
12091 "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
12092 &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
12093 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
12095 d1->guard = restore_1or2e;
12097 /* Declare we're reading memory: MXCSR and MXCSR_MASK. Note that
12098 the code for rfbm[0] just above claims a read of 0 .. 159, so
12099 this duplicates it. But at least correctly connects 24 .. 31 to
12100 the MXCSR guest state representation (SSEROUND field). */
12101 d1->mFx = Ifx_Read;
12102 d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
12103 d1->mSize = 8;
12105 /* declare we're writing guest state */
12106 d1->nFxState = 1;
12107 vex_bzero(&d1->fxState, sizeof(d1->fxState));
12109 d1->fxState[0].fx = Ifx_Write;
12110 d1->fxState[0].offset = OFFB_SSEROUND;
12111 d1->fxState[0].size = sizeof(ULong);
12113 /* Call the helper. This creates SSEROUND but nothing
12114 else. We do the actual register array, XMM[0..15], separately,
12115 in order that any undefinedness in the XMM registers is tracked
12116 separately by Memcheck and is not "infected" by the in-memory
12117 shadow for the other parts of the image. */
12118 stmt( IRStmt_Dirty(d1) );
12120 /* And now the XMMs themselves. For each register, we PUT either
12121 its old value, or the value loaded from memory. One convenient
12122 way to do that is with a conditional load that has, as its
12123 default value, the old value of the register. */
12124 for (reg = 0; reg < 16; reg++) {
12125 IRExpr* ea = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
12126 IRExpr* alt = getXMMReg(reg);
12127 IRTemp loadedValue = newTemp(Ity_V128);
12128 stmt( IRStmt_LoadG(Iend_LE,
12129 ILGop_IdentV128,
12130 loadedValue, ea, alt, restore_1e) );
12131 putXMMReg(reg, mkexpr(loadedValue));
12134 /* ------ rfbm[2] gates the AVX state ------ */
12135 /* Component 2 is just a bunch of register loads, so we'll do it
12136 inline, just to be simple and to be Memcheck friendly. */
12138 /* Same scheme as component 0: first zero it out, and then possibly
12139 restore from the memory area. */
12140 IRTemp rfbm_2 = newTemp(Ity_I64);
12141 IRTemp xstate_bv_2 = newTemp(Ity_I64);
12142 IRTemp restore_2 = newTemp(Ity_I64);
12143 assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
12144 assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
12145 assign(restore_2, binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
12147 IRExpr* rfbm_2e = binop(Iop_CmpNE64, mkexpr(rfbm_2), mkU64(0));
12148 IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
12150 for (reg = 0; reg < 16; reg++) {
12151 putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
12154 for (reg = 0; reg < 16; reg++) {
12155 IRExpr* ea = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
12156 IRExpr* alt = getYMMRegLane128(reg, 1);
12157 IRTemp loadedValue = newTemp(Ity_V128);
12158 stmt( IRStmt_LoadG(Iend_LE,
12159 ILGop_IdentV128,
12160 loadedValue, ea, alt, restore_2e) );
12161 putYMMRegLane128(reg, 1, mkexpr(loadedValue));
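/* Note on the guarded loads above: IRStmt_LoadG with ILGop_IdentV128
   either loads the 128-bit value from |ea| (when the guard is true)
   or yields |alt|, the register's current value, so a component that
   is not being restored simply keeps whatever the preceding guarded
   reset left in the register. */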
12166 static Long dis_XRSTOR ( const VexAbiInfo* vbi,
12167 Prefix pfx, Long delta, Int sz )
12169 /* As with XSAVE above, we ignore the value of REX.W since we're
12170 not bothering with the FPU DP and IP fields. */
12171 IRTemp addr = IRTemp_INVALID;
12172 Int alen = 0;
12173 HChar dis_buf[50];
12174 UChar modrm = getUChar(delta);
12175 vassert(!epartIsReg(modrm)); /* ensured by caller */
12176 vassert(sz == 4 || sz == 8); /* ditto */
12178 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12179 delta += alen;
12180 gen_SIGNAL_if_not_64_aligned(vbi, addr);
12182 DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
12184 /* VEX's caller is assumed to have checked this. */
12185 const ULong aSSUMED_XCR0_VALUE = 7;
12187 IRTemp rfbm = newTemp(Ity_I64);
12188 assign(rfbm,
12189 binop(Iop_And64,
12190 binop(Iop_Or64,
12191 binop(Iop_Shl64,
12192 unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
12193 unop(Iop_32Uto64, getIRegRAX(4))),
12194 mkU64(aSSUMED_XCR0_VALUE)));
12196 IRTemp xstate_bv = newTemp(Ity_I64);
12197 assign(xstate_bv, loadLE(Ity_I64,
12198 binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
12200 IRTemp xcomp_bv = newTemp(Ity_I64);
12201 assign(xcomp_bv, loadLE(Ity_I64,
12202 binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
12204 IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
12205 assign( xsavehdr_23_16,
12206 loadLE(Ity_I64,
12207 binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
12209 /* We must fault if
12210 * xcomp_bv[63] == 1, since this simulated CPU does not support
12211 the compaction extension.
12212 * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
12213 * any of the xsave header bytes 23 .. 8 are nonzero. This seems to
12214 imply that xcomp_bv must be zero.
12215 xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
12217 IRTemp fault_if_nonzero = newTemp(Ity_I64);
12218 assign(fault_if_nonzero,
12219 binop(Iop_Or64,
12220 binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
12221 binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
12222 stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
12223 Ijk_SigSEGV,
12224 IRConst_U64(guest_RIP_curr_instr),
12225 OFFB_RIP
12228 /* We are guaranteed now that both xstate_bv and rfbm are in the
12229 range 0 .. 7. Generate the restore sequence proper. */
12230 gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
12232 return delta;
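   /* Fault-check example for the code above: an in-memory image whose
      XSTATE_BV is, say, 0x10 (claiming a component this model does
      not support), or whose XCOMP_BV is nonzero, makes
      |fault_if_nonzero| nonzero, so the Ijk_SigSEGV exit is taken at
      the faulting instruction and nothing is restored. */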
12236 static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
12237 Prefix pfx, Long delta, Int sz )
12239 /* As with FXSAVE above we ignore the value of REX.W since we're
12240 not bothering with the FPU DP and IP fields. */
12241 IRTemp addr = IRTemp_INVALID;
12242 Int alen = 0;
12243 HChar dis_buf[50];
12244 UChar modrm = getUChar(delta);
12245 vassert(!epartIsReg(modrm)); /* ensured by caller */
12246 vassert(sz == 4 || sz == 8); /* ditto */
12248 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12249 delta += alen;
12250 gen_SIGNAL_if_not_16_aligned(vbi, addr);
12252 DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
12254 /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
12255 as if components 0 and 1 are set as present in XSTATE_BV in the
12256 XSAVE header. Therefore set both rfbm and xstate_bv to 0b011,
12257 generate the XRSTOR sequence accordingly, and let iropt fold out
12258 the unused (AVX) parts. */
12259 IRTemp three = newTemp(Ity_I64);
12260 assign(three, mkU64(3));
12261 gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
12263 return delta;
12267 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
12269 vassert(imm8 >= 0 && imm8 <= 7);
12271 // Create a V128 value which has the selected word in the
12272 // specified lane, and zeroes everywhere else.
12273 IRTemp tmp128 = newTemp(Ity_V128);
12274 IRTemp halfshift = newTemp(Ity_I64);
12275 assign(halfshift, binop(Iop_Shl64,
12276 unop(Iop_16Uto64, mkexpr(u16)),
12277 mkU8(16 * (imm8 & 3))));
12278 if (imm8 < 4) {
12279 assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
12280 } else {
12281 assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
12284 UShort mask = ~(3 << (imm8 * 2));
12285 IRTemp res = newTemp(Ity_V128);
12286 assign( res, binop(Iop_OrV128,
12287 mkexpr(tmp128),
12288 binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
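   /* Mask example: imm8 == 5 gives mask == (UShort)~(3 << 10)
      == 0xF3FF, and mkV128 expands each of those 16 bits to a whole
      byte, so exactly bytes 10 and 11 (16-bit lane 5) of v128 are
      cleared before OR-ing in halfshift, which holds u16 shifted into
      bits 95:80 of the upper half. */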
12289 return res;
12293 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
12295 IRTemp s1, s0, d1, d0;
12296 s1 = s0 = d1 = d0 = IRTemp_INVALID;
12298 breakupV128to64s( sV, &s1, &s0 );
12299 breakupV128to64s( dV, &d1, &d0 );
12301 IRTemp res = newTemp(Ity_V128);
12302 assign( res,
12303 binop(Iop_64HLtoV128,
12304 mkIRExprCCall(Ity_I64, 0/*regparms*/,
12305 "amd64g_calculate_mmx_psadbw",
12306 &amd64g_calculate_mmx_psadbw,
12307 mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
12308 mkIRExprCCall(Ity_I64, 0/*regparms*/,
12309 "amd64g_calculate_mmx_psadbw",
12310 &amd64g_calculate_mmx_psadbw,
12311 mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
12312 return res;
12316 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
12318 IRTemp sHi, sLo, dHi, dLo;
12319 sHi = sLo = dHi = dLo = IRTemp_INVALID;
12320 breakupV256toV128s( dV, &dHi, &dLo);
12321 breakupV256toV128s( sV, &sHi, &sLo);
12322 IRTemp res = newTemp(Ity_V256);
12323 assign(res, binop(Iop_V128HLtoV256,
12324 mkexpr(math_PSADBW_128(dHi, sHi)),
12325 mkexpr(math_PSADBW_128(dLo, sLo))));
12326 return res;
12330 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
12331 Long delta, Bool isAvx )
12333 IRTemp regD = newTemp(Ity_V128);
12334 IRTemp mask = newTemp(Ity_V128);
12335 IRTemp olddata = newTemp(Ity_V128);
12336 IRTemp newdata = newTemp(Ity_V128);
12337 IRTemp addr = newTemp(Ity_I64);
12338 UChar modrm = getUChar(delta);
12339 UInt rG = gregOfRexRM(pfx,modrm);
12340 UInt rE = eregOfRexRM(pfx,modrm);
12342 assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
12343 assign( regD, getXMMReg( rG ));
12345 /* Unfortunately can't do the obvious thing with SarN8x16
12346 here since that can't be re-emitted as SSE2 code - no such
12347 insn. */
12348 assign( mask,
12349 binop(Iop_64HLtoV128,
12350 binop(Iop_SarN8x8,
12351 getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
12352 mkU8(7) ),
12353 binop(Iop_SarN8x8,
12354 getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
12355 mkU8(7) ) ));
12356 assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
12357 assign( newdata, binop(Iop_OrV128,
12358 binop(Iop_AndV128,
12359 mkexpr(regD),
12360 mkexpr(mask) ),
12361 binop(Iop_AndV128,
12362 mkexpr(olddata),
12363 unop(Iop_NotV128, mkexpr(mask)))) );
12364 storeLE( mkexpr(addr), mkexpr(newdata) );
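   /* So each stored byte is selected by the top bit of the
      corresponding byte of the mask register: SarN8x8 by 7 turns
      every byte into 0x00 or 0xFF, and the AND/OR network above takes
      the byte from xmm(G) where the mask byte is 0xFF and rewrites
      the original memory byte where it is 0x00 -- a read-modify-write
      rendering of the insn's byte-granular store. */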
12366 delta += 1;
12367 DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
12368 nameXMMReg(rE), nameXMMReg(rG) );
12369 return delta;
12373 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
12374 Long delta, Bool isAvx )
12376 UChar modrm = getUChar(delta);
12377 UInt rG = gregOfRexRM(pfx,modrm);
12378 UInt rE = eregOfRexRM(pfx,modrm);
12379 IRTemp t0 = newTemp(Ity_I32);
12380 IRTemp t1 = newTemp(Ity_I32);
12381 IRTemp t2 = newTemp(Ity_I32);
12382 IRTemp t3 = newTemp(Ity_I32);
12383 delta += 1;
12384 assign( t0, binop( Iop_And32,
12385 binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
12386 mkU32(1) ));
12387 assign( t1, binop( Iop_And32,
12388 binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
12389 mkU32(2) ));
12390 assign( t2, binop( Iop_And32,
12391 binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
12392 mkU32(4) ));
12393 assign( t3, binop( Iop_And32,
12394 binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
12395 mkU32(8) ));
12396 putIReg32( rG, binop(Iop_Or32,
12397 binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12398 binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
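   /* Each term tN isolates the sign bit of F32 lane N: shifting lane
      N right by (31 - N) parks its sign bit at bit position N, and
      the AND with (1 << N) discards the rest, so the OR of the four
      terms is the usual 4-bit movmskps result in bits 3:0. */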
12399 DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
12400 nameXMMReg(rE), nameIReg32(rG));
12401 return delta;
12405 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
12407 UChar modrm = getUChar(delta);
12408 UInt rG = gregOfRexRM(pfx,modrm);
12409 UInt rE = eregOfRexRM(pfx,modrm);
12410 IRTemp t0 = newTemp(Ity_I32);
12411 IRTemp t1 = newTemp(Ity_I32);
12412 IRTemp t2 = newTemp(Ity_I32);
12413 IRTemp t3 = newTemp(Ity_I32);
12414 IRTemp t4 = newTemp(Ity_I32);
12415 IRTemp t5 = newTemp(Ity_I32);
12416 IRTemp t6 = newTemp(Ity_I32);
12417 IRTemp t7 = newTemp(Ity_I32);
12418 delta += 1;
12419 assign( t0, binop( Iop_And32,
12420 binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
12421 mkU32(1) ));
12422 assign( t1, binop( Iop_And32,
12423 binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
12424 mkU32(2) ));
12425 assign( t2, binop( Iop_And32,
12426 binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
12427 mkU32(4) ));
12428 assign( t3, binop( Iop_And32,
12429 binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
12430 mkU32(8) ));
12431 assign( t4, binop( Iop_And32,
12432 binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
12433 mkU32(16) ));
12434 assign( t5, binop( Iop_And32,
12435 binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
12436 mkU32(32) ));
12437 assign( t6, binop( Iop_And32,
12438 binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
12439 mkU32(64) ));
12440 assign( t7, binop( Iop_And32,
12441 binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
12442 mkU32(128) ));
12443 putIReg32( rG, binop(Iop_Or32,
12444 binop(Iop_Or32,
12445 binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12446 binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
12447 binop(Iop_Or32,
12448 binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
12449 binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
12450 DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
12451 return delta;
12455 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
12456 Long delta, Bool isAvx )
12458 UChar modrm = getUChar(delta);
12459 UInt rG = gregOfRexRM(pfx,modrm);
12460 UInt rE = eregOfRexRM(pfx,modrm);
12461 IRTemp t0 = newTemp(Ity_I32);
12462 IRTemp t1 = newTemp(Ity_I32);
12463 delta += 1;
12464 assign( t0, binop( Iop_And32,
12465 binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
12466 mkU32(1) ));
12467 assign( t1, binop( Iop_And32,
12468 binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
12469 mkU32(2) ));
12470 putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
12471 DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
12472 nameXMMReg(rE), nameIReg32(rG));
12473 return delta;
12477 static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
12479 UChar modrm = getUChar(delta);
12480 UInt rG = gregOfRexRM(pfx,modrm);
12481 UInt rE = eregOfRexRM(pfx,modrm);
12482 IRTemp t0 = newTemp(Ity_I32);
12483 IRTemp t1 = newTemp(Ity_I32);
12484 IRTemp t2 = newTemp(Ity_I32);
12485 IRTemp t3 = newTemp(Ity_I32);
12486 delta += 1;
12487 assign( t0, binop( Iop_And32,
12488 binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
12489 mkU32(1) ));
12490 assign( t1, binop( Iop_And32,
12491 binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
12492 mkU32(2) ));
12493 assign( t2, binop( Iop_And32,
12494 binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
12495 mkU32(4) ));
12496 assign( t3, binop( Iop_And32,
12497 binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
12498 mkU32(8) ));
12499 putIReg32( rG, binop(Iop_Or32,
12500 binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12501 binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
12502 DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
12503 return delta;
12507 /* Note, this also handles SSE(1) insns. */
12508 __attribute__((noinline))
12509 static
12510 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
12511 const VexArchInfo* archinfo,
12512 const VexAbiInfo* vbi,
12513 Prefix pfx, Int sz, Long deltaIN,
12514 DisResult* dres )
12516 IRTemp addr = IRTemp_INVALID;
12517 IRTemp t0 = IRTemp_INVALID;
12518 IRTemp t1 = IRTemp_INVALID;
12519 IRTemp t2 = IRTemp_INVALID;
12520 IRTemp t3 = IRTemp_INVALID;
12521 IRTemp t4 = IRTemp_INVALID;
12522 IRTemp t5 = IRTemp_INVALID;
12523 IRTemp t6 = IRTemp_INVALID;
12524 UChar modrm = 0;
12525 Int alen = 0;
12526 HChar dis_buf[50];
12528 *decode_OK = False;
12530 Long delta = deltaIN;
12531 UChar opc = getUChar(delta);
12532 delta++;
12533 switch (opc) {
12535 case 0x10:
12536 if (have66noF2noF3(pfx)
12537 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12538 /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
12539 modrm = getUChar(delta);
12540 if (epartIsReg(modrm)) {
12541 putXMMReg( gregOfRexRM(pfx,modrm),
12542 getXMMReg( eregOfRexRM(pfx,modrm) ));
12543 DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12544 nameXMMReg(gregOfRexRM(pfx,modrm)));
12545 delta += 1;
12546 } else {
12547 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12548 putXMMReg( gregOfRexRM(pfx,modrm),
12549 loadLE(Ity_V128, mkexpr(addr)) );
12550 DIP("movupd %s,%s\n", dis_buf,
12551 nameXMMReg(gregOfRexRM(pfx,modrm)));
12552 delta += alen;
12554 goto decode_success;
12556 /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
12557 G (lo half xmm). If E is mem, upper half of G is zeroed out.
12558 If E is reg, upper half of G is unchanged. */
12559 if (haveF2no66noF3(pfx)
12560 && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
12561 modrm = getUChar(delta);
12562 if (epartIsReg(modrm)) {
12563 putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12564 getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
12565 DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12566 nameXMMReg(gregOfRexRM(pfx,modrm)));
12567 delta += 1;
12568 } else {
12569 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12570 putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
12571 putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12572 loadLE(Ity_I64, mkexpr(addr)) );
12573 DIP("movsd %s,%s\n", dis_buf,
12574 nameXMMReg(gregOfRexRM(pfx,modrm)));
12575 delta += alen;
12577 goto decode_success;
12579 /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
12580 (lo 1/4 xmm). If E is mem, upper 3/4 of G is zeroed out. */
12581 if (haveF3no66noF2(pfx)
12582 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12583 modrm = getUChar(delta);
12584 if (epartIsReg(modrm)) {
12585 putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
12586 getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
12587 DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12588 nameXMMReg(gregOfRexRM(pfx,modrm)));
12589 delta += 1;
12590 } else {
12591 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12592 putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
12593 putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
12594 loadLE(Ity_I32, mkexpr(addr)) );
12595 DIP("movss %s,%s\n", dis_buf,
12596 nameXMMReg(gregOfRexRM(pfx,modrm)));
12597 delta += alen;
12599 goto decode_success;
12601 /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
12602 if (haveNo66noF2noF3(pfx)
12603 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12604 modrm = getUChar(delta);
12605 if (epartIsReg(modrm)) {
12606 putXMMReg( gregOfRexRM(pfx,modrm),
12607 getXMMReg( eregOfRexRM(pfx,modrm) ));
12608 DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12609 nameXMMReg(gregOfRexRM(pfx,modrm)));
12610 delta += 1;
12611 } else {
12612 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12613 putXMMReg( gregOfRexRM(pfx,modrm),
12614 loadLE(Ity_V128, mkexpr(addr)) );
12615 DIP("movups %s,%s\n", dis_buf,
12616 nameXMMReg(gregOfRexRM(pfx,modrm)));
12617 delta += alen;
12619 goto decode_success;
12621 break;
12623 case 0x11:
12624 /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
12625 or lo half xmm). */
12626 if (haveF2no66noF3(pfx)
12627 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12628 modrm = getUChar(delta);
12629 if (epartIsReg(modrm)) {
12630 putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
12631 getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
12632 DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12633 nameXMMReg(eregOfRexRM(pfx,modrm)));
12634 delta += 1;
12635 } else {
12636 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12637 storeLE( mkexpr(addr),
12638 getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
12639 DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12640 dis_buf);
12641 delta += alen;
12643 goto decode_success;
12645 /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
12646 or lo 1/4 xmm). */
12647 if (haveF3no66noF2(pfx) && sz == 4) {
12648 modrm = getUChar(delta);
12649 if (epartIsReg(modrm)) {
12650 /* fall through, we don't yet have a test case */
12651 } else {
12652 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12653 storeLE( mkexpr(addr),
12654 getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
12655 DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12656 dis_buf);
12657 delta += alen;
12658 goto decode_success;
12661 /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
12662 if (have66noF2noF3(pfx)
12663 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12664 modrm = getUChar(delta);
12665 if (epartIsReg(modrm)) {
12666 putXMMReg( eregOfRexRM(pfx,modrm),
12667 getXMMReg( gregOfRexRM(pfx,modrm) ) );
12668 DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12669 nameXMMReg(eregOfRexRM(pfx,modrm)));
12670 delta += 1;
12671 } else {
12672 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12673 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12674 DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12675 dis_buf );
12676 delta += alen;
12678 goto decode_success;
12680 /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
12681 if (haveNo66noF2noF3(pfx)
12682 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12683 modrm = getUChar(delta);
12684 if (epartIsReg(modrm)) {
12685 /* fall through; awaiting test case */
12686 } else {
12687 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12688 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12689 DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12690 dis_buf );
12691 delta += alen;
12692 goto decode_success;
12695 break;
12697 case 0x12:
12698 /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
12699 /* Identical to MOVLPS ? */
12700 if (have66noF2noF3(pfx)
12701 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12702 modrm = getUChar(delta);
12703 if (epartIsReg(modrm)) {
12704 /* fall through; apparently reg-reg is not possible */
12705 } else {
12706 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12707 delta += alen;
12708 putXMMRegLane64( gregOfRexRM(pfx,modrm),
12709 0/*lower lane*/,
12710 loadLE(Ity_I64, mkexpr(addr)) );
12711 DIP("movlpd %s, %s\n",
12712 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
12713 goto decode_success;
12716 /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
12717 /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
12718 if (haveNo66noF2noF3(pfx)
12719 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12720 modrm = getUChar(delta);
12721 if (epartIsReg(modrm)) {
12722 delta += 1;
12723 putXMMRegLane64( gregOfRexRM(pfx,modrm),
12724 0/*lower lane*/,
12725 getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
12726 DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12727 nameXMMReg(gregOfRexRM(pfx,modrm)));
12728 } else {
12729 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12730 delta += alen;
12731 putXMMRegLane64( gregOfRexRM(pfx,modrm), 0/*lower lane*/,
12732 loadLE(Ity_I64, mkexpr(addr)) );
12733 DIP("movlps %s, %s\n",
12734 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
12736 goto decode_success;
12738 break;
12740 case 0x13:
12741 /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
12742 if (haveNo66noF2noF3(pfx)
12743 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12744 modrm = getUChar(delta);
12745 if (!epartIsReg(modrm)) {
12746 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12747 delta += alen;
12748 storeLE( mkexpr(addr),
12749 getXMMRegLane64( gregOfRexRM(pfx,modrm),
12750 0/*lower lane*/ ) );
12751 DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12752 dis_buf);
12753 goto decode_success;
12755 /* else fall through */
12757 /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
12758 /* Identical to MOVLPS ? */
12759 if (have66noF2noF3(pfx)
12760 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12761 modrm = getUChar(delta);
12762 if (!epartIsReg(modrm)) {
12763 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12764 delta += alen;
12765 storeLE( mkexpr(addr),
12766 getXMMRegLane64( gregOfRexRM(pfx,modrm),
12767 0/*lower lane*/ ) );
12768 DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12769 dis_buf);
12770 goto decode_success;
12772 /* else fall through */
12774 break;
12776 case 0x14:
12777 case 0x15:
12778 /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
12779 /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
12780 /* These just appear to be special cases of SHUFPS */
12781 if (haveNo66noF2noF3(pfx) && sz == 4) {
12782 Bool hi = toBool(opc == 0x15);
12783 IRTemp sV = newTemp(Ity_V128);
12784 IRTemp dV = newTemp(Ity_V128);
12785 modrm = getUChar(delta);
12786 UInt rG = gregOfRexRM(pfx,modrm);
12787 assign( dV, getXMMReg(rG) );
12788 if (epartIsReg(modrm)) {
12789 UInt rE = eregOfRexRM(pfx,modrm);
12790 assign( sV, getXMMReg(rE) );
12791 delta += 1;
12792 DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12793 nameXMMReg(rE), nameXMMReg(rG));
12794 } else {
12795 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12796 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12797 delta += alen;
12798 DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12799 dis_buf, nameXMMReg(rG));
12801 IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
12802 putXMMReg( rG, mkexpr(res) );
12803 goto decode_success;
12805 /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
12806 /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
12807 /* These just appear to be special cases of SHUFPS */
12808 if (have66noF2noF3(pfx)
12809 && sz == 2 /* could be 8 if rex also present */) {
12810 Bool hi = toBool(opc == 0x15);
12811 IRTemp sV = newTemp(Ity_V128);
12812 IRTemp dV = newTemp(Ity_V128);
12813 modrm = getUChar(delta);
12814 UInt rG = gregOfRexRM(pfx,modrm);
12815 assign( dV, getXMMReg(rG) );
12816 if (epartIsReg(modrm)) {
12817 UInt rE = eregOfRexRM(pfx,modrm);
12818 assign( sV, getXMMReg(rE) );
12819 delta += 1;
12820 DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12821 nameXMMReg(rE), nameXMMReg(rG));
12822 } else {
12823 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12824 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12825 delta += alen;
12826 DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12827 dis_buf, nameXMMReg(rG));
12829 IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
12830 putXMMReg( rG, mkexpr(res) );
12831 goto decode_success;
12833 break;
12835 case 0x16:
12836 /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
12837 /* This seems identical to MOVHPS. This instruction encoding is
12838 completely crazy. */
12839 if (have66noF2noF3(pfx)
12840 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12841 modrm = getUChar(delta);
12842 if (epartIsReg(modrm)) {
12843 /* fall through; apparently reg-reg is not possible */
12844 } else {
12845 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12846 delta += alen;
12847 putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12848 loadLE(Ity_I64, mkexpr(addr)) );
12849 DIP("movhpd %s,%s\n", dis_buf,
12850 nameXMMReg( gregOfRexRM(pfx,modrm) ));
12851 goto decode_success;
12854 /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
12855 /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
12856 if (haveNo66noF2noF3(pfx)
12857 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12858 modrm = getUChar(delta);
12859 if (epartIsReg(modrm)) {
12860 delta += 1;
12861 putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12862 getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
12863 DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12864 nameXMMReg(gregOfRexRM(pfx,modrm)));
12865 } else {
12866 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12867 delta += alen;
12868 putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12869 loadLE(Ity_I64, mkexpr(addr)) );
12870 DIP("movhps %s,%s\n", dis_buf,
12871 nameXMMReg( gregOfRexRM(pfx,modrm) ));
12873 goto decode_success;
12875 break;
12877 case 0x17:
12878 /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
12879 if (haveNo66noF2noF3(pfx)
12880 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12881 modrm = getUChar(delta);
12882 if (!epartIsReg(modrm)) {
12883 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12884 delta += alen;
12885 storeLE( mkexpr(addr),
12886 getXMMRegLane64( gregOfRexRM(pfx,modrm),
12887 1/*upper lane*/ ) );
12888 DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12889 dis_buf);
12890 goto decode_success;
12892 /* else fall through */
12894 /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
12895 /* Again, this seems identical to MOVHPS. */
12896 if (have66noF2noF3(pfx)
12897 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12898 modrm = getUChar(delta);
12899 if (!epartIsReg(modrm)) {
12900 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12901 delta += alen;
12902 storeLE( mkexpr(addr),
12903 getXMMRegLane64( gregOfRexRM(pfx,modrm),
12904 1/*upper lane*/ ) );
12905 DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12906 dis_buf);
12907 goto decode_success;
12909 /* else fall through */
12911 break;
12913 case 0x18:
12914 /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
12915 /* 0F 18 /1 = PREFETCHT0 -- with various different hints */
12916 /* 0F 18 /2 = PREFETCHT1 */
12917 /* 0F 18 /3 = PREFETCHT2 */
12918 if (haveNo66noF2noF3(pfx)
12919 && !epartIsReg(getUChar(delta))
12920 && gregLO3ofRM(getUChar(delta)) >= 0
12921 && gregLO3ofRM(getUChar(delta)) <= 3) {
12922 const HChar* hintstr = "??";
12924 modrm = getUChar(delta);
12925 vassert(!epartIsReg(modrm));
12927 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12928 delta += alen;
12930 switch (gregLO3ofRM(modrm)) {
12931 case 0: hintstr = "nta"; break;
12932 case 1: hintstr = "t0"; break;
12933 case 2: hintstr = "t1"; break;
12934 case 3: hintstr = "t2"; break;
12935 default: vassert(0);
12938 DIP("prefetch%s %s\n", hintstr, dis_buf);
12939 goto decode_success;
12941 break;
12943 case 0x28:
12944 /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
12945 if (have66noF2noF3(pfx)
12946 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12947 modrm = getUChar(delta);
12948 if (epartIsReg(modrm)) {
12949 putXMMReg( gregOfRexRM(pfx,modrm),
12950 getXMMReg( eregOfRexRM(pfx,modrm) ));
12951 DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12952 nameXMMReg(gregOfRexRM(pfx,modrm)));
12953 delta += 1;
12954 } else {
12955 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12956 gen_SIGNAL_if_not_16_aligned( vbi, addr );
12957 putXMMReg( gregOfRexRM(pfx,modrm),
12958 loadLE(Ity_V128, mkexpr(addr)) );
12959 DIP("movapd %s,%s\n", dis_buf,
12960 nameXMMReg(gregOfRexRM(pfx,modrm)));
12961 delta += alen;
12963 goto decode_success;
12965 /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
12966 if (haveNo66noF2noF3(pfx)
12967 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12968 modrm = getUChar(delta);
12969 if (epartIsReg(modrm)) {
12970 putXMMReg( gregOfRexRM(pfx,modrm),
12971 getXMMReg( eregOfRexRM(pfx,modrm) ));
12972 DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12973 nameXMMReg(gregOfRexRM(pfx,modrm)));
12974 delta += 1;
12975 } else {
12976 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12977 gen_SIGNAL_if_not_16_aligned( vbi, addr );
12978 putXMMReg( gregOfRexRM(pfx,modrm),
12979 loadLE(Ity_V128, mkexpr(addr)) );
12980 DIP("movaps %s,%s\n", dis_buf,
12981 nameXMMReg(gregOfRexRM(pfx,modrm)));
12982 delta += alen;
12984 goto decode_success;
12986 break;
12988 case 0x29:
12989 /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
12990 if (haveNo66noF2noF3(pfx)
12991 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12992 modrm = getUChar(delta);
12993 if (epartIsReg(modrm)) {
12994 putXMMReg( eregOfRexRM(pfx,modrm),
12995 getXMMReg( gregOfRexRM(pfx,modrm) ));
12996 DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12997 nameXMMReg(eregOfRexRM(pfx,modrm)));
12998 delta += 1;
12999 } else {
13000 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13001 gen_SIGNAL_if_not_16_aligned( vbi, addr );
13002 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13003 DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13004 dis_buf );
13005 delta += alen;
13007 goto decode_success;
13009 /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
13010 if (have66noF2noF3(pfx)
13011 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13012 modrm = getUChar(delta);
13013 if (epartIsReg(modrm)) {
13014 putXMMReg( eregOfRexRM(pfx,modrm),
13015 getXMMReg( gregOfRexRM(pfx,modrm) ) );
13016 DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13017 nameXMMReg(eregOfRexRM(pfx,modrm)));
13018 delta += 1;
13019 } else {
13020 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13021 gen_SIGNAL_if_not_16_aligned( vbi, addr );
13022 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13023 DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13024 dis_buf );
13025 delta += alen;
13027 goto decode_success;
13029 break;
13031 case 0x2A:
13032 /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
13033 half xmm */
13034 if (haveNo66noF2noF3(pfx) && sz == 4) {
13035 IRTemp arg64 = newTemp(Ity_I64);
13036 IRTemp rmode = newTemp(Ity_I32);
13038 modrm = getUChar(delta);
13039 if (epartIsReg(modrm)) {
13040 /* Only switch to MMX mode if the source is an MMX register.
13041 See comments on CVTPI2PD for details. Fixes #357059. */
13042 do_MMX_preamble();
13043 assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
13044 delta += 1;
13045 DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13046 nameXMMReg(gregOfRexRM(pfx,modrm)));
13047 } else {
13048 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13049 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13050 delta += alen;
13051 DIP("cvtpi2ps %s,%s\n", dis_buf,
13052 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13055 assign( rmode, get_sse_roundingmode() );
13057 putXMMRegLane32F(
13058 gregOfRexRM(pfx,modrm), 0,
13059 binop(Iop_F64toF32,
13060 mkexpr(rmode),
13061 unop(Iop_I32StoF64,
13062 unop(Iop_64to32, mkexpr(arg64)) )) );
13064 putXMMRegLane32F(
13065 gregOfRexRM(pfx,modrm), 1,
13066 binop(Iop_F64toF32,
13067 mkexpr(rmode),
13068 unop(Iop_I32StoF64,
13069 unop(Iop_64HIto32, mkexpr(arg64)) )) );
13071 goto decode_success;
13073 /* F3 0F 2A = CVTSI2SS
13074 -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
13075 -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
13076 if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
13077 IRTemp rmode = newTemp(Ity_I32);
13078 assign( rmode, get_sse_roundingmode() );
13079 modrm = getUChar(delta);
13080 if (sz == 4) {
13081 IRTemp arg32 = newTemp(Ity_I32);
13082 if (epartIsReg(modrm)) {
13083 assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
13084 delta += 1;
13085 DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13086 nameXMMReg(gregOfRexRM(pfx,modrm)));
13087 } else {
13088 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13089 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
13090 delta += alen;
13091 DIP("cvtsi2ss %s,%s\n", dis_buf,
13092 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13094 putXMMRegLane32F(
13095 gregOfRexRM(pfx,modrm), 0,
13096 binop(Iop_F64toF32,
13097 mkexpr(rmode),
13098 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
13099 } else {
13100 /* sz == 8 */
13101 IRTemp arg64 = newTemp(Ity_I64);
13102 if (epartIsReg(modrm)) {
13103 assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
13104 delta += 1;
13105 DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13106 nameXMMReg(gregOfRexRM(pfx,modrm)));
13107 } else {
13108 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13109 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13110 delta += alen;
13111 DIP("cvtsi2ssq %s,%s\n", dis_buf,
13112 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13114 putXMMRegLane32F(
13115 gregOfRexRM(pfx,modrm), 0,
13116 binop(Iop_F64toF32,
13117 mkexpr(rmode),
13118 binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
13120 goto decode_success;
13122 /* F2 0F 2A = CVTSI2SD
13123 when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
13124 when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
13126 if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
13127 modrm = getUChar(delta);
13128 if (sz == 4) {
13129 IRTemp arg32 = newTemp(Ity_I32);
13130 if (epartIsReg(modrm)) {
13131 assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
13132 delta += 1;
13133 DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13134 nameXMMReg(gregOfRexRM(pfx,modrm)));
13135 } else {
13136 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13137 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
13138 delta += alen;
13139 DIP("cvtsi2sdl %s,%s\n", dis_buf,
13140 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13142 putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
13143 unop(Iop_I32StoF64, mkexpr(arg32))
13145 } else {
13146 /* sz == 8 */
13147 IRTemp arg64 = newTemp(Ity_I64);
13148 if (epartIsReg(modrm)) {
13149 assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
13150 delta += 1;
13151 DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13152 nameXMMReg(gregOfRexRM(pfx,modrm)));
13153 } else {
13154 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13155 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13156 delta += alen;
13157 DIP("cvtsi2sdq %s,%s\n", dis_buf,
13158 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13160 putXMMRegLane64F(
13161 gregOfRexRM(pfx,modrm),
13163 binop( Iop_I64StoF64,
13164 get_sse_roundingmode(),
13165 mkexpr(arg64)
13169 goto decode_success;
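         /* Note the asymmetry in the two CVTSI2SD cases above: a
            32-bit integer always converts to F64 exactly, so no
            rounding mode is needed, whereas the 64-bit form can lose
            precision and therefore converts under the current SSE
            rounding mode. */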
13171 /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
13172 xmm(G) */
13173 if (have66noF2noF3(pfx) && sz == 2) {
13174 IRTemp arg64 = newTemp(Ity_I64);
13176 modrm = getUChar(delta);
13177 if (epartIsReg(modrm)) {
13178 /* Only switch to MMX mode if the source is an MMX register.
13179 This is inconsistent with all other instructions which
13180 convert between XMM and (M64 or MMX), which always switch
13181 to MMX mode even if 64-bit operand is M64 and not MMX. At
13182 least, that's what the Intel docs seem to me to say.
13183 Fixes #210264. */
13184 do_MMX_preamble();
13185 assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
13186 delta += 1;
13187 DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13188 nameXMMReg(gregOfRexRM(pfx,modrm)));
13189 } else {
13190 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13191 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13192 delta += alen;
13193 DIP("cvtpi2pd %s,%s\n", dis_buf,
13194 nameXMMReg(gregOfRexRM(pfx,modrm)) );
13197 putXMMRegLane64F(
13198 gregOfRexRM(pfx,modrm), 0,
13199 unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
13202 putXMMRegLane64F(
13203 gregOfRexRM(pfx,modrm), 1,
13204 unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
13207 goto decode_success;
13209 break;
13211 case 0x2B:
13212 /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
13213 /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
13214 if ( (haveNo66noF2noF3(pfx) && sz == 4)
13215 || (have66noF2noF3(pfx) && sz == 2) ) {
13216 modrm = getUChar(delta);
13217 if (!epartIsReg(modrm)) {
13218 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13219 gen_SIGNAL_if_not_16_aligned( vbi, addr );
13220 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13221 DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
13222 dis_buf,
13223 nameXMMReg(gregOfRexRM(pfx,modrm)));
13224 delta += alen;
13225 goto decode_success;
13227 /* else fall through */
13229 break;
13231 case 0x2C:
13232 case 0x2D:
13233 /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
13234 I32 in mmx, according to prevailing SSE rounding mode */
13235 /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
13236 I32 in mmx, rounding towards zero */
13237 if (haveNo66noF2noF3(pfx) && sz == 4) {
13238 IRTemp dst64 = newTemp(Ity_I64);
13239 IRTemp rmode = newTemp(Ity_I32);
13240 IRTemp f32lo = newTemp(Ity_F32);
13241 IRTemp f32hi = newTemp(Ity_F32);
13242 Bool r2zero = toBool(opc == 0x2C);
13244 do_MMX_preamble();
13245 modrm = getUChar(delta);
13247 if (epartIsReg(modrm)) {
13248 delta += 1;
13249 assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
13250 assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
13251 DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
13252 nameXMMReg(eregOfRexRM(pfx,modrm)),
13253 nameMMXReg(gregLO3ofRM(modrm)));
13254 } else {
13255 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13256 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
13257 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
13258 mkexpr(addr),
13259 mkU64(4) )));
13260 delta += alen;
13261 DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
13262 dis_buf,
13263 nameMMXReg(gregLO3ofRM(modrm)));
13266 if (r2zero) {
13267 assign(rmode, mkU32((UInt)Irrm_ZERO) );
13268 } else {
13269 assign( rmode, get_sse_roundingmode() );
13272 assign(
13273 dst64,
13274 binop( Iop_32HLto64,
13275 binop( Iop_F64toI32S,
13276 mkexpr(rmode),
13277 unop( Iop_F32toF64, mkexpr(f32hi) ) ),
13278 binop( Iop_F64toI32S,
13279 mkexpr(rmode),
13280 unop( Iop_F32toF64, mkexpr(f32lo) ) )
13284 putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
13285 goto decode_success;
13287 /* F3 0F 2D = CVTSS2SI
13288 when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
13289 according to prevailing SSE rounding mode
13290 when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
13291 according to prevailing SSE rounding mode */
13293 /* F3 0F 2C = CVTTSS2SI
13294 when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
13295 truncating towards zero
13296 when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
13297 truncating towards zero */
13299 if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
13300 delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
13301 goto decode_success;
13303 /* F2 0F 2D = CVTSD2SI
13304 when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
13305 according to prevailing SSE rounding mode
13306 when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
13307 according to prevailing SSE rounding mode */
13309 /* F2 0F 2C = CVTTSD2SI
13310 when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
13311 truncating towards zero
13312 when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
13313 truncating towards zero */
13315 if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
13316 delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
13317 goto decode_success;
13319 /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
13320 I32 in mmx, according to prevailing SSE rounding mode */
13321 /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
13322 I32 in mmx, rounding towards zero */
13323 if (have66noF2noF3(pfx) && sz == 2) {
13324 IRTemp dst64 = newTemp(Ity_I64);
13325 IRTemp rmode = newTemp(Ity_I32);
13326 IRTemp f64lo = newTemp(Ity_F64);
13327 IRTemp f64hi = newTemp(Ity_F64);
13328 Bool r2zero = toBool(opc == 0x2C);
13330 do_MMX_preamble();
13331 modrm = getUChar(delta);
13333 if (epartIsReg(modrm)) {
13334 delta += 1;
13335 assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
13336 assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
13337 DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
13338 nameXMMReg(eregOfRexRM(pfx,modrm)),
13339 nameMMXReg(gregLO3ofRM(modrm)));
13340 } else {
13341 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13342 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
13343 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
13344 mkexpr(addr),
13345 mkU64(8) )));
13346 delta += alen;
13347 DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
13348 dis_buf,
13349 nameMMXReg(gregLO3ofRM(modrm)));
13352 if (r2zero) {
13353 assign(rmode, mkU32((UInt)Irrm_ZERO) );
13354 } else {
13355 assign( rmode, get_sse_roundingmode() );
13358 assign(
13359 dst64,
13360 binop( Iop_32HLto64,
13361 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
13362 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
13366 putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
13367 goto decode_success;
13369 break;
13371 case 0x2E:
13372 case 0x2F:
13373 /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
13374 /* 66 0F 2F = COMISD -- 64F0x2 comparison G,E, and set ZCP */
13375 if (have66noF2noF3(pfx) && sz == 2) {
13376 delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
13377 goto decode_success;
13379 /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
13380 /* 0F 2F = COMISS -- 32F0x4 comparison G,E, and set ZCP */
13381 if (haveNo66noF2noF3(pfx) && sz == 4) {
13382 delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
13383 goto decode_success;
13385 break;
13387 case 0x50:
13388 /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
13389 to 4 lowest bits of ireg(G) */
13390 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
13391 && epartIsReg(getUChar(delta))) {
13392 /* sz == 8 is a kludge to handle insns with REX.W redundantly
13393 set to 1, which has been known to happen:
13395 4c 0f 50 d9 rex64X movmskps %xmm1,%r11d
13397 20071106: Intel docs say that REX.W isn't redundant: when
13398 present, a 64-bit register is written; when not present, only
13399 the 32-bit half is written. However, testing on a Core2
13400 machine suggests the entire 64 bit register is written
13401 irrespective of the status of REX.W. That could be because
13402 of the default rule that says "if the lower half of a 32-bit
13403 register is written, the upper half is zeroed". By using
13404 putIReg32 here we inadvertently produce the same behaviour as
13405 the Core2, for the same reason -- putIReg32 implements said
13406 rule.
13408 AMD docs give no indication that REX.W is even valid for this
13409 insn. */
13410 delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
13411 goto decode_success;
13413 /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
13414 2 lowest bits of ireg(G) */
13415 if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
13416 /* sz == 8 is a kludge to handle insns with REX.W redundantly
13417 set to 1, which has been known to happen:
13418 66 4c 0f 50 d9 rex64X movmskpd %xmm1,%r11d
13419 20071106: see further comments on MOVMSKPS implementation above. */
13421 delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
13422 goto decode_success;
13424 break;
13426 case 0x51:
13427 /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
13428 if (haveF3no66noF2(pfx) && sz == 4) {
13429 delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13430 "sqrtss", Iop_Sqrt32F0x4 );
13431 goto decode_success;
13433 /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
13434 if (haveNo66noF2noF3(pfx) && sz == 4) {
13435 delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13436 "sqrtps", Iop_Sqrt32Fx4 );
13437 goto decode_success;
13439 /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
13440 if (haveF2no66noF3(pfx) && sz == 4) {
13441 delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
13442 "sqrtsd", Iop_Sqrt64F0x2 );
13443 goto decode_success;
13445 /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
13446 if (have66noF2noF3(pfx) && sz == 2) {
13447 delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13448 "sqrtpd", Iop_Sqrt64Fx2 );
13449 goto decode_success;
13451 break;
13453 case 0x52:
13454 /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
13455 if (haveF3no66noF2(pfx) && sz == 4) {
13456 delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13457 "rsqrtss", Iop_RSqrtEst32F0x4 );
13458 goto decode_success;
13460 /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
13461 if (haveNo66noF2noF3(pfx) && sz == 4) {
13462 delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13463 "rsqrtps", Iop_RSqrtEst32Fx4 );
13464 goto decode_success;
13466 break;
13468 case 0x53:
13469 /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
13470 if (haveF3no66noF2(pfx) && sz == 4) {
13471 delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13472 "rcpss", Iop_RecipEst32F0x4 );
13473 goto decode_success;
13475 /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
13476 if (haveNo66noF2noF3(pfx) && sz == 4) {
13477 delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13478 "rcpps", Iop_RecipEst32Fx4 );
13479 goto decode_success;
13481 break;
13483 case 0x54:
13484 /* 0F 54 = ANDPS -- G = G and E */
13485 if (haveNo66noF2noF3(pfx) && sz == 4) {
13486 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
13487 goto decode_success;
13489 /* 66 0F 54 = ANDPD -- G = G and E */
13490 if (have66noF2noF3(pfx) && sz == 2) {
13491 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
13492 goto decode_success;
13494 break;
13496 case 0x55:
13497 /* 0F 55 = ANDNPS -- G = (not G) and E */
13498 if (haveNo66noF2noF3(pfx) && sz == 4) {
13499 delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
13500 Iop_AndV128 );
13501 goto decode_success;
13503 /* 66 0F 55 = ANDNPD -- G = (not G) and E */
13504 if (have66noF2noF3(pfx) && sz == 2) {
13505 delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
13506 Iop_AndV128 );
13507 goto decode_success;
13509 break;
13511 case 0x56:
13512 /* 0F 56 = ORPS -- G = G or E */
13513 if (haveNo66noF2noF3(pfx) && sz == 4) {
13514 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
13515 goto decode_success;
13517 /* 66 0F 56 = ORPD -- G = G or E */
13518 if (have66noF2noF3(pfx) && sz == 2) {
13519 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
13520 goto decode_success;
13522 break;
13524 case 0x57:
13525 /* 66 0F 57 = XORPD -- G = G xor E */
13526 if (have66noF2noF3(pfx) && sz == 2) {
13527 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
13528 goto decode_success;
13530 /* 0F 57 = XORPS -- G = G xor E */
13531 if (haveNo66noF2noF3(pfx) && sz == 4) {
13532 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
13533 goto decode_success;
13535 break;
13537 case 0x58:
13538 /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
13539 if (haveNo66noF2noF3(pfx) && sz == 4) {
13540 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
13541 goto decode_success;
13543 /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
13544 if (haveF3no66noF2(pfx) && sz == 4) {
13545 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
13546 goto decode_success;
13548 /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
13549 if (haveF2no66noF3(pfx)
13550 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13551 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
13552 goto decode_success;
13554 /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
13555 if (have66noF2noF3(pfx)
13556 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13557 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
13558 goto decode_success;
13560 break;
13562 case 0x59:
13563 /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
13564 if (haveF2no66noF3(pfx)
13565 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13566 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
13567 goto decode_success;
13569 /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
13570 if (haveF3no66noF2(pfx) && sz == 4) {
13571 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
13572 goto decode_success;
13574 /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
13575 if (haveNo66noF2noF3(pfx) && sz == 4) {
13576 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
13577 goto decode_success;
13579 /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
13580 if (have66noF2noF3(pfx)
13581 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13582 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
13583 goto decode_success;
13585 break;
13587 case 0x5A:
13588 /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
13589 F64 in xmm(G). */
13590 if (haveNo66noF2noF3(pfx)
13591 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13592 delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
13593 goto decode_success;
13595 /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
13596 low half xmm(G) */
13597 if (haveF3no66noF2(pfx) && sz == 4) {
13598 IRTemp f32lo = newTemp(Ity_F32);
13600 modrm = getUChar(delta);
13601 if (epartIsReg(modrm)) {
13602 delta += 1;
13603 assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
13604 DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13605 nameXMMReg(gregOfRexRM(pfx,modrm)));
13606 } else {
13607 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13608 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
13609 delta += alen;
13610 DIP("cvtss2sd %s,%s\n", dis_buf,
13611 nameXMMReg(gregOfRexRM(pfx,modrm)));
13614 putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
13615 unop( Iop_F32toF64, mkexpr(f32lo) ) );
13617 goto decode_success;
13619 /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
13620 low 1/4 xmm(G), according to prevailing SSE rounding mode */
13621 if (haveF2no66noF3(pfx) && sz == 4) {
13622 IRTemp rmode = newTemp(Ity_I32);
13623 IRTemp f64lo = newTemp(Ity_F64);
13625 modrm = getUChar(delta);
13626 if (epartIsReg(modrm)) {
13627 delta += 1;
13628 assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
13629 DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13630 nameXMMReg(gregOfRexRM(pfx,modrm)));
13631 } else {
13632 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13633 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
13634 delta += alen;
13635 DIP("cvtsd2ss %s,%s\n", dis_buf,
13636 nameXMMReg(gregOfRexRM(pfx,modrm)));
13639 assign( rmode, get_sse_roundingmode() );
13640 putXMMRegLane32F(
13641 gregOfRexRM(pfx,modrm), 0,
13642 binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
13645 goto decode_success;
13647 /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
13648 lo half xmm(G), rounding according to prevailing SSE rounding
13649 mode, and zero upper half */
13650 /* Note, this is practically identical to CVTPD2DQ. It would be
13651 nice to merge them together. */
13652 if (have66noF2noF3(pfx) && sz == 2) {
13653 delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
13654 goto decode_success;
13656 break;
13658 case 0x5B:
13659 /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
13660 xmm(G), rounding towards zero */
13661 /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
13662 xmm(G), as per the prevailing rounding mode */
13663 if ( (have66noF2noF3(pfx) && sz == 2)
13664 || (haveF3no66noF2(pfx) && sz == 4) ) {
13665 Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
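/* Per the guard above, sz == 4 means the F3 form (CVTTPS2DQ), which
   truncates, while sz == 2 means the 66 form, which uses the
   prevailing SSE rounding mode. */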
13666 delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
13667 goto decode_success;
13669 /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
13670 xmm(G) */
13671 if (haveNo66noF2noF3(pfx) && sz == 4) {
13672 delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
13673 goto decode_success;
13675 break;
13677 case 0x5C:
13678 /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
13679 if (haveF3no66noF2(pfx) && sz == 4) {
13680 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
13681 goto decode_success;
13683 /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
13684 if (haveF2no66noF3(pfx)
13685 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13686 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
13687 goto decode_success;
13689 /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
13690 if (haveNo66noF2noF3(pfx) && sz == 4) {
13691 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
13692 goto decode_success;
13694 /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
13695 if (have66noF2noF3(pfx) && sz == 2) {
13696 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
13697 goto decode_success;
13699 break;
13701 case 0x5D:
13702 /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
13703 if (haveNo66noF2noF3(pfx) && sz == 4) {
13704 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
13705 goto decode_success;
13707 /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
13708 if (haveF3no66noF2(pfx) && sz == 4) {
13709 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
13710 goto decode_success;
13712 /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
13713 if (haveF2no66noF3(pfx)
13714 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13715 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
13716 goto decode_success;
13718 /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
13719 if (have66noF2noF3(pfx) && sz == 2) {
13720 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
13721 goto decode_success;
13723 break;
13725 case 0x5E:
13726 /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
13727 if (haveF2no66noF3(pfx) && sz == 4) {
13728 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
13729 goto decode_success;
13731 /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
13732 if (haveNo66noF2noF3(pfx) && sz == 4) {
13733 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
13734 goto decode_success;
13736 /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
13737 if (haveF3no66noF2(pfx) && sz == 4) {
13738 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
13739 goto decode_success;
13741 /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
13742 if (have66noF2noF3(pfx) && sz == 2) {
13743 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
13744 goto decode_success;
13746 break;
13748 case 0x5F:
13749 /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
13750 if (haveNo66noF2noF3(pfx) && sz == 4) {
13751 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
13752 goto decode_success;
13754 /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
13755 if (haveF3no66noF2(pfx) && sz == 4) {
13756 delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
13757 goto decode_success;
13759 /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
13760 if (haveF2no66noF3(pfx)
13761 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13762 delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
13763 goto decode_success;
13765 /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
13766 if (have66noF2noF3(pfx) && sz == 2) {
13767 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
13768 goto decode_success;
13770 break;
13772 case 0x60:
13773 /* 66 0F 60 = PUNPCKLBW */
13774 if (have66noF2noF3(pfx) && sz == 2) {
13775 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13776 "punpcklbw",
13777 Iop_InterleaveLO8x16, True );
13778 goto decode_success;
13780 break;
13782 case 0x61:
13783 /* 66 0F 61 = PUNPCKLWD */
13784 if (have66noF2noF3(pfx) && sz == 2) {
13785 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13786 "punpcklwd",
13787 Iop_InterleaveLO16x8, True );
13788 goto decode_success;
13790 break;
13792 case 0x62:
13793 /* 66 0F 62 = PUNPCKLDQ */
13794 if (have66noF2noF3(pfx) && sz == 2) {
13795 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13796 "punpckldq",
13797 Iop_InterleaveLO32x4, True );
13798 goto decode_success;
13800 break;
13802 case 0x63:
13803 /* 66 0F 63 = PACKSSWB */
13804 if (have66noF2noF3(pfx) && sz == 2) {
13805 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13806 "packsswb",
13807 Iop_QNarrowBin16Sto8Sx16, True );
13808 goto decode_success;
13810 break;
13812 case 0x64:
13813 /* 66 0F 64 = PCMPGTB */
13814 if (have66noF2noF3(pfx) && sz == 2) {
13815 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13816 "pcmpgtb", Iop_CmpGT8Sx16, False );
13817 goto decode_success;
13819 break;
13821 case 0x65:
13822 /* 66 0F 65 = PCMPGTW */
13823 if (have66noF2noF3(pfx) && sz == 2) {
13824 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13825 "pcmpgtw", Iop_CmpGT16Sx8, False );
13826 goto decode_success;
13828 break;
13830 case 0x66:
13831 /* 66 0F 66 = PCMPGTD */
13832 if (have66noF2noF3(pfx) && sz == 2) {
13833 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13834 "pcmpgtd", Iop_CmpGT32Sx4, False );
13835 goto decode_success;
13837 break;
13839 case 0x67:
13840 /* 66 0F 67 = PACKUSWB */
13841 if (have66noF2noF3(pfx) && sz == 2) {
13842 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13843 "packuswb",
13844 Iop_QNarrowBin16Sto8Ux16, True );
13845 goto decode_success;
13847 break;
13849 case 0x68:
13850 /* 66 0F 68 = PUNPCKHBW */
13851 if (have66noF2noF3(pfx) && sz == 2) {
13852 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13853 "punpckhbw",
13854 Iop_InterleaveHI8x16, True );
13855 goto decode_success;
13857 break;
13859 case 0x69:
13860 /* 66 0F 69 = PUNPCKHWD */
13861 if (have66noF2noF3(pfx) && sz == 2) {
13862 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13863 "punpckhwd",
13864 Iop_InterleaveHI16x8, True );
13865 goto decode_success;
13867 break;
13869 case 0x6A:
13870 /* 66 0F 6A = PUNPCKHDQ */
13871 if (have66noF2noF3(pfx) && sz == 2) {
13872 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13873 "punpckhdq",
13874 Iop_InterleaveHI32x4, True );
13875 goto decode_success;
13877 break;
13879 case 0x6B:
13880 /* 66 0F 6B = PACKSSDW */
13881 if (have66noF2noF3(pfx) && sz == 2) {
13882 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13883 "packssdw",
13884 Iop_QNarrowBin32Sto16Sx8, True );
13885 goto decode_success;
13887 break;
13889 case 0x6C:
13890 /* 66 0F 6C = PUNPCKLQDQ */
13891 if (have66noF2noF3(pfx) && sz == 2) {
13892 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13893 "punpcklqdq",
13894 Iop_InterleaveLO64x2, True );
13895 goto decode_success;
13897 break;
13899 case 0x6D:
13900 /* 66 0F 6D = PUNPCKHQDQ */
13901 if (have66noF2noF3(pfx) && sz == 2) {
13902 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13903 "punpckhqdq",
13904 Iop_InterleaveHI64x2, True );
13905 goto decode_success;
13907 break;
13909 case 0x6E:
13910 /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
13911 zeroing high 3/4 of xmm. */
13912 /* or from ireg64/m64 to xmm lo 1/2,
13913 zeroing high 1/2 of xmm. */
13914 if (have66noF2noF3(pfx)) {
13915 vassert(sz == 2 || sz == 8);
13916 if (sz == 2) sz = 4;
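/* sz == 2 merely reflects the mandatory 0x66 prefix; the data size
   is really 32 bits unless REX.W is set (sz == 8), so normalise it
   to 4 here. */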
13917 modrm = getUChar(delta);
13918 if (epartIsReg(modrm)) {
13919 delta += 1;
13920 if (sz == 4) {
13921 putXMMReg(
13922 gregOfRexRM(pfx,modrm),
13923 unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
13925 DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13926 nameXMMReg(gregOfRexRM(pfx,modrm)));
13927 } else {
13928 putXMMReg(
13929 gregOfRexRM(pfx,modrm),
13930 unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
13932 DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13933 nameXMMReg(gregOfRexRM(pfx,modrm)));
13935 } else {
13936 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13937 delta += alen;
13938 putXMMReg(
13939 gregOfRexRM(pfx,modrm),
13940 sz == 4
13941 ? unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
13942 : unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
13944 DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
13945 nameXMMReg(gregOfRexRM(pfx,modrm)));
13947 goto decode_success;
13949 break;
13951 case 0x6F:
13952 if (have66noF2noF3(pfx)
13953 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13954 /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
13955 modrm = getUChar(delta);
13956 if (epartIsReg(modrm)) {
13957 putXMMReg( gregOfRexRM(pfx,modrm),
13958 getXMMReg( eregOfRexRM(pfx,modrm) ));
13959 DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13960 nameXMMReg(gregOfRexRM(pfx,modrm)));
13961 delta += 1;
13962 } else {
13963 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13964 gen_SIGNAL_if_not_16_aligned( vbi, addr );
13965 putXMMReg( gregOfRexRM(pfx,modrm),
13966 loadLE(Ity_V128, mkexpr(addr)) );
13967 DIP("movdqa %s,%s\n", dis_buf,
13968 nameXMMReg(gregOfRexRM(pfx,modrm)));
13969 delta += alen;
13971 goto decode_success;
13973 if (haveF3no66noF2(pfx)
13974 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13975 /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
13976 modrm = getUChar(delta);
13977 if (epartIsReg(modrm)) {
13978 putXMMReg( gregOfRexRM(pfx,modrm),
13979 getXMMReg( eregOfRexRM(pfx,modrm) ));
13980 DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13981 nameXMMReg(gregOfRexRM(pfx,modrm)));
13982 delta += 1;
13983 } else {
13984 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13985 putXMMReg( gregOfRexRM(pfx,modrm),
13986 loadLE(Ity_V128, mkexpr(addr)) );
13987 DIP("movdqu %s,%s\n", dis_buf,
13988 nameXMMReg(gregOfRexRM(pfx,modrm)));
13989 delta += alen;
13991 goto decode_success;
13993 break;
13995 case 0x70:
13996 /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
13997 if (have66noF2noF3(pfx) && sz == 2) {
13998 delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
13999 goto decode_success;
14001 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14002 /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
14003 if (haveNo66noF2noF3(pfx) && sz == 4) {
14004 Int order;
14005 IRTemp sV, dV, s3, s2, s1, s0;
14006 s3 = s2 = s1 = s0 = IRTemp_INVALID;
14007 sV = newTemp(Ity_I64);
14008 dV = newTemp(Ity_I64);
14009 do_MMX_preamble();
14010 modrm = getUChar(delta);
14011 if (epartIsReg(modrm)) {
14012 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14013 order = (Int)getUChar(delta+1);
14014 delta += 1+1;
14015 DIP("pshufw $%d,%s,%s\n", order,
14016 nameMMXReg(eregLO3ofRM(modrm)),
14017 nameMMXReg(gregLO3ofRM(modrm)));
14018 } else {
14019 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
14020 1/*extra byte after amode*/ );
14021 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14022 order = (Int)getUChar(delta+alen);
14023 delta += 1+alen;
14024 DIP("pshufw $%d,%s,%s\n", order,
14025 dis_buf,
14026 nameMMXReg(gregLO3ofRM(modrm)));
14028 breakup64to16s( sV, &s3, &s2, &s1, &s0 );
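/* Each 2-bit field of 'order' selects the source lane for one
   destination lane: bits 1:0 pick dest lane 0, ..., bits 7:6 pick
   dest lane 3. Hence an immediate of 0xE4 is the identity shuffle. */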
14029 # define SEL(n) \
14030 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
14031 assign(dV,
14032 mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
14033 SEL((order>>2)&3), SEL((order>>0)&3) )
14035 putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
14036 # undef SEL
14037 goto decode_success;
14039 /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
14040 mem) to G(xmm), and copy upper half */
14041 if (haveF2no66noF3(pfx) && sz == 4) {
14042 delta = dis_PSHUFxW_128( vbi, pfx, delta,
14043 False/*!isAvx*/, False/*!xIsH*/ );
14044 goto decode_success;
14046 /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
14047 mem) to G(xmm), and copy lower half */
14048 if (haveF3no66noF2(pfx) && sz == 4) {
14049 delta = dis_PSHUFxW_128( vbi, pfx, delta,
14050 False/*!isAvx*/, True/*xIsH*/ );
14051 goto decode_success;
14053 break;
14055 case 0x71:
14056 /* 66 0F 71 /2 ib = PSRLW by immediate */
14057 if (have66noF2noF3(pfx) && sz == 2
14058 && epartIsReg(getUChar(delta))
14059 && gregLO3ofRM(getUChar(delta)) == 2) {
14060 delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
14061 goto decode_success;
14063 /* 66 0F 71 /4 ib = PSRAW by immediate */
14064 if (have66noF2noF3(pfx) && sz == 2
14065 && epartIsReg(getUChar(delta))
14066 && gregLO3ofRM(getUChar(delta)) == 4) {
14067 delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
14068 goto decode_success;
14070 /* 66 0F 71 /6 ib = PSLLW by immediate */
14071 if (have66noF2noF3(pfx) && sz == 2
14072 && epartIsReg(getUChar(delta))
14073 && gregLO3ofRM(getUChar(delta)) == 6) {
14074 delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
14075 goto decode_success;
14077 break;
14079 case 0x72:
14080 /* 66 0F 72 /2 ib = PSRLD by immediate */
14081 if (have66noF2noF3(pfx) && sz == 2
14082 && epartIsReg(getUChar(delta))
14083 && gregLO3ofRM(getUChar(delta)) == 2) {
14084 delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
14085 goto decode_success;
14087 /* 66 0F 72 /4 ib = PSRAD by immediate */
14088 if (have66noF2noF3(pfx) && sz == 2
14089 && epartIsReg(getUChar(delta))
14090 && gregLO3ofRM(getUChar(delta)) == 4) {
14091 delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
14092 goto decode_success;
14094 /* 66 0F 72 /6 ib = PSLLD by immediate */
14095 if (have66noF2noF3(pfx) && sz == 2
14096 && epartIsReg(getUChar(delta))
14097 && gregLO3ofRM(getUChar(delta)) == 6) {
14098 delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
14099 goto decode_success;
14101 break;
14103 case 0x73:
14104 /* 66 0F 73 /3 ib = PSRLDQ by immediate */
14105 /* note, if mem case ever filled in, 1 byte after amode */
14106 if (have66noF2noF3(pfx) && sz == 2
14107 && epartIsReg(getUChar(delta))
14108 && gregLO3ofRM(getUChar(delta)) == 3) {
14109 Int imm = (Int)getUChar(delta+1);
14110 Int reg = eregOfRexRM(pfx,getUChar(delta));
14111 DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
14112 delta += 2;
14113 IRTemp sV = newTemp(Ity_V128);
14114 assign( sV, getXMMReg(reg) );
14115 putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
14116 goto decode_success;
14118 /* 66 0F 73 /7 ib = PSLLDQ by immediate */
14119 /* note, if mem case ever filled in, 1 byte after amode */
14120 if (have66noF2noF3(pfx) && sz == 2
14121 && epartIsReg(getUChar(delta))
14122 && gregLO3ofRM(getUChar(delta)) == 7) {
14123 Int imm = (Int)getUChar(delta+1);
14124 Int reg = eregOfRexRM(pfx,getUChar(delta));
14125 DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
14126 vassert(imm >= 0 && imm <= 255);
14127 delta += 2;
14128 IRTemp sV = newTemp(Ity_V128);
14129 assign( sV, getXMMReg(reg) );
14130 putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
14131 goto decode_success;
14133 /* 66 0F 73 /2 ib = PSRLQ by immediate */
14134 if (have66noF2noF3(pfx) && sz == 2
14135 && epartIsReg(getUChar(delta))
14136 && gregLO3ofRM(getUChar(delta)) == 2) {
14137 delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
14138 goto decode_success;
14140 /* 66 0F 73 /6 ib = PSLLQ by immediate */
14141 if (have66noF2noF3(pfx) && sz == 2
14142 && epartIsReg(getUChar(delta))
14143 && gregLO3ofRM(getUChar(delta)) == 6) {
14144 delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
14145 goto decode_success;
14147 break;
14149 case 0x74:
14150 /* 66 0F 74 = PCMPEQB */
14151 if (have66noF2noF3(pfx) && sz == 2) {
14152 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14153 "pcmpeqb", Iop_CmpEQ8x16, False );
14154 goto decode_success;
14156 break;
14158 case 0x75:
14159 /* 66 0F 75 = PCMPEQW */
14160 if (have66noF2noF3(pfx) && sz == 2) {
14161 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14162 "pcmpeqw", Iop_CmpEQ16x8, False );
14163 goto decode_success;
14165 break;
14167 case 0x76:
14168 /* 66 0F 76 = PCMPEQD */
14169 if (have66noF2noF3(pfx) && sz == 2) {
14170 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14171 "pcmpeqd", Iop_CmpEQ32x4, False );
14172 goto decode_success;
14174 break;
14176 case 0x7E:
14177 /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
14178 G (lo half xmm). Upper half of G is zeroed out. */
14179 if (haveF3no66noF2(pfx)
14180 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14181 modrm = getUChar(delta);
14182 if (epartIsReg(modrm)) {
14183 putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
14184 getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
14185 /* zero bits 127:64 */
14186 putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
14187 DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14188 nameXMMReg(gregOfRexRM(pfx,modrm)));
14189 delta += 1;
14190 } else {
14191 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14192 putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
14193 putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
14194 loadLE(Ity_I64, mkexpr(addr)) );
14195 DIP("movsd %s,%s\n", dis_buf,
14196 nameXMMReg(gregOfRexRM(pfx,modrm)));
14197 delta += alen;
14199 goto decode_success;
14201 /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
14202 /* or from xmm low 1/2 to ireg64 or m64. */
14203 if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
14204 if (sz == 2) sz = 4;
14205 modrm = getUChar(delta);
14206 if (epartIsReg(modrm)) {
14207 delta += 1;
14208 if (sz == 4) {
14209 putIReg32( eregOfRexRM(pfx,modrm),
14210 getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
14211 DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14212 nameIReg32(eregOfRexRM(pfx,modrm)));
14213 } else {
14214 putIReg64( eregOfRexRM(pfx,modrm),
14215 getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
14216 DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14217 nameIReg64(eregOfRexRM(pfx,modrm)));
14219 } else {
14220 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14221 delta += alen;
14222 storeLE( mkexpr(addr),
14223 sz == 4
14224 ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
14225 : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
14226 DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
14227 nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14229 goto decode_success;
14231 break;
14233 case 0x7F:
14234 /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
14235 if (haveF3no66noF2(pfx)
14236 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14237 modrm = getUChar(delta);
14238 if (epartIsReg(modrm)) {
14239 goto decode_failure; /* awaiting test case */
14240 delta += 1;
14241 putXMMReg( eregOfRexRM(pfx,modrm),
14242 getXMMReg(gregOfRexRM(pfx,modrm)) );
14243 DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14244 nameXMMReg(eregOfRexRM(pfx,modrm)));
14245 } else {
14246 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14247 delta += alen;
14248 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14249 DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14251 goto decode_success;
14253 /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
14254 if (have66noF2noF3(pfx) && sz == 2) {
14255 modrm = getUChar(delta);
14256 if (epartIsReg(modrm)) {
14257 delta += 1;
14258 putXMMReg( eregOfRexRM(pfx,modrm),
14259 getXMMReg(gregOfRexRM(pfx,modrm)) );
14260 DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14261 nameXMMReg(eregOfRexRM(pfx,modrm)));
14262 } else {
14263 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14264 gen_SIGNAL_if_not_16_aligned( vbi, addr );
14265 delta += alen;
14266 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14267 DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14269 goto decode_success;
14271 break;
14273 case 0xAE:
14274 /* 0F AE /7 = SFENCE -- flush pending operations to memory */
14275 if (haveNo66noF2noF3(pfx)
14276 && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
14277 && sz == 4) {
14278 delta += 1;
14279 /* Insert a memory fence. It's sometimes important that these
14280 are carried through to the generated code. */
14281 stmt( IRStmt_MBE(Imbe_Fence) );
14282 DIP("sfence\n");
14283 goto decode_success;
14285 /* mindless duplication follows .. */
14286 /* 0F AE /5 = LFENCE -- flush pending operations to memory */
14287 /* 0F AE /6 = MFENCE -- flush pending operations to memory */
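/* All three fences get identical treatment: each becomes a single
   generic Imbe_Fence statement. */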
14288 if (haveNo66noF2noF3(pfx)
14289 && epartIsReg(getUChar(delta))
14290 && (gregLO3ofRM(getUChar(delta)) == 5
14291 || gregLO3ofRM(getUChar(delta)) == 6)
14292 && sz == 4) {
14293 delta += 1;
14294 /* Insert a memory fence. It's sometimes important that these
14295 are carried through to the generated code. */
14296 stmt( IRStmt_MBE(Imbe_Fence) );
14297 DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
14298 goto decode_success;
14301 /* 0F AE /7 = CLFLUSH -- flush cache line */
14302 if (haveNo66noF2noF3(pfx)
14303 && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
14304 && sz == 4) {
14306 /* This is something of a hack. We need to know the size of
14307 the cache line containing addr. Since we don't (easily),
14308 assume 256 on the basis that no real cache would have a
14309 line that big. It's safe to invalidate more stuff than we
14310 need, just inefficient. */
14311 ULong lineszB = 256ULL;
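/* With lineszB == 256, the And64 below rounds addr down to a
   256-byte boundary; CMSTART/CMLEN then describe the (possibly
   over-sized) range whose cached translations get invalidated. */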
14313 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14314 delta += alen;
14316 /* Round addr down to the start of the containing block. */
14317 stmt( IRStmt_Put(
14318 OFFB_CMSTART,
14319 binop( Iop_And64,
14320 mkexpr(addr),
14321 mkU64( ~(lineszB-1) ))) );
14323 stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
14325 jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
14327 DIP("clflush %s\n", dis_buf);
14328 goto decode_success;
14331 /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
14332 if (haveNo66noF2noF3(pfx)
14333 && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
14334 && sz == 4) {
14335 delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
14336 goto decode_success;
14338 /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
14339 if (haveNo66noF2noF3(pfx)
14340 && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
14341 && sz == 4) {
14342 delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
14343 goto decode_success;
14345 /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
14346 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14347 && !epartIsReg(getUChar(delta))
14348 && gregOfRexRM(pfx,getUChar(delta)) == 0) {
14349 delta = dis_FXSAVE(vbi, pfx, delta, sz);
14350 goto decode_success;
14352 /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
14353 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14354 && !epartIsReg(getUChar(delta))
14355 && gregOfRexRM(pfx,getUChar(delta)) == 1) {
14356 delta = dis_FXRSTOR(vbi, pfx, delta, sz);
14357 goto decode_success;
14359 /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
14360 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14361 && !epartIsReg(getUChar(delta))
14362 && gregOfRexRM(pfx,getUChar(delta)) == 4
14363 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
14364 delta = dis_XSAVE(vbi, pfx, delta, sz);
14365 goto decode_success;
14367 /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
14368 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14369 && !epartIsReg(getUChar(delta))
14370 && gregOfRexRM(pfx,getUChar(delta)) == 5
14371 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
14372 delta = dis_XRSTOR(vbi, pfx, delta, sz);
14373 goto decode_success;
14375 break;
14377 case 0xC2:
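/* In all four variants below, dis_SSE_cmp_E_to_G leaves delta
   unchanged if it does not recognise the comparison immediate; the
   delta > delta0 checks therefore treat an unadvanced delta as a
   decode failure. */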
14378 /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
14379 if (haveNo66noF2noF3(pfx) && sz == 4) {
14380 Long delta0 = delta;
14381 delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
14382 if (delta > delta0) goto decode_success;
14384 /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
14385 if (haveF3no66noF2(pfx) && sz == 4) {
14386 Long delta0 = delta;
14387 delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
14388 if (delta > delta0) goto decode_success;
14390 /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
14391 if (haveF2no66noF3(pfx) && sz == 4) {
14392 Long delta0 = delta;
14393 delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
14394 if (delta > delta0) goto decode_success;
14396 /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
14397 if (have66noF2noF3(pfx) && sz == 2) {
14398 Long delta0 = delta;
14399 delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
14400 if (delta > delta0) goto decode_success;
14402 break;
14404 case 0xC3:
14405 /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
14406 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
14407 modrm = getUChar(delta);
14408 if (!epartIsReg(modrm)) {
14409 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14410 storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
14411 DIP("movnti %s,%s\n", dis_buf,
14412 nameIRegG(sz, pfx, modrm));
14413 delta += alen;
14414 goto decode_success;
14416 /* else fall through */
14418 break;
14420 case 0xC4:
14421 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14422 /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
14423 put it into the specified lane of mmx(G). */
14424 if (haveNo66noF2noF3(pfx)
14425 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14426 /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
14427 mmx reg. t4 is the new lane value. t5 is the original
14428 mmx value. t6 is the new mmx value. */
14429 Int lane;
14430 t4 = newTemp(Ity_I16);
14431 t5 = newTemp(Ity_I64);
14432 t6 = newTemp(Ity_I64);
14433 modrm = getUChar(delta);
14434 do_MMX_preamble();
14436 assign(t5, getMMXReg(gregLO3ofRM(modrm)));
14437 breakup64to16s( t5, &t3, &t2, &t1, &t0 );
14439 if (epartIsReg(modrm)) {
14440 assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
14441 delta += 1+1;
14442 lane = getUChar(delta-1);
14443 DIP("pinsrw $%d,%s,%s\n", lane,
14444 nameIReg16(eregOfRexRM(pfx,modrm)),
14445 nameMMXReg(gregLO3ofRM(modrm)));
14446 } else {
14447 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14448 delta += 1+alen;
14449 lane = getUChar(delta-1);
14450 assign(t4, loadLE(Ity_I16, mkexpr(addr)));
14451 DIP("pinsrw $%d,%s,%s\n", lane,
14452 dis_buf,
14453 nameMMXReg(gregLO3ofRM(modrm)));
14456 switch (lane & 3) {
14457 case 0: assign(t6, mk64from16s(t3,t2,t1,t4)); break;
14458 case 1: assign(t6, mk64from16s(t3,t2,t4,t0)); break;
14459 case 2: assign(t6, mk64from16s(t3,t4,t1,t0)); break;
14460 case 3: assign(t6, mk64from16s(t4,t2,t1,t0)); break;
14461 default: vassert(0);
14463 putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
14464 goto decode_success;
14466 /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
14467 put it into the specified lane of xmm(G). */
14468 if (have66noF2noF3(pfx)
14469 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14470 Int lane;
14471 t4 = newTemp(Ity_I16);
14472 modrm = getUChar(delta);
14473 UInt rG = gregOfRexRM(pfx,modrm);
14474 if (epartIsReg(modrm)) {
14475 UInt rE = eregOfRexRM(pfx,modrm);
14476 assign(t4, getIReg16(rE));
14477 delta += 1+1;
14478 lane = getUChar(delta-1);
14479 DIP("pinsrw $%d,%s,%s\n",
14480 lane, nameIReg16(rE), nameXMMReg(rG));
14481 } else {
14482 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
14483 1/*byte after the amode*/ );
14484 delta += 1+alen;
14485 lane = getUChar(delta-1);
14486 assign(t4, loadLE(Ity_I16, mkexpr(addr)));
14487 DIP("pinsrw $%d,%s,%s\n",
14488 lane, dis_buf, nameXMMReg(rG));
14490 IRTemp src_vec = newTemp(Ity_V128);
14491 assign(src_vec, getXMMReg(rG));
14492 IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
14493 putXMMReg(rG, mkexpr(res_vec));
14494 goto decode_success;
14496 break;
14498 case 0xC5:
14499 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14500 /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
14501 zero-extend of it in ireg(G). */
14502 if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
14503 modrm = getUChar(delta);
14504 if (epartIsReg(modrm)) {
14505 IRTemp sV = newTemp(Ity_I64);
14506 t5 = newTemp(Ity_I16);
14507 do_MMX_preamble();
14508 assign(sV, getMMXReg(eregLO3ofRM(modrm)));
14509 breakup64to16s( sV, &t3, &t2, &t1, &t0 );
14510 switch (getUChar(delta+1) & 3) {
14511 case 0: assign(t5, mkexpr(t0)); break;
14512 case 1: assign(t5, mkexpr(t1)); break;
14513 case 2: assign(t5, mkexpr(t2)); break;
14514 case 3: assign(t5, mkexpr(t3)); break;
14515 default: vassert(0);
14517 if (sz == 8)
14518 putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
14519 else
14520 putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
14521 DIP("pextrw $%d,%s,%s\n",
14522 (Int)getUChar(delta+1),
14523 nameMMXReg(eregLO3ofRM(modrm)),
14524 sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
14525 : nameIReg32(gregOfRexRM(pfx,modrm))
14527 delta += 2;
14528 goto decode_success;
14530 /* else fall through */
14531 /* note, for anyone filling in the mem case: this insn has one
14532 byte after the amode and therefore you must pass 1 as the
14533 last arg to disAMode */
14535 /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
14536 zero-extend of it in ireg(G). */
14537 if (have66noF2noF3(pfx)
14538 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14539 Long delta0 = delta;
14540 delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
14541 False/*!isAvx*/ );
14542 if (delta > delta0) goto decode_success;
14543 /* else fall through -- decoding has failed */
14545 break;
14547 case 0xC6:
14548 /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
14549 if (haveNo66noF2noF3(pfx) && sz == 4) {
14550 Int imm8 = 0;
14551 IRTemp sV = newTemp(Ity_V128);
14552 IRTemp dV = newTemp(Ity_V128);
14553 modrm = getUChar(delta);
14554 UInt rG = gregOfRexRM(pfx,modrm);
14555 assign( dV, getXMMReg(rG) );
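/* imm8 holds four 2-bit selectors: result lanes 0 and 1 come from
   dV (the G register), chosen by bits 1:0 and 3:2, and lanes 2 and
   3 come from sV (the E operand), chosen by bits 5:4 and 7:6.
   math_SHUFPS_128 does the actual work. */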
14556 if (epartIsReg(modrm)) {
14557 UInt rE = eregOfRexRM(pfx,modrm);
14558 assign( sV, getXMMReg(rE) );
14559 imm8 = (Int)getUChar(delta+1);
14560 delta += 1+1;
14561 DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
14562 } else {
14563 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14564 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14565 imm8 = (Int)getUChar(delta+alen);
14566 delta += 1+alen;
14567 DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
14569 IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
14570 putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
14571 goto decode_success;
14573 /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
14574 if (have66noF2noF3(pfx) && sz == 2) {
14575 Int select;
14576 IRTemp sV = newTemp(Ity_V128);
14577 IRTemp dV = newTemp(Ity_V128);
14579 modrm = getUChar(delta);
14580 assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
14582 if (epartIsReg(modrm)) {
14583 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
14584 select = (Int)getUChar(delta+1);
14585 delta += 1+1;
14586 DIP("shufpd $%d,%s,%s\n", select,
14587 nameXMMReg(eregOfRexRM(pfx,modrm)),
14588 nameXMMReg(gregOfRexRM(pfx,modrm)));
14589 } else {
14590 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14591 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14592 select = getUChar(delta+alen);
14593 delta += 1+alen;
14594 DIP("shufpd $%d,%s,%s\n", select,
14595 dis_buf,
14596 nameXMMReg(gregOfRexRM(pfx,modrm)));
14599 IRTemp res = math_SHUFPD_128( sV, dV, select );
14600 putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
14601 goto decode_success;
14603 break;
14605 case 0xD1:
14606 /* 66 0F D1 = PSRLW by E */
14607 if (have66noF2noF3(pfx) && sz == 2) {
14608 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
14609 goto decode_success;
14611 break;
14613 case 0xD2:
14614 /* 66 0F D2 = PSRLD by E */
14615 if (have66noF2noF3(pfx) && sz == 2) {
14616 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
14617 goto decode_success;
14619 break;
14621 case 0xD3:
14622 /* 66 0F D3 = PSRLQ by E */
14623 if (have66noF2noF3(pfx) && sz == 2) {
14624 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
14625 goto decode_success;
14627 break;
14629 case 0xD4:
14630 /* 66 0F D4 = PADDQ */
14631 if (have66noF2noF3(pfx) && sz == 2) {
14632 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14633 "paddq", Iop_Add64x2, False );
14634 goto decode_success;
14636 /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
14637 /* 0F D4 = PADDQ -- add 64x1 */
14638 if (haveNo66noF2noF3(pfx) && sz == 4) {
14639 do_MMX_preamble();
14640 delta = dis_MMXop_regmem_to_reg (
14641 vbi, pfx, delta, opc, "paddq", False );
14642 goto decode_success;
14644 break;
14646 case 0xD5:
14647 /* 66 0F D5 = PMULLW -- 16x8 multiply */
14648 if (have66noF2noF3(pfx) && sz == 2) {
14649 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14650 "pmullw", Iop_Mul16x8, False );
14651 goto decode_success;
14653 break;
14655 case 0xD6:
14656 /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
14657 hi half). */
14658 if (haveF3no66noF2(pfx) && sz == 4) {
14659 modrm = getUChar(delta);
14660 if (epartIsReg(modrm)) {
14661 do_MMX_preamble();
14662 putXMMReg( gregOfRexRM(pfx,modrm),
14663 unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
14664 DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14665 nameXMMReg(gregOfRexRM(pfx,modrm)));
14666 delta += 1;
14667 goto decode_success;
14669 /* apparently no mem case for this insn */
14671 /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
14672 or lo half xmm). */
14673 if (have66noF2noF3(pfx)
14674 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14675 modrm = getUChar(delta);
14676 if (epartIsReg(modrm)) {
14677 /* fall through, awaiting test case */
14678 /* dst: lo half copied, hi half zeroed */
14679 } else {
14680 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14681 storeLE( mkexpr(addr),
14682 getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
14683 DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
14684 delta += alen;
14685 goto decode_success;
14688 /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
14689 if (haveF2no66noF3(pfx) && sz == 4) {
14690 modrm = getUChar(delta);
14691 if (epartIsReg(modrm)) {
14692 do_MMX_preamble();
14693 putMMXReg( gregLO3ofRM(modrm),
14694 getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
14695 DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14696 nameMMXReg(gregLO3ofRM(modrm)));
14697 delta += 1;
14698 goto decode_success;
14700 /* apparently no mem case for this insn */
14702 break;
14704 case 0xD7:
14705 /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
14706 lanes in xmm(E), turn them into a 16-bit value, and put
14707 zero-extend of it in ireg(G). Doing this directly is just
14708 too cumbersome; give up therefore and call a helper. */
14709 if (have66noF2noF3(pfx)
14710 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
14711 && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
14712 delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
14713 goto decode_success;
14715 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14716 /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
14717 mmx(E), turn them into a byte, and put zero-extend of it in
14718 ireg(G). */
14719 if (haveNo66noF2noF3(pfx)
14720 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14721 modrm = getUChar(delta);
14722 if (epartIsReg(modrm)) {
14723 do_MMX_preamble();
14724 t0 = newTemp(Ity_I64);
14725 t1 = newTemp(Ity_I32);
14726 assign(t0, getMMXReg(eregLO3ofRM(modrm)));
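/* Iop_GetMSBs8x8 gathers the top bit of each of the 8 bytes into an
   8-bit value, which is then zero-extended to 32 bits for the
   destination ireg. */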
14727 assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
14728 putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
14729 DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14730 nameIReg32(gregOfRexRM(pfx,modrm)));
14731 delta += 1;
14732 goto decode_success;
14734 /* else fall through */
14736 break;
14738 case 0xD8:
14739 /* 66 0F D8 = PSUBUSB */
14740 if (have66noF2noF3(pfx) && sz == 2) {
14741 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14742 "psubusb", Iop_QSub8Ux16, False );
14743 goto decode_success;
14745 break;
14747 case 0xD9:
14748 /* 66 0F D9 = PSUBUSW */
14749 if (have66noF2noF3(pfx) && sz == 2) {
14750 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14751 "psubusw", Iop_QSub16Ux8, False );
14752 goto decode_success;
14754 break;
14756 case 0xDA:
14757 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14758 /* 0F DA = PMINUB -- 8x8 unsigned min */
14759 if (haveNo66noF2noF3(pfx) && sz == 4) {
14760 do_MMX_preamble();
14761 delta = dis_MMXop_regmem_to_reg (
14762 vbi, pfx, delta, opc, "pminub", False );
14763 goto decode_success;
14765 /* 66 0F DA = PMINUB -- 8x16 unsigned min */
14766 if (have66noF2noF3(pfx) && sz == 2) {
14767 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14768 "pminub", Iop_Min8Ux16, False );
14769 goto decode_success;
14771 break;
14773 case 0xDB:
14774 /* 66 0F DB = PAND */
14775 if (have66noF2noF3(pfx) && sz == 2) {
14776 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
14777 goto decode_success;
14779 break;
14781 case 0xDC:
14782 /* 66 0F DC = PADDUSB */
14783 if (have66noF2noF3(pfx) && sz == 2) {
14784 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14785 "paddusb", Iop_QAdd8Ux16, False );
14786 goto decode_success;
14788 break;
14790 case 0xDD:
14791 /* 66 0F DD = PADDUSW */
14792 if (have66noF2noF3(pfx) && sz == 2) {
14793 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14794 "paddusw", Iop_QAdd16Ux8, False );
14795 goto decode_success;
14797 break;
14799 case 0xDE:
14800 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14801 /* 0F DE = PMAXUB -- 8x8 unsigned max */
14802 if (haveNo66noF2noF3(pfx) && sz == 4) {
14803 do_MMX_preamble();
14804 delta = dis_MMXop_regmem_to_reg (
14805 vbi, pfx, delta, opc, "pmaxub", False );
14806 goto decode_success;
14808 /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
14809 if (have66noF2noF3(pfx) && sz == 2) {
14810 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14811 "pmaxub", Iop_Max8Ux16, False );
14812 goto decode_success;
14814 break;
14816 case 0xDF:
14817 /* 66 0F DF = PANDN */
14818 if (have66noF2noF3(pfx) && sz == 2) {
14819 delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
14820 goto decode_success;
14822 break;
14824 case 0xE0:
14825 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14826 /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
14827 if (haveNo66noF2noF3(pfx) && sz == 4) {
14828 do_MMX_preamble();
14829 delta = dis_MMXop_regmem_to_reg (
14830 vbi, pfx, delta, opc, "pavgb", False );
14831 goto decode_success;
14833 /* 66 0F E0 = PAVGB */
14834 if (have66noF2noF3(pfx) && sz == 2) {
14835 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14836 "pavgb", Iop_Avg8Ux16, False );
14837 goto decode_success;
14839 break;
14841 case 0xE1:
14842 /* 66 0F E1 = PSRAW by E */
14843 if (have66noF2noF3(pfx) && sz == 2) {
14844 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
14845 goto decode_success;
14847 break;
14849 case 0xE2:
14850 /* 66 0F E2 = PSRAD by E */
14851 if (have66noF2noF3(pfx) && sz == 2) {
14852 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
14853 goto decode_success;
14855 break;
14857 case 0xE3:
14858 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14859 /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
14860 if (haveNo66noF2noF3(pfx) && sz == 4) {
14861 do_MMX_preamble();
14862 delta = dis_MMXop_regmem_to_reg (
14863 vbi, pfx, delta, opc, "pavgw", False );
14864 goto decode_success;
14866 /* 66 0F E3 = PAVGW */
14867 if (have66noF2noF3(pfx) && sz == 2) {
14868 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14869 "pavgw", Iop_Avg16Ux8, False );
14870 goto decode_success;
14872 break;
14874 case 0xE4:
14875 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14876 /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
14877 if (haveNo66noF2noF3(pfx) && sz == 4) {
14878 do_MMX_preamble();
14879 delta = dis_MMXop_regmem_to_reg (
14880 vbi, pfx, delta, opc, "pmuluh", False );
14881 goto decode_success;
14883 /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
14884 if (have66noF2noF3(pfx) && sz == 2) {
14885 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14886 "pmulhuw", Iop_MulHi16Ux8, False );
14887 goto decode_success;
14889 break;
14891 case 0xE5:
14892 /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
14893 if (have66noF2noF3(pfx) && sz == 2) {
14894 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14895 "pmulhw", Iop_MulHi16Sx8, False );
14896 goto decode_success;
14898 break;
14900 case 0xE6:
14901 /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14902 lo half xmm(G), and zero upper half, rounding towards zero */
14903 /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14904 lo half xmm(G), according to prevailing rounding mode, and zero
14905 upper half */
14906 if ( (haveF2no66noF3(pfx) && sz == 4)
14907 || (have66noF2noF3(pfx) && sz == 2) ) {
14908 delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
14909 toBool(sz == 2)/*r2zero*/);
14910 goto decode_success;
14912 /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
14913 F64 in xmm(G) */
14914 if (haveF3no66noF2(pfx) && sz == 4) {
14915 delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
14916 goto decode_success;
14918 break;
14920 case 0xE7:
14921 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14922 /* 0F E7 = MOVNTQ -- for us, just a plain MMX store. Note, the
14923 Intel manual does not say anything about the usual business of
14924 the FP reg tags getting trashed whenever an MMX insn happens.
14925 So we just leave them alone. */
14927 if (haveNo66noF2noF3(pfx) && sz == 4) {
14928 modrm = getUChar(delta);
14929 if (!epartIsReg(modrm)) {
14930 /* do_MMX_preamble(); Intel docs don't specify this */
14931 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14932 storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
14933 DIP("movntq %s,%s\n", dis_buf,
14934 nameMMXReg(gregLO3ofRM(modrm)));
14935 delta += alen;
14936 goto decode_success;
14938 /* else fall through */
14940 /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
14941 if (have66noF2noF3(pfx) && sz == 2) {
14942 modrm = getUChar(delta);
14943 if (!epartIsReg(modrm)) {
14944 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14945 gen_SIGNAL_if_not_16_aligned( vbi, addr );
14946 storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14947 DIP("movntdq %s,%s\n", dis_buf,
14948 nameXMMReg(gregOfRexRM(pfx,modrm)));
14949 delta += alen;
14950 goto decode_success;
14952 /* else fall through */
14954 break;
14956 case 0xE8:
14957 /* 66 0F E8 = PSUBSB */
14958 if (have66noF2noF3(pfx) && sz == 2) {
14959 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14960 "psubsb", Iop_QSub8Sx16, False );
14961 goto decode_success;
14963 break;
14965 case 0xE9:
14966 /* 66 0F E9 = PSUBSW */
14967 if (have66noF2noF3(pfx) && sz == 2) {
14968 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14969 "psubsw", Iop_QSub16Sx8, False );
14970 goto decode_success;
14972 break;
14974 case 0xEA:
14975 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14976 /* 0F EA = PMINSW -- 16x4 signed min */
14977 if (haveNo66noF2noF3(pfx) && sz == 4) {
14978 do_MMX_preamble();
14979 delta = dis_MMXop_regmem_to_reg (
14980 vbi, pfx, delta, opc, "pminsw", False );
14981 goto decode_success;
14983 /* 66 0F EA = PMINSW -- 16x8 signed min */
14984 if (have66noF2noF3(pfx) && sz == 2) {
14985 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14986 "pminsw", Iop_Min16Sx8, False );
14987 goto decode_success;
14989 break;
14991 case 0xEB:
14992 /* 66 0F EB = POR */
14993 if (have66noF2noF3(pfx) && sz == 2) {
14994 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
14995 goto decode_success;
14997 break;
14999 case 0xEC:
15000 /* 66 0F EC = PADDSB */
15001 if (have66noF2noF3(pfx) && sz == 2) {
15002 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15003 "paddsb", Iop_QAdd8Sx16, False );
15004 goto decode_success;
15006 break;
15008 case 0xED:
15009 /* 66 0F ED = PADDSW */
15010 if (have66noF2noF3(pfx) && sz == 2) {
15011 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15012 "paddsw", Iop_QAdd16Sx8, False );
15013 goto decode_success;
15015 break;
15017 case 0xEE:
15018 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
15019 /* 0F EE = PMAXSW -- 16x4 signed max */
15020 if (haveNo66noF2noF3(pfx) && sz == 4) {
15021 do_MMX_preamble();
15022 delta = dis_MMXop_regmem_to_reg (
15023 vbi, pfx, delta, opc, "pmaxsw", False );
15024 goto decode_success;
15026 /* 66 0F EE = PMAXSW -- 16x8 signed max */
15027 if (have66noF2noF3(pfx) && sz == 2) {
15028 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15029 "pmaxsw", Iop_Max16Sx8, False );
15030 goto decode_success;
15032 break;
15034 case 0xEF:
15035 /* 66 0F EF = PXOR */
15036 if (have66noF2noF3(pfx) && sz == 2) {
15037 delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
15038 goto decode_success;
15040 break;
15042 case 0xF1:
15043 /* 66 0F F1 = PSLLW by E */
15044 if (have66noF2noF3(pfx) && sz == 2) {
15045 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
15046 goto decode_success;
15048 break;
15050 case 0xF2:
15051 /* 66 0F F2 = PSLLD by E */
15052 if (have66noF2noF3(pfx) && sz == 2) {
15053 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
15054 goto decode_success;
15056 break;
15058 case 0xF3:
15059 /* 66 0F F3 = PSLLQ by E */
15060 if (have66noF2noF3(pfx) && sz == 2) {
15061 delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
15062 goto decode_success;
15064 break;
15066 case 0xF4:
15067 /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
15068 0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
15069 half */
15070 if (have66noF2noF3(pfx) && sz == 2) {
15071 IRTemp sV = newTemp(Ity_V128);
15072 IRTemp dV = newTemp(Ity_V128);
15073 modrm = getUChar(delta);
15074 UInt rG = gregOfRexRM(pfx,modrm);
15075 assign( dV, getXMMReg(rG) );
15076 if (epartIsReg(modrm)) {
15077 UInt rE = eregOfRexRM(pfx,modrm);
15078 assign( sV, getXMMReg(rE) );
15079 delta += 1;
15080 DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15081 } else {
15082 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15083 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15084 delta += alen;
15085 DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
15087 putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
15088 goto decode_success;
15090 /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
15091 /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
15092 0 to form 64-bit result */
15093 if (haveNo66noF2noF3(pfx) && sz == 4) {
15094 IRTemp sV = newTemp(Ity_I64);
15095 IRTemp dV = newTemp(Ity_I64);
15096 t1 = newTemp(Ity_I32);
15097 t0 = newTemp(Ity_I32);
15098 modrm = getUChar(delta);
15100 do_MMX_preamble();
15101 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15103 if (epartIsReg(modrm)) {
15104 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15105 delta += 1;
15106 DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15107 nameMMXReg(gregLO3ofRM(modrm)));
15108 } else {
15109 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15110 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15111 delta += alen;
15112 DIP("pmuludq %s,%s\n", dis_buf,
15113 nameMMXReg(gregLO3ofRM(modrm)));
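/* Each operand's low 32 bits are extracted with Iop_64to32; Iop_MullU32
   then forms the full 64-bit unsigned product, which fills the MMX reg. */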
15116 assign( t0, unop(Iop_64to32, mkexpr(dV)) );
15117 assign( t1, unop(Iop_64to32, mkexpr(sV)) );
15118 putMMXReg( gregLO3ofRM(modrm),
15119 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
15120 goto decode_success;
15122 break;
15124 case 0xF5:
15125 /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
15126 E(xmm or mem) to G(xmm) */
15127 if (have66noF2noF3(pfx) && sz == 2) {
15128 IRTemp sV = newTemp(Ity_V128);
15129 IRTemp dV = newTemp(Ity_V128);
15130 modrm = getUChar(delta);
15131 UInt rG = gregOfRexRM(pfx,modrm);
15132 if (epartIsReg(modrm)) {
15133 UInt rE = eregOfRexRM(pfx,modrm);
15134 assign( sV, getXMMReg(rE) );
15135 delta += 1;
15136 DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15137 } else {
15138 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15139 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15140 delta += alen;
15141 DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
15143 assign( dV, getXMMReg(rG) );
15144 putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
15145 goto decode_success;
15147 break;
15149 case 0xF6:
15150 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
15151 /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
15152 if (haveNo66noF2noF3(pfx) && sz == 4) {
15153 do_MMX_preamble();
15154 delta = dis_MMXop_regmem_to_reg (
15155 vbi, pfx, delta, opc, "psadbw", False );
15156 goto decode_success;
15158 /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
15159 from E(xmm or mem) to G(xmm) */
15160 if (have66noF2noF3(pfx) && sz == 2) {
15161 IRTemp sV = newTemp(Ity_V128);
15162 IRTemp dV = newTemp(Ity_V128);
15163 modrm = getUChar(delta);
15164 UInt rG = gregOfRexRM(pfx,modrm);
15165 if (epartIsReg(modrm)) {
15166 UInt rE = eregOfRexRM(pfx,modrm);
15167 assign( sV, getXMMReg(rE) );
15168 delta += 1;
15169 DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15170 } else {
15171 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15172 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15173 delta += alen;
15174 DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
15176 assign( dV, getXMMReg(rG) );
15177 putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
15179 goto decode_success;
15181 break;
15183 case 0xF7:
15184 /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
15185 /* 0F F7 = MASKMOVQ -- 8x8 masked store */
15186 if (haveNo66noF2noF3(pfx) && sz == 4) {
15187 Bool ok = False;
15188 delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
15189 if (ok) goto decode_success;
15191 /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
15192 if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
15193 delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
15194 goto decode_success;
15196 break;
15198 case 0xF8:
15199 /* 66 0F F8 = PSUBB */
15200 if (have66noF2noF3(pfx) && sz == 2) {
15201 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15202 "psubb", Iop_Sub8x16, False );
15203 goto decode_success;
15205 break;
15207 case 0xF9:
15208 /* 66 0F F9 = PSUBW */
15209 if (have66noF2noF3(pfx) && sz == 2) {
15210 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15211 "psubw", Iop_Sub16x8, False );
15212 goto decode_success;
15214 break;
15216 case 0xFA:
15217 /* 66 0F FA = PSUBD */
15218 if (have66noF2noF3(pfx) && sz == 2) {
15219 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15220 "psubd", Iop_Sub32x4, False );
15221 goto decode_success;
15223 break;
15225 case 0xFB:
15226 /* 66 0F FB = PSUBQ */
15227 if (have66noF2noF3(pfx) && sz == 2) {
15228 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15229 "psubq", Iop_Sub64x2, False );
15230 goto decode_success;
15232 /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
15233 /* 0F FB = PSUBQ -- sub 64x1 */
15234 if (haveNo66noF2noF3(pfx) && sz == 4) {
15235 do_MMX_preamble();
15236 delta = dis_MMXop_regmem_to_reg (
15237 vbi, pfx, delta, opc, "psubq", False );
15238 goto decode_success;
15240 break;
15242 case 0xFC:
15243 /* 66 0F FC = PADDB */
15244 if (have66noF2noF3(pfx) && sz == 2) {
15245 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15246 "paddb", Iop_Add8x16, False );
15247 goto decode_success;
15249 break;
15251 case 0xFD:
15252 /* 66 0F FD = PADDW */
15253 if (have66noF2noF3(pfx) && sz == 2) {
15254 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15255 "paddw", Iop_Add16x8, False );
15256 goto decode_success;
15258 break;
15260 case 0xFE:
15261 /* 66 0F FE = PADDD */
15262 if (have66noF2noF3(pfx) && sz == 2) {
15263 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15264 "paddd", Iop_Add32x4, False );
15265 goto decode_success;
15267 break;
15269 default:
15270 goto decode_failure;
15274 decode_failure:
15275 *decode_OK = False;
15276 return deltaIN;
15278 decode_success:
15279 *decode_OK = True;
15280 return delta;
15284 /*------------------------------------------------------------*/
15285 /*--- ---*/
15286 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3 ---*/
15287 /*--- ---*/
15288 /*------------------------------------------------------------*/
15290 static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
15291 Long delta, Bool isAvx )
15293 IRTemp addr = IRTemp_INVALID;
15294 Int alen = 0;
15295 HChar dis_buf[50];
15296 IRTemp sV = newTemp(Ity_V128);
15297 IRTemp d0 = newTemp(Ity_I64);
15298 UChar modrm = getUChar(delta);
15299 UInt rG = gregOfRexRM(pfx,modrm);
15300 if (epartIsReg(modrm)) {
15301 UInt rE = eregOfRexRM(pfx,modrm);
15302 assign( sV, getXMMReg(rE) );
15303 DIP("%smovddup %s,%s\n",
15304 isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
15305 delta += 1;
15306 assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
15307 } else {
15308 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15309 assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
15310 DIP("%smovddup %s,%s\n",
15311 isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
15312 delta += alen;
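/* Both 64-bit halves of the destination receive a copy of d0, the
   source's low quadword. */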
15314 (isAvx ? putYMMRegLoAndZU : putXMMReg)
15315 ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
15316 return delta;
15320 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
15321 Long delta )
15323 IRTemp addr = IRTemp_INVALID;
15324 Int alen = 0;
15325 HChar dis_buf[50];
15326 IRTemp d0 = newTemp(Ity_I64);
15327 IRTemp d1 = newTemp(Ity_I64);
15328 UChar modrm = getUChar(delta);
15329 UInt rG = gregOfRexRM(pfx,modrm);
15330 if (epartIsReg(modrm)) {
15331 UInt rE = eregOfRexRM(pfx,modrm);
15332 DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
15333 delta += 1;
15334 assign ( d0, getYMMRegLane64(rE, 0) );
15335 assign ( d1, getYMMRegLane64(rE, 2) );
15336 } else {
15337 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15338 assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
15339 assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
15340 mkexpr(addr), mkU64(16))) );
15341 DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
15342 delta += alen;
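/* Lanes 0/1 get d0 (low qword of the low 128-bit half), lanes 2/3 get d1
   (low qword of the high half), i.e. MOVDDUP applied to each half. */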
15344 putYMMRegLane64( rG, 0, mkexpr(d0) );
15345 putYMMRegLane64( rG, 1, mkexpr(d0) );
15346 putYMMRegLane64( rG, 2, mkexpr(d1) );
15347 putYMMRegLane64( rG, 3, mkexpr(d1) );
15348 return delta;
15352 static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
15353 Long delta, Bool isAvx, Bool isL )
15355 IRTemp addr = IRTemp_INVALID;
15356 Int alen = 0;
15357 HChar dis_buf[50];
15358 IRTemp sV = newTemp(Ity_V128);
15359 UChar modrm = getUChar(delta);
15360 UInt rG = gregOfRexRM(pfx,modrm);
15361 IRTemp s3, s2, s1, s0;
15362 s3 = s2 = s1 = s0 = IRTemp_INVALID;
15363 if (epartIsReg(modrm)) {
15364 UInt rE = eregOfRexRM(pfx,modrm);
15365 assign( sV, getXMMReg(rE) );
15366 DIP("%smovs%cdup %s,%s\n",
15367 isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
15368 delta += 1;
15369 } else {
15370 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15371 if (!isAvx)
15372 gen_SIGNAL_if_not_16_aligned( vbi, addr );
15373 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15374 DIP("%smovs%cdup %s,%s\n",
15375 isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
15376 delta += alen;
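/* MOVSLDUP (isL) replicates source lanes 0 and 2; MOVSHDUP replicates
   lanes 1 and 3. */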
15378 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
15379 (isAvx ? putYMMRegLoAndZU : putXMMReg)
15380 ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
15381 : mkV128from32s( s3, s3, s1, s1 ) );
15382 return delta;
15386 static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
15387 Long delta, Bool isL )
15389 IRTemp addr = IRTemp_INVALID;
15390 Int alen = 0;
15391 HChar dis_buf[50];
15392 IRTemp sV = newTemp(Ity_V256);
15393 UChar modrm = getUChar(delta);
15394 UInt rG = gregOfRexRM(pfx,modrm);
15395 IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
15396 s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
15397 if (epartIsReg(modrm)) {
15398 UInt rE = eregOfRexRM(pfx,modrm);
15399 assign( sV, getYMMReg(rE) );
15400 DIP("vmovs%cdup %s,%s\n",
15401 isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
15402 delta += 1;
15403 } else {
15404 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15405 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
15406 DIP("vmovs%cdup %s,%s\n",
15407 isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
15408 delta += alen;
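/* Same lane replication as the 128-bit form, applied to each 128-bit
   half of the YMM source independently. */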
15410 breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
15411 putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
15412 : mkV128from32s( s7, s7, s5, s5 ) );
15413 putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
15414 : mkV128from32s( s3, s3, s1, s1 ) );
15415 return delta;
15419 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
15421 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
15422 IRTemp leftV = newTemp(Ity_V128);
15423 IRTemp rightV = newTemp(Ity_V128);
15424 IRTemp rm = newTemp(Ity_I32);
15425 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
15427 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
15428 breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
15430 assign( leftV, mkV128from32s( s2, s0, d2, d0 ) );
15431 assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
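/* Result lanes, high to low: [ s2 op s3 : s0 op s1 : d2 op d3 : d0 op d1 ],
   which is the HADDPS/HSUBPS pairwise combine of G (d) and E (s). */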
15433 IRTemp res = newTemp(Ity_V128);
15434 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
15435 assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
15436 mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
15437 return res;
15441 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
15443 IRTemp s1, s0, d1, d0;
15444 IRTemp leftV = newTemp(Ity_V128);
15445 IRTemp rightV = newTemp(Ity_V128);
15446 IRTemp rm = newTemp(Ity_I32);
15447 s1 = s0 = d1 = d0 = IRTemp_INVALID;
15449 breakupV128to64s( sV, &s1, &s0 );
15450 breakupV128to64s( dV, &d1, &d0 );
15452 assign( leftV, binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
15453 assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
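/* Result, high:low = [ s0 op s1 : d0 op d1 ], the HADDPD/HSUBPD pairwise
   combine of G (d) and E (s). */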
15455 IRTemp res = newTemp(Ity_V128);
15456 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
15457 assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
15458 mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
15459 return res;
15463 __attribute__((noinline))
15464 static
15465 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
15466 const VexAbiInfo* vbi,
15467 Prefix pfx, Int sz, Long deltaIN )
15469 IRTemp addr = IRTemp_INVALID;
15470 UChar modrm = 0;
15471 Int alen = 0;
15472 HChar dis_buf[50];
15474 *decode_OK = False;
15476 Long delta = deltaIN;
15477 UChar opc = getUChar(delta);
15478 delta++;
15479 switch (opc) {
15481 case 0x12:
15482 /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
15483 duplicating some lanes (2:2:0:0). */
15484 if (haveF3no66noF2(pfx) && sz == 4) {
15485 delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
15486 True/*isL*/ );
15487 goto decode_success;
15489 /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
15490 duplicating some lanes (1:0:1:0). */
15491 if (haveF2no66noF3(pfx)
15492 && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
15493 delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
15494 goto decode_success;
15496 break;
15498 case 0x16:
15499 /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
15500 duplicating some lanes (3:3:1:1). */
15501 if (haveF3no66noF2(pfx) && sz == 4) {
15502 delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
15503 False/*!isL*/ );
15504 goto decode_success;
15506 break;
15508 case 0x7C:
15509 case 0x7D:
15510 /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
15511 /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
15512 if (haveF2no66noF3(pfx) && sz == 4) {
15513 IRTemp eV = newTemp(Ity_V128);
15514 IRTemp gV = newTemp(Ity_V128);
15515 Bool isAdd = opc == 0x7C;
15516 const HChar* str = isAdd ? "add" : "sub";
15517 modrm = getUChar(delta);
15518 UInt rG = gregOfRexRM(pfx,modrm);
15519 if (epartIsReg(modrm)) {
15520 UInt rE = eregOfRexRM(pfx,modrm);
15521 assign( eV, getXMMReg(rE) );
15522 DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
15523 delta += 1;
15524 } else {
15525 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15526 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15527 DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
15528 delta += alen;
15531 assign( gV, getXMMReg(rG) );
15532 putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
15533 goto decode_success;
15535 /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
15536 /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
15537 if (have66noF2noF3(pfx) && sz == 2) {
15538 IRTemp eV = newTemp(Ity_V128);
15539 IRTemp gV = newTemp(Ity_V128);
15540 Bool isAdd = opc == 0x7C;
15541 const HChar* str = isAdd ? "add" : "sub";
15542 modrm = getUChar(delta);
15543 UInt rG = gregOfRexRM(pfx,modrm);
15544 if (epartIsReg(modrm)) {
15545 UInt rE = eregOfRexRM(pfx,modrm);
15546 assign( eV, getXMMReg(rE) );
15547 DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
15548 delta += 1;
15549 } else {
15550 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15551 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15552 DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
15553 delta += alen;
15556 assign( gV, getXMMReg(rG) );
15557 putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
15558 goto decode_success;
15560 break;
15562 case 0xD0:
15563 /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
15564 if (have66noF2noF3(pfx) && sz == 2) {
15565 IRTemp eV = newTemp(Ity_V128);
15566 IRTemp gV = newTemp(Ity_V128);
15567 modrm = getUChar(delta);
15568 UInt rG = gregOfRexRM(pfx,modrm);
15569 if (epartIsReg(modrm)) {
15570 UInt rE = eregOfRexRM(pfx,modrm);
15571 assign( eV, getXMMReg(rE) );
15572 DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15573 delta += 1;
15574 } else {
15575 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15576 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15577 DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
15578 delta += alen;
15581 assign( gV, getXMMReg(rG) );
15582 putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
15583 goto decode_success;
15585 /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
15586 if (haveF2no66noF3(pfx) && sz == 4) {
15587 IRTemp eV = newTemp(Ity_V128);
15588 IRTemp gV = newTemp(Ity_V128);
15589 modrm = getUChar(delta);
15590 UInt rG = gregOfRexRM(pfx,modrm);
15592 modrm = getUChar(delta);
15593 if (epartIsReg(modrm)) {
15594 UInt rE = eregOfRexRM(pfx,modrm);
15595 assign( eV, getXMMReg(rE) );
15596 DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15597 delta += 1;
15598 } else {
15599 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15600 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15601 DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
15602 delta += alen;
15605 assign( gV, getXMMReg(rG) );
15606 putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
15607 goto decode_success;
15609 break;
15611 case 0xF0:
15612 /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
15613 if (haveF2no66noF3(pfx) && sz == 4) {
15614 modrm = getUChar(delta);
15615 if (epartIsReg(modrm)) {
15616 goto decode_failure;
15617 } else {
15618 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15619 putXMMReg( gregOfRexRM(pfx,modrm),
15620 loadLE(Ity_V128, mkexpr(addr)) );
15621 DIP("lddqu %s,%s\n", dis_buf,
15622 nameXMMReg(gregOfRexRM(pfx,modrm)));
15623 delta += alen;
15625 goto decode_success;
15627 break;
15629 default:
15630 goto decode_failure;
15634 decode_failure:
15635 *decode_OK = False;
15636 return deltaIN;
15638 decode_success:
15639 *decode_OK = True;
15640 return delta;
15644 /*------------------------------------------------------------*/
15645 /*--- ---*/
15646 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3 ---*/
15647 /*--- ---*/
15648 /*------------------------------------------------------------*/
15650 static
15651 IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
15653 IRTemp halfMask = newTemp(Ity_I64);
15654 assign(halfMask, mkU64(0x8F8F8F8F8F8F8F8FULL));
15655 IRExpr* mask = binop(Iop_64HLtoV128, mkexpr(halfMask), mkexpr(halfMask));
15656 IRTemp res = newTemp(Ity_V128);
15657 assign(res,
15658 binop(Iop_PermOrZero8x16,
15659 mkexpr(dV),
15660 // Mask off bits [6:4] of each source operand lane
15661 binop(Iop_AndV128, mkexpr(sV), mask)
15663 return res;
15667 static
15668 IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
15670 IRTemp sHi, sLo, dHi, dLo;
15671 sHi = sLo = dHi = dLo = IRTemp_INVALID;
15672 breakupV256toV128s( dV, &dHi, &dLo);
15673 breakupV256toV128s( sV, &sHi, &sLo);
15674 IRTemp res = newTemp(Ity_V256);
15675 assign(res, binop(Iop_V128HLtoV256,
15676 mkexpr(math_PSHUFB_XMM(dHi, sHi)),
15677 mkexpr(math_PSHUFB_XMM(dLo, sLo))));
15678 return res;
15682 static Long dis_PHADD_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
15683 Bool isAvx, UChar opc )
15685 IRTemp addr = IRTemp_INVALID;
15686 Int alen = 0;
15687 HChar dis_buf[50];
15688 const HChar* str = "???";
15689 IROp opV64 = Iop_INVALID;
15690 IROp opCatO = Iop_CatOddLanes16x4;
15691 IROp opCatE = Iop_CatEvenLanes16x4;
15692 IRTemp sV = newTemp(Ity_V128);
15693 IRTemp dV = newTemp(Ity_V128);
15694 IRTemp sHi = newTemp(Ity_I64);
15695 IRTemp sLo = newTemp(Ity_I64);
15696 IRTemp dHi = newTemp(Ity_I64);
15697 IRTemp dLo = newTemp(Ity_I64);
15698 UChar modrm = getUChar(delta);
15699 UInt rG = gregOfRexRM(pfx,modrm);
15700 UInt rV = isAvx ? getVexNvvvv(pfx) : rG;
15702 switch (opc) {
15703 case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
15704 case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
15705 case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15706 case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
15707 case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
15708 case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15709 default: vassert(0);
15711 if (opc == 0x02 || opc == 0x06) {
15712 opCatO = Iop_InterleaveHI32x2;
15713 opCatE = Iop_InterleaveLO32x2;
15716 assign( dV, getXMMReg(rV) );
15718 if (epartIsReg(modrm)) {
15719 UInt rE = eregOfRexRM(pfx,modrm);
15720 assign( sV, getXMMReg(rE) );
15721 DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15722 nameXMMReg(rE), nameXMMReg(rG));
15723 delta += 1;
15724 } else {
15725 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15726 if (!isAvx)
15727 gen_SIGNAL_if_not_16_aligned( vbi, addr );
15728 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15729 DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15730 dis_buf, nameXMMReg(rG));
15731 delta += alen;
15734 assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15735 assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
15736 assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15737 assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
15739 /* This isn't a particularly efficient way to compute the
15740 result, but at least it avoids a proliferation of IROps,
15741 hence avoids complicating all the backends. */
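/* For each operand, opCatE/opCatO gather its even/odd lanes and opV64
   combines each even lane with its odd partner; the G/V (d) result fills
   the low 64 bits and the E (s) result the high 64 bits. */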
15743 (isAvx ? putYMMRegLoAndZU : putXMMReg)
15744 ( rG,
15745 binop(Iop_64HLtoV128,
15746 binop(opV64,
15747 binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
15748 binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
15749 binop(opV64,
15750 binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
15751 binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
15752 return delta;
15756 static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
15757 UChar opc )
15759 IRTemp addr = IRTemp_INVALID;
15760 Int alen = 0;
15761 HChar dis_buf[50];
15762 const HChar* str = "???";
15763 IROp opV64 = Iop_INVALID;
15764 IROp opCatO = Iop_CatOddLanes16x4;
15765 IROp opCatE = Iop_CatEvenLanes16x4;
15766 IRTemp sV = newTemp(Ity_V256);
15767 IRTemp dV = newTemp(Ity_V256);
15768 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
15769 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
15770 UChar modrm = getUChar(delta);
15771 UInt rG = gregOfRexRM(pfx,modrm);
15772 UInt rV = getVexNvvvv(pfx);
15774 switch (opc) {
15775 case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
15776 case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
15777 case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15778 case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
15779 case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
15780 case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15781 default: vassert(0);
15783 if (opc == 0x02 || opc == 0x06) {
15784 opCatO = Iop_InterleaveHI32x2;
15785 opCatE = Iop_InterleaveLO32x2;
15788 assign( dV, getYMMReg(rV) );
15790 if (epartIsReg(modrm)) {
15791 UInt rE = eregOfRexRM(pfx,modrm);
15792 assign( sV, getYMMReg(rE) );
15793 DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
15794 delta += 1;
15795 } else {
15796 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15797 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
15798 DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
15799 delta += alen;
15802 breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
15803 breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
15805 /* This isn't a particularly efficient way to compute the
15806 result, but at least it avoids a proliferation of IROps,
15807 hence avoids complicating all the backends. */
15809 putYMMReg( rG,
15810 binop(Iop_V128HLtoV256,
15811 binop(Iop_64HLtoV128,
15812 binop(opV64,
15813 binop(opCatE,mkexpr(s3),mkexpr(s2)),
15814 binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
15815 binop(opV64,
15816 binop(opCatE,mkexpr(d3),mkexpr(d2)),
15817 binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
15818 binop(Iop_64HLtoV128,
15819 binop(opV64,
15820 binop(opCatE,mkexpr(s1),mkexpr(s0)),
15821 binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
15822 binop(opV64,
15823 binop(opCatE,mkexpr(d1),mkexpr(d0)),
15824 binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
15825 return delta;
15829 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
15831 IRTemp res = newTemp(Ity_V128);
15832 assign(res, binop(Iop_PwExtUSMulQAdd8x16, mkexpr(dV), mkexpr(sV)));
15833 return res;
15837 static
15838 IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
15840 IRTemp sHi, sLo, dHi, dLo;
15841 sHi = sLo = dHi = dLo = IRTemp_INVALID;
15842 breakupV256toV128s( dV, &dHi, &dLo);
15843 breakupV256toV128s( sV, &sHi, &sLo);
15844 IRTemp res = newTemp(Ity_V256);
15845 assign(res, binop(Iop_V128HLtoV256,
15846 mkexpr(math_PMADDUBSW_128(dHi, sHi)),
15847 mkexpr(math_PMADDUBSW_128(dLo, sLo))));
15848 return res;
15852 __attribute__((noinline))
15853 static
15854 Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
15855 const VexAbiInfo* vbi,
15856 Prefix pfx, Int sz, Long deltaIN )
15858 IRTemp addr = IRTemp_INVALID;
15859 UChar modrm = 0;
15860 Int alen = 0;
15861 HChar dis_buf[50];
15863 *decode_OK = False;
15865 Long delta = deltaIN;
15866 UChar opc = getUChar(delta);
15867 delta++;
15868 switch (opc) {
15870 case 0x00:
15871 /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
15872 if (have66noF2noF3(pfx)
15873 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15874 IRTemp sV = newTemp(Ity_V128);
15875 IRTemp dV = newTemp(Ity_V128);
15877 modrm = getUChar(delta);
15878 assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15880 if (epartIsReg(modrm)) {
15881 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15882 delta += 1;
15883 DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
15884 nameXMMReg(gregOfRexRM(pfx,modrm)));
15885 } else {
15886 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15887 gen_SIGNAL_if_not_16_aligned( vbi, addr );
15888 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15889 delta += alen;
15890 DIP("pshufb %s,%s\n", dis_buf,
15891 nameXMMReg(gregOfRexRM(pfx,modrm)));
15894 IRTemp res = math_PSHUFB_XMM( dV, sV );
15895 putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
15896 goto decode_success;
15898 /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
15899 if (haveNo66noF2noF3(pfx) && sz == 4) {
15900 IRTemp sV = newTemp(Ity_I64);
15901 IRTemp dV = newTemp(Ity_I64);
15903 modrm = getUChar(delta);
15904 do_MMX_preamble();
15905 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15907 if (epartIsReg(modrm)) {
15908 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15909 delta += 1;
15910 DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15911 nameMMXReg(gregLO3ofRM(modrm)));
15912 } else {
15913 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15914 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15915 delta += alen;
15916 DIP("pshufb %s,%s\n", dis_buf,
15917 nameMMXReg(gregLO3ofRM(modrm)));
15920 putMMXReg(
15921 gregLO3ofRM(modrm),
15922 binop(
15923 Iop_PermOrZero8x8,
15924 mkexpr(dV),
15925 // Mask off bits [6:3] of each source operand lane
15926 binop(Iop_And64, mkexpr(sV), mkU64(0x8787878787878787ULL))
15929 goto decode_success;
15931 break;
15933 case 0x01:
15934 case 0x02:
15935 case 0x03:
15936 case 0x05:
15937 case 0x06:
15938 case 0x07:
15939 /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
15940 G to G (xmm). */
15941 /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
15942 G to G (xmm). */
15943 /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
15944 xmm) and G to G (xmm). */
15945 /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
15946 G to G (xmm). */
15947 /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
15948 G to G (xmm). */
15949 /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
15950 xmm) and G to G (xmm). */
15951 if (have66noF2noF3(pfx)
15952 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15953 delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
15954 goto decode_success;
15956 /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
15957 /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
15958 to G (mmx). */
15959 /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
15960 to G (mmx). */
15961 /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
15962 mmx) and G to G (mmx). */
15963 /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
15964 to G (mmx). */
15965 /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
15966 to G (mmx). */
15967 /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
15968 mmx) and G to G (mmx). */
15969 if (haveNo66noF2noF3(pfx) && sz == 4) {
15970 const HChar* str = "???";
15971 IROp opV64 = Iop_INVALID;
15972 IROp opCatO = Iop_CatOddLanes16x4;
15973 IROp opCatE = Iop_CatEvenLanes16x4;
15974 IRTemp sV = newTemp(Ity_I64);
15975 IRTemp dV = newTemp(Ity_I64);
15977 modrm = getUChar(delta);
15979 switch (opc) {
15980 case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
15981 case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
15982 case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15983 case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
15984 case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
15985 case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15986 default: vassert(0);
15988 if (opc == 0x02 || opc == 0x06) {
15989 opCatO = Iop_InterleaveHI32x2;
15990 opCatE = Iop_InterleaveLO32x2;
15993 do_MMX_preamble();
15994 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15996 if (epartIsReg(modrm)) {
15997 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15998 delta += 1;
15999 DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16000 nameMMXReg(gregLO3ofRM(modrm)));
16001 } else {
16002 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16003 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16004 delta += alen;
16005 DIP("ph%s %s,%s\n", str, dis_buf,
16006 nameMMXReg(gregLO3ofRM(modrm)));
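/* opCatE/opCatO collect the even/odd lanes of sV:dV; opV64 then combines
   each even lane with its odd partner, giving the horizontal op. */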
16009 putMMXReg(
16010 gregLO3ofRM(modrm),
16011 binop(opV64,
16012 binop(opCatE,mkexpr(sV),mkexpr(dV)),
16013 binop(opCatO,mkexpr(sV),mkexpr(dV))
16016 goto decode_success;
16018 break;
16020 case 0x04:
16021 /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
16022 Unsigned Bytes (XMM) */
16023 if (have66noF2noF3(pfx)
16024 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16025 IRTemp sV = newTemp(Ity_V128);
16026 IRTemp dV = newTemp(Ity_V128);
16027 modrm = getUChar(delta);
16028 UInt rG = gregOfRexRM(pfx,modrm);
16030 assign( dV, getXMMReg(rG) );
16032 if (epartIsReg(modrm)) {
16033 UInt rE = eregOfRexRM(pfx,modrm);
16034 assign( sV, getXMMReg(rE) );
16035 delta += 1;
16036 DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
16037 } else {
16038 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16039 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16040 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16041 delta += alen;
16042 DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
16045 putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
16046 goto decode_success;
16048 /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
16049 Unsigned Bytes (MMX) */
16050 if (haveNo66noF2noF3(pfx) && sz == 4) {
16051 IRTemp sV = newTemp(Ity_I64);
16052 IRTemp dV = newTemp(Ity_I64);
16053 IRTemp sVoddsSX = newTemp(Ity_I64);
16054 IRTemp sVevensSX = newTemp(Ity_I64);
16055 IRTemp dVoddsZX = newTemp(Ity_I64);
16056 IRTemp dVevensZX = newTemp(Ity_I64);
16058 modrm = getUChar(delta);
16059 do_MMX_preamble();
16060 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16062 if (epartIsReg(modrm)) {
16063 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16064 delta += 1;
16065 DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
16066 nameMMXReg(gregLO3ofRM(modrm)));
16067 } else {
16068 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16069 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16070 delta += alen;
16071 DIP("pmaddubsw %s,%s\n", dis_buf,
16072 nameMMXReg(gregLO3ofRM(modrm)));
16075 /* compute dV unsigned x sV signed */
16076 assign( sVoddsSX,
16077 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
16078 assign( sVevensSX,
16079 binop(Iop_SarN16x4,
16080 binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
16081 mkU8(8)) );
16082 assign( dVoddsZX,
16083 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
16084 assign( dVevensZX,
16085 binop(Iop_ShrN16x4,
16086 binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
16087 mkU8(8)) );
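/* Each 16-bit lane of the result is the signed-saturating sum of two
   products: (signed sV byte) * (unsigned dV byte), for the odd and the
   even byte of that lane. */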
16089 putMMXReg(
16090 gregLO3ofRM(modrm),
16091 binop(Iop_QAdd16Sx4,
16092 binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
16093 binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
16096 goto decode_success;
16098 break;
16100 case 0x08:
16101 case 0x09:
16102 case 0x0A:
16103 /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
16104 /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
16105 /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
16106 if (have66noF2noF3(pfx)
16107 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16108 IRTemp sV = newTemp(Ity_V128);
16109 IRTemp dV = newTemp(Ity_V128);
16110 IRTemp sHi = newTemp(Ity_I64);
16111 IRTemp sLo = newTemp(Ity_I64);
16112 IRTemp dHi = newTemp(Ity_I64);
16113 IRTemp dLo = newTemp(Ity_I64);
16114 const HChar* str = "???";
16115 Int laneszB = 0;
16117 switch (opc) {
16118 case 0x08: laneszB = 1; str = "b"; break;
16119 case 0x09: laneszB = 2; str = "w"; break;
16120 case 0x0A: laneszB = 4; str = "d"; break;
16121 default: vassert(0);
16124 modrm = getUChar(delta);
16125 assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16127 if (epartIsReg(modrm)) {
16128 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16129 delta += 1;
16130 DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
16131 nameXMMReg(gregOfRexRM(pfx,modrm)));
16132 } else {
16133 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16134 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16135 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16136 delta += alen;
16137 DIP("psign%s %s,%s\n", str, dis_buf,
16138 nameXMMReg(gregOfRexRM(pfx,modrm)));
16141 assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
16142 assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
16143 assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
16144 assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
16146 putXMMReg(
16147 gregOfRexRM(pfx,modrm),
16148 binop(Iop_64HLtoV128,
16149 dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
16150 dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
16153 goto decode_success;
16155 /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
16156 /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
16157 /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
16158 if (haveNo66noF2noF3(pfx) && sz == 4) {
16159 IRTemp sV = newTemp(Ity_I64);
16160 IRTemp dV = newTemp(Ity_I64);
16161 const HChar* str = "???";
16162 Int laneszB = 0;
16164 switch (opc) {
16165 case 0x08: laneszB = 1; str = "b"; break;
16166 case 0x09: laneszB = 2; str = "w"; break;
16167 case 0x0A: laneszB = 4; str = "d"; break;
16168 default: vassert(0);
16171 modrm = getUChar(delta);
16172 do_MMX_preamble();
16173 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16175 if (epartIsReg(modrm)) {
16176 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16177 delta += 1;
16178 DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16179 nameMMXReg(gregLO3ofRM(modrm)));
16180 } else {
16181 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16182 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16183 delta += alen;
16184 DIP("psign%s %s,%s\n", str, dis_buf,
16185 nameMMXReg(gregLO3ofRM(modrm)));
16188 putMMXReg(
16189 gregLO3ofRM(modrm),
16190 dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
16192 goto decode_success;
16194 break;
16196 case 0x0B:
16197 /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
16198 Scale (XMM) */
16199 if (have66noF2noF3(pfx)
16200 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16201 IRTemp sV = newTemp(Ity_V128);
16202 IRTemp dV = newTemp(Ity_V128);
16203 IRTemp sHi = newTemp(Ity_I64);
16204 IRTemp sLo = newTemp(Ity_I64);
16205 IRTemp dHi = newTemp(Ity_I64);
16206 IRTemp dLo = newTemp(Ity_I64);
16208 modrm = getUChar(delta);
16209 assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16211 if (epartIsReg(modrm)) {
16212 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16213 delta += 1;
16214 DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
16215 nameXMMReg(gregOfRexRM(pfx,modrm)));
16216 } else {
16217 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16218 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16219 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16220 delta += alen;
16221 DIP("pmulhrsw %s,%s\n", dis_buf,
16222 nameXMMReg(gregOfRexRM(pfx,modrm)));
16225 assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
16226 assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
16227 assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
16228 assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
16230 putXMMReg(
16231 gregOfRexRM(pfx,modrm),
16232 binop(Iop_64HLtoV128,
16233 dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
16234 dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
16237 goto decode_success;
16239 /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
16240 (MMX) */
16241 if (haveNo66noF2noF3(pfx) && sz == 4) {
16242 IRTemp sV = newTemp(Ity_I64);
16243 IRTemp dV = newTemp(Ity_I64);
16245 modrm = getUChar(delta);
16246 do_MMX_preamble();
16247 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16249 if (epartIsReg(modrm)) {
16250 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16251 delta += 1;
16252 DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
16253 nameMMXReg(gregLO3ofRM(modrm)));
16254 } else {
16255 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16256 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16257 delta += alen;
16258 DIP("pmulhrsw %s,%s\n", dis_buf,
16259 nameMMXReg(gregLO3ofRM(modrm)));
16262 putMMXReg(
16263 gregLO3ofRM(modrm),
16264 dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
16266 goto decode_success;
16268 break;
16270 case 0x1C:
16271 case 0x1D:
16272 case 0x1E:
16273 /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
16274 /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
16275 /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
16276 if (have66noF2noF3(pfx)
16277 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16278 IRTemp sV = newTemp(Ity_V128);
16279 const HChar* str = "???";
16280 Int laneszB = 0;
16282 switch (opc) {
16283 case 0x1C: laneszB = 1; str = "b"; break;
16284 case 0x1D: laneszB = 2; str = "w"; break;
16285 case 0x1E: laneszB = 4; str = "d"; break;
16286 default: vassert(0);
16289 modrm = getUChar(delta);
16290 if (epartIsReg(modrm)) {
16291 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16292 delta += 1;
16293 DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
16294 nameXMMReg(gregOfRexRM(pfx,modrm)));
16295 } else {
16296 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16297 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16298 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16299 delta += alen;
16300 DIP("pabs%s %s,%s\n", str, dis_buf,
16301 nameXMMReg(gregOfRexRM(pfx,modrm)));
16304 putXMMReg( gregOfRexRM(pfx,modrm),
16305 mkexpr(math_PABS_XMM(sV, laneszB)) );
16306 goto decode_success;
16308 /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
16309 /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
16310 /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
16311 if (haveNo66noF2noF3(pfx) && sz == 4) {
16312 IRTemp sV = newTemp(Ity_I64);
16313 const HChar* str = "???";
16314 Int laneszB = 0;
16316 switch (opc) {
16317 case 0x1C: laneszB = 1; str = "b"; break;
16318 case 0x1D: laneszB = 2; str = "w"; break;
16319 case 0x1E: laneszB = 4; str = "d"; break;
16320 default: vassert(0);
16323 modrm = getUChar(delta);
16324 do_MMX_preamble();
16326 if (epartIsReg(modrm)) {
16327 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16328 delta += 1;
16329 DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16330 nameMMXReg(gregLO3ofRM(modrm)));
16331 } else {
16332 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16333 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16334 delta += alen;
16335 DIP("pabs%s %s,%s\n", str, dis_buf,
16336 nameMMXReg(gregLO3ofRM(modrm)));
16339 putMMXReg( gregLO3ofRM(modrm),
16340 mkexpr(math_PABS_MMX( sV, laneszB )) );
16341 goto decode_success;
16343 break;
16345 default:
16346 break;
16350 //decode_failure:
16351 *decode_OK = False;
16352 return deltaIN;
16354 decode_success:
16355 *decode_OK = True;
16356 return delta;
16360 /*------------------------------------------------------------*/
16361 /*--- ---*/
16362 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3 ---*/
16363 /*--- ---*/
16364 /*------------------------------------------------------------*/
16366 __attribute__((noinline))
16367 static
16368 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
16369 const VexAbiInfo* vbi,
16370 Prefix pfx, Int sz, Long deltaIN )
16372 Long d64 = 0;
16373 IRTemp addr = IRTemp_INVALID;
16374 UChar modrm = 0;
16375 Int alen = 0;
16376 HChar dis_buf[50];
16378 *decode_OK = False;
16380 Long delta = deltaIN;
16381 UChar opc = getUChar(delta);
16382 delta++;
16383 switch (opc) {
16385 case 0x0F:
16386 /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
16387 if (have66noF2noF3(pfx)
16388 && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16389 IRTemp sV = newTemp(Ity_V128);
16390 IRTemp dV = newTemp(Ity_V128);
16392 modrm = getUChar(delta);
16393 assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16395 if (epartIsReg(modrm)) {
16396 assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16397 d64 = (Long)getUChar(delta+1);
16398 delta += 1+1;
16399 DIP("palignr $%lld,%s,%s\n", d64,
16400 nameXMMReg(eregOfRexRM(pfx,modrm)),
16401 nameXMMReg(gregOfRexRM(pfx,modrm)));
16402 } else {
16403 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
16404 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16405 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16406 d64 = (Long)getUChar(delta+alen);
16407 delta += alen+1;
16408 DIP("palignr $%lld,%s,%s\n", d64,
16409 dis_buf,
16410 nameXMMReg(gregOfRexRM(pfx,modrm)));
16413 IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
16414 putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
16415 goto decode_success;
16417 /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
16418 if (haveNo66noF2noF3(pfx) && sz == 4) {
16419 IRTemp sV = newTemp(Ity_I64);
16420 IRTemp dV = newTemp(Ity_I64);
16421 IRTemp res = newTemp(Ity_I64);
16423 modrm = getUChar(delta);
16424 do_MMX_preamble();
16425 assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16427 if (epartIsReg(modrm)) {
16428 assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16429 d64 = (Long)getUChar(delta+1);
16430 delta += 1+1;
16431 DIP("palignr $%lld,%s,%s\n", d64,
16432 nameMMXReg(eregLO3ofRM(modrm)),
16433 nameMMXReg(gregLO3ofRM(modrm)));
16434 } else {
16435 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
16436 assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16437 d64 = (Long)getUChar(delta+alen);
16438 delta += alen+1;
16439 DIP("palignr $%lld%s,%s\n", d64,
16440 dis_buf,
16441 nameMMXReg(gregLO3ofRM(modrm)));
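/* Extract 64 bits starting at byte offset d64 from the 128-bit
   concatenation dV:sV (dV in the high half); offsets >= 16 give zero. */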
16444 if (d64 == 0) {
16445 assign( res, mkexpr(sV) );
16447 else if (d64 >= 1 && d64 <= 7) {
16448 assign(res,
16449 binop(Iop_Or64,
16450 binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
16451 binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
16452 )));
16454 else if (d64 == 8) {
16455 assign( res, mkexpr(dV) );
16457 else if (d64 >= 9 && d64 <= 15) {
16458 assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
16460 else if (d64 >= 16 && d64 <= 255) {
16461 assign( res, mkU64(0) );
16463 else
16464 vassert(0);
16466 putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
16467 goto decode_success;
16469 break;
16471 default:
16472 break;
16476 //decode_failure:
16477 *decode_OK = False;
16478 return deltaIN;
16480 decode_success:
16481 *decode_OK = True;
16482 return delta;
16486 /*------------------------------------------------------------*/
16487 /*--- ---*/
16488 /*--- Top-level SSE4: dis_ESC_0F__SSE4 ---*/
16489 /*--- ---*/
16490 /*------------------------------------------------------------*/
16492 __attribute__((noinline))
16493 static
16494 Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
16495 const VexArchInfo* archinfo,
16496 const VexAbiInfo* vbi,
16497 Prefix pfx, Int sz, Long deltaIN )
16499 IRTemp addr = IRTemp_INVALID;
16500 IRType ty = Ity_INVALID;
16501 UChar modrm = 0;
16502 Int alen = 0;
16503 HChar dis_buf[50];
16505 *decode_OK = False;
16507 Long delta = deltaIN;
16508 UChar opc = getUChar(delta);
16509 delta++;
16510 switch (opc) {
16512 case 0xB8:
16513 /* F3 0F B8 = POPCNT{W,L,Q}
16514 Count the number of 1 bits in a register. */
16516 if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
16517 && (sz == 2 || sz == 4 || sz == 8)) {
16518 /*IRType*/ ty = szToITy(sz);
16519 IRTemp src = newTemp(ty);
16520 modrm = getUChar(delta);
16521 if (epartIsReg(modrm)) {
16522 assign(src, getIRegE(sz, pfx, modrm));
16523 delta += 1;
16524 DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16525 nameIRegG(sz, pfx, modrm));
16526 } else {
16527 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16528 assign(src, loadLE(ty, mkexpr(addr)));
16529 delta += alen;
16530 DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
16531 nameIRegG(sz, pfx, modrm));
16534 IRTemp result = gen_POPCOUNT(ty, src);
16535 putIRegG(sz, pfx, modrm, mkexpr(result));
16537 // Update flags. This is pretty lame .. perhaps can do better
16538 // if this turns out to be performance critical.
16539 // O S A C P are cleared. Z is set if SRC == 0.
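// With AMD64G_CC_OP_COPY the flag bits are taken directly from CC_DEP1,
// so only Z can become set here.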
16540 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
16541 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16542 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16543 stmt( IRStmt_Put( OFFB_CC_DEP1,
16544 binop(Iop_Shl64,
16545 unop(Iop_1Uto64,
16546 binop(Iop_CmpEQ64,
16547 widenUto64(mkexpr(src)),
16548 mkU64(0))),
16549 mkU8(AMD64G_CC_SHIFT_Z))));
16551 goto decode_success;
16553 break;
16555 case 0xBC:
16556 /* F3 0F BC -- TZCNT (count trailing zeroes). A BMI extension,
16557 which we can only decode if we're sure this is a BMI1 capable cpu
16558 that supports TZCNT, since otherwise it's BSF, which behaves
16559 differently on zero source. */
16560 if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
16561 && (sz == 2 || sz == 4 || sz == 8)
16562 && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
16563 /*IRType*/ ty = szToITy(sz);
16564 IRTemp src = newTemp(ty);
16565 modrm = getUChar(delta);
16566 if (epartIsReg(modrm)) {
16567 assign(src, getIRegE(sz, pfx, modrm));
16568 delta += 1;
16569 DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16570 nameIRegG(sz, pfx, modrm));
16571 } else {
16572 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16573 assign(src, loadLE(ty, mkexpr(addr)));
16574 delta += alen;
16575 DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16576 nameIRegG(sz, pfx, modrm));
16579 IRTemp res = gen_TZCNT(ty, src);
16580 putIRegG(sz, pfx, modrm, mkexpr(res));
16582 // Update flags. This is pretty lame .. perhaps can do better
16583 // if this turns out to be performance critical.
16584 // O S A P are cleared. Z is set if RESULT == 0.
16585 // C is set if SRC is zero.
16586 IRTemp src64 = newTemp(Ity_I64);
16587 IRTemp res64 = newTemp(Ity_I64);
16588 assign(src64, widenUto64(mkexpr(src)));
16589 assign(res64, widenUto64(mkexpr(res)));
16591 IRTemp oszacp = newTemp(Ity_I64);
16592 assign(
16593 oszacp,
16594 binop(Iop_Or64,
16595 binop(Iop_Shl64,
16596 unop(Iop_1Uto64,
16597 binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16598 mkU8(AMD64G_CC_SHIFT_Z)),
16599 binop(Iop_Shl64,
16600 unop(Iop_1Uto64,
16601 binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16602 mkU8(AMD64G_CC_SHIFT_C))
16606 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
16607 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16608 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16609 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16611 goto decode_success;
16613 break;
16615 case 0xBD:
16616 /* F3 0F BD -- LZCNT (count leading zeroes). An AMD extension,
16617 which we can only decode if we're sure this is an AMD cpu
16618 that supports LZCNT, since otherwise it's BSR, which behaves
16619 differently. Bizarrely, my Sandy Bridge also accepts these
16620 instructions but produces different results. */
16621 if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
16622 && (sz == 2 || sz == 4 || sz == 8)
16623 && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
16624 /*IRType*/ ty = szToITy(sz);
16625 IRTemp src = newTemp(ty);
16626 modrm = getUChar(delta);
16627 if (epartIsReg(modrm)) {
16628 assign(src, getIRegE(sz, pfx, modrm));
16629 delta += 1;
16630 DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16631 nameIRegG(sz, pfx, modrm));
16632 } else {
16633 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16634 assign(src, loadLE(ty, mkexpr(addr)));
16635 delta += alen;
16636 DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16637 nameIRegG(sz, pfx, modrm));
16640 IRTemp res = gen_LZCNT(ty, src);
16641 putIRegG(sz, pfx, modrm, mkexpr(res));
16643 // Update flags. This is pretty lame .. perhaps can do better
16644 // if this turns out to be performance critical.
16645 // O S A P are cleared. Z is set if RESULT == 0.
16646 // C is set if SRC is zero.
16647 IRTemp src64 = newTemp(Ity_I64);
16648 IRTemp res64 = newTemp(Ity_I64);
16649 assign(src64, widenUto64(mkexpr(src)));
16650 assign(res64, widenUto64(mkexpr(res)));
16652 IRTemp oszacp = newTemp(Ity_I64);
16653 assign(
16654 oszacp,
16655 binop(Iop_Or64,
16656 binop(Iop_Shl64,
16657 unop(Iop_1Uto64,
16658 binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16659 mkU8(AMD64G_CC_SHIFT_Z)),
16660 binop(Iop_Shl64,
16661 unop(Iop_1Uto64,
16662 binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16663 mkU8(AMD64G_CC_SHIFT_C))
16667 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
16668 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16669 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16670 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16672 goto decode_success;
16674 break;
16676 default:
16677 break;
16681 //decode_failure:
16682 *decode_OK = False;
16683 return deltaIN;
16685 decode_success:
16686 *decode_OK = True;
16687 return delta;
16691 /*------------------------------------------------------------*/
16692 /*--- ---*/
16693 /*--- Top-level SSE4: dis_ESC_0F38__SSE4 ---*/
16694 /*--- ---*/
16695 /*------------------------------------------------------------*/
16697 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
16698 IRTemp vec0/*controlling mask*/,
16699 UInt gran, IROp opSAR )
16701 /* The tricky bit is to convert vec0 into a suitable mask, by
16702 copying the most significant bit of each lane into all positions
16703 in the lane. */
16704 IRTemp sh = newTemp(Ity_I8);
16705 assign(sh, mkU8(8 * gran - 1));
16707 IRTemp mask = newTemp(Ity_V128);
16708 assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
16710 IRTemp notmask = newTemp(Ity_V128);
16711 assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
16713 IRTemp res = newTemp(Ity_V128);
16714 assign(res, binop(Iop_OrV128,
16715 binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
16716 binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
16717 return res;
16720 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
16721 IRTemp vec0/*controlling mask*/,
16722 UInt gran, IROp opSAR128 )
16724 /* The tricky bit is to convert vec0 into a suitable mask, by
16725 copying the most significant bit of each lane into all positions
16726 in the lane. */
16727 IRTemp sh = newTemp(Ity_I8);
16728 assign(sh, mkU8(8 * gran - 1));
16730 IRTemp vec0Hi = IRTemp_INVALID;
16731 IRTemp vec0Lo = IRTemp_INVALID;
16732 breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
16734 IRTemp mask = newTemp(Ity_V256);
16735 assign(mask, binop(Iop_V128HLtoV256,
16736 binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
16737 binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
16739 IRTemp notmask = newTemp(Ity_V256);
16740 assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
16742 IRTemp res = newTemp(Ity_V256);
16743 assign(res, binop(Iop_OrV256,
16744 binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
16745 binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
16746 return res;
16749 static Long dis_VBLENDV_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
16750 const HChar *name, UInt gran, IROp opSAR )
16752 IRTemp addr = IRTemp_INVALID;
16753 Int alen = 0;
16754 HChar dis_buf[50];
16755 UChar modrm = getUChar(delta);
16756 UInt rG = gregOfRexRM(pfx, modrm);
16757 UInt rV = getVexNvvvv(pfx);
16758 UInt rIS4 = 0xFF; /* invalid */
16759 IRTemp vecE = newTemp(Ity_V128);
16760 IRTemp vecV = newTemp(Ity_V128);
16761 IRTemp vecIS4 = newTemp(Ity_V128);
16762 if (epartIsReg(modrm)) {
16763 delta++;
16764 UInt rE = eregOfRexRM(pfx, modrm);
16765 assign(vecE, getXMMReg(rE));
16766 UChar ib = getUChar(delta);
16767 rIS4 = (ib >> 4) & 0xF;
16768 DIP("%s %s,%s,%s,%s\n",
16769 name, nameXMMReg(rIS4), nameXMMReg(rE),
16770 nameXMMReg(rV), nameXMMReg(rG));
16771 } else {
16772 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16773 delta += alen;
16774 assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
16775 UChar ib = getUChar(delta);
16776 rIS4 = (ib >> 4) & 0xF;
16777 DIP("%s %s,%s,%s,%s\n",
16778 name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
16780 delta++;
16781 assign(vecV, getXMMReg(rV));
16782 assign(vecIS4, getXMMReg(rIS4));
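/* Lanes whose top bit is set in the is4 register take their value from
   E; the remaining lanes come from V. */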
16783 IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
16784 putYMMRegLoAndZU( rG, mkexpr(res) );
16785 return delta;
16788 static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
16789 const HChar *name, UInt gran, IROp opSAR128 )
16791 IRTemp addr = IRTemp_INVALID;
16792 Int alen = 0;
16793 HChar dis_buf[50];
16794 UChar modrm = getUChar(delta);
16795 UInt rG = gregOfRexRM(pfx, modrm);
16796 UInt rV = getVexNvvvv(pfx);
16797 UInt rIS4 = 0xFF; /* invalid */
16798 IRTemp vecE = newTemp(Ity_V256);
16799 IRTemp vecV = newTemp(Ity_V256);
16800 IRTemp vecIS4 = newTemp(Ity_V256);
16801 if (epartIsReg(modrm)) {
16802 delta++;
16803 UInt rE = eregOfRexRM(pfx, modrm);
16804 assign(vecE, getYMMReg(rE));
16805 UChar ib = getUChar(delta);
16806 rIS4 = (ib >> 4) & 0xF;
16807 DIP("%s %s,%s,%s,%s\n",
16808 name, nameYMMReg(rIS4), nameYMMReg(rE),
16809 nameYMMReg(rV), nameYMMReg(rG));
16810 } else {
16811 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16812 delta += alen;
16813 assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
16814 UChar ib = getUChar(delta);
16815 rIS4 = (ib >> 4) & 0xF;
16816 DIP("%s %s,%s,%s,%s\n",
16817 name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
16819 delta++;
16820 assign(vecV, getYMMReg(rV));
16821 assign(vecIS4, getYMMReg(rIS4));
16822 IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
16823 putYMMReg( rG, mkexpr(res) );
16824 return delta;
16827 static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
16829 /* Set Z=1 iff (vecE & vecG) == 0
16830 Set C=1 iff (vecE & not vecG) == 0
16833 /* andV, andnV: vecE & vecG, vecE and not(vecG) */
16835 /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
16836 and bottom 64-bits together. It relies on this trick:
16838 InterleaveLO64x2([a,b],[c,d]) == [b,d] hence
16840 InterleaveLO64x2([a,b],[a,b]) == [b,b] and similarly
16841 InterleaveHI64x2([a,b],[a,b]) == [a,a]
16843 and so the OR of the above 2 exprs produces
16844 [a OR b, a OR b], from which we simply take the lower half.
16846 IRTemp and64 = newTemp(Ity_I64);
16847 IRTemp andn64 = newTemp(Ity_I64);
16849 assign(and64,
16850 unop(Iop_V128to64,
16851 binop(Iop_OrV128,
16852 binop(Iop_InterleaveLO64x2,
16853 mkexpr(andV), mkexpr(andV)),
16854 binop(Iop_InterleaveHI64x2,
16855 mkexpr(andV), mkexpr(andV)))));
16857 assign(andn64,
16858 unop(Iop_V128to64,
16859 binop(Iop_OrV128,
16860 binop(Iop_InterleaveLO64x2,
16861 mkexpr(andnV), mkexpr(andnV)),
16862 binop(Iop_InterleaveHI64x2,
16863 mkexpr(andnV), mkexpr(andnV)))));
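   /* Scalar sketch (illustrative only) of the reduction above: viewing vecE
      and vecG as 64-bit halves (eHi,eLo) and (gHi,gLo),
         and64  == (eHi & gHi)  | (eLo & gLo)
         andn64 == (eHi & ~gHi) | (eLo & ~gLo)
      so and64 == 0 exactly when vecE & vecG == 0 (the Z condition) and
      andn64 == 0 exactly when vecE & ~vecG == 0 (the C condition). */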
16865 IRTemp z64 = newTemp(Ity_I64);
16866 IRTemp c64 = newTemp(Ity_I64);
16867 if (sign == 64) {
16868      /* When only interested in the most significant bit, just shift
16869         arithmetically right and invert (bitwise NOT). */
16870 assign(z64,
16871 unop(Iop_Not64,
16872 binop(Iop_Sar64, mkexpr(and64), mkU8(63))));
16874 assign(c64,
16875 unop(Iop_Not64,
16876 binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
16877 } else {
16878 if (sign == 32) {
16879 /* When interested in bit 31 and bit 63, mask those bits and
16880 fallthrough into the PTEST handling. */
16881 IRTemp t0 = newTemp(Ity_I64);
16882 IRTemp t1 = newTemp(Ity_I64);
16883 IRTemp t2 = newTemp(Ity_I64);
16884 assign(t0, mkU64(0x8000000080000000ULL));
16885 assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
16886 assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
16887 and64 = t1;
16888 andn64 = t2;
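         /* After the OR-reduction above, bit 31 of and64 is the OR of the
            sign bits of 32-bit lanes 0 and 2 of andV, and bit 63 the OR of
            lanes 1 and 3; so the masked value is zero iff every 32-bit lane
            of vecE & vecG has a clear sign bit, which is what VTESTPS needs
            for Z (and similarly andn64 for C). */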
16890 /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
16891 slice out the Z and C bits conveniently. We use the standard
16892 trick all-zeroes -> all-zeroes, anything-else -> all-ones
16893 done by "(x | -x) >>s (word-size - 1)".
16895 assign(z64,
16896 unop(Iop_Not64,
16897 binop(Iop_Sar64,
16898 binop(Iop_Or64,
16899 binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
16900 mkexpr(and64)), mkU8(63))));
16902 assign(c64,
16903 unop(Iop_Not64,
16904 binop(Iop_Sar64,
16905 binop(Iop_Or64,
16906 binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
16907 mkexpr(andn64)), mkU8(63))));
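      /* For example (illustrative): if and64 == 0 then (0 | -0) >>s 63 == 0
         and the Not64 makes z64 all-ones, so Z gets set below; for any
         nonzero and64, bit 63 of (x | -x) is set, the arithmetic shift gives
         all-ones and z64 becomes zero. */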
16910 /* And finally, slice out the Z and C flags and set the flags
16911 thunk to COPY for them. OSAP are set to zero. */
16912 IRTemp newOSZACP = newTemp(Ity_I64);
16913 assign(newOSZACP,
16914 binop(Iop_Or64,
16915 binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
16916 binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));
16918 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
16919 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
16920 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16921 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16925 /* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
16926 sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
16927 static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
16928 Long delta, Bool isAvx, Int sign )
16930 IRTemp addr = IRTemp_INVALID;
16931 Int alen = 0;
16932 HChar dis_buf[50];
16933 UChar modrm = getUChar(delta);
16934 UInt rG = gregOfRexRM(pfx, modrm);
16935 IRTemp vecE = newTemp(Ity_V128);
16936 IRTemp vecG = newTemp(Ity_V128);
16938 if ( epartIsReg(modrm) ) {
16939 UInt rE = eregOfRexRM(pfx, modrm);
16940 assign(vecE, getXMMReg(rE));
16941 delta += 1;
16942 DIP( "%s%stest%s %s,%s\n",
16943 isAvx ? "v" : "", sign == 0 ? "p" : "",
16944 sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16945 nameXMMReg(rE), nameXMMReg(rG) );
16946 } else {
16947 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16948 if (!isAvx)
16949 gen_SIGNAL_if_not_16_aligned( vbi, addr );
16950 assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
16951 delta += alen;
16952 DIP( "%s%stest%s %s,%s\n",
16953 isAvx ? "v" : "", sign == 0 ? "p" : "",
16954 sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16955 dis_buf, nameXMMReg(rG) );
16958 assign(vecG, getXMMReg(rG));
16960 /* Set Z=1 iff (vecE & vecG) == 0
16961 Set C=1 iff (vecE & not vecG) == 0
16964 /* andV, andnV: vecE & vecG, vecE and not(vecG) */
16965 IRTemp andV = newTemp(Ity_V128);
16966 IRTemp andnV = newTemp(Ity_V128);
16967 assign(andV, binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
16968 assign(andnV, binop(Iop_AndV128,
16969 mkexpr(vecE),
16970 binop(Iop_XorV128, mkexpr(vecG),
16971 mkV128(0xFFFF))));
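   /* (Note: mkV128 takes a 16-bit mask with one bit per byte, so
      mkV128(0xFFFF) is the all-ones 128-bit constant and the Xor above
      computes not(vecG).) */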
16973 finish_xTESTy ( andV, andnV, sign );
16974 return delta;
16978 /* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
16979 sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
16980 static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx,
16981 Long delta, Int sign )
16983 IRTemp addr = IRTemp_INVALID;
16984 Int alen = 0;
16985 HChar dis_buf[50];
16986 UChar modrm = getUChar(delta);
16987 UInt rG = gregOfRexRM(pfx, modrm);
16988 IRTemp vecE = newTemp(Ity_V256);
16989 IRTemp vecG = newTemp(Ity_V256);
16991 if ( epartIsReg(modrm) ) {
16992 UInt rE = eregOfRexRM(pfx, modrm);
16993 assign(vecE, getYMMReg(rE));
16994 delta += 1;
16995 DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
16996 sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16997 nameYMMReg(rE), nameYMMReg(rG) );
16998 } else {
16999 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17000 assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
17001 delta += alen;
17002 DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
17003 sign == 0 ? "" : sign == 32 ? "ps" : "pd",
17004 dis_buf, nameYMMReg(rG) );
17007 assign(vecG, getYMMReg(rG));
17009 /* Set Z=1 iff (vecE & vecG) == 0
17010 Set C=1 iff (vecE & not vecG) == 0
17013 /* andV, andnV: vecE & vecG, vecE and not(vecG) */
17014 IRTemp andV = newTemp(Ity_V256);
17015 IRTemp andnV = newTemp(Ity_V256);
17016 assign(andV, binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
17017 assign(andnV, binop(Iop_AndV256,
17018 mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
17020 IRTemp andVhi = IRTemp_INVALID;
17021 IRTemp andVlo = IRTemp_INVALID;
17022 IRTemp andnVhi = IRTemp_INVALID;
17023 IRTemp andnVlo = IRTemp_INVALID;
17024 breakupV256toV128s( andV, &andVhi, &andVlo );
17025 breakupV256toV128s( andnV, &andnVhi, &andnVlo );
17027 IRTemp andV128 = newTemp(Ity_V128);
17028 IRTemp andnV128 = newTemp(Ity_V128);
17029 assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
17030 assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
17032 finish_xTESTy ( andV128, andnV128, sign );
17033 return delta;
17037 /* Handles 128 and 256 bit versions of VCVTPH2PS. */
17038 static Long dis_VCVTPH2PS ( const VexAbiInfo* vbi, Prefix pfx,
17039 Long delta, Bool is256bit )
17041    /* This is a width-doubling load or reg-reg move that does conversion
17042       on the transferred data. */
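   /* Concretely: the 128-bit form reads 4 half-precision values (64 bits)
      and widens them to 4 x F32; the 256-bit form reads 8 halves (128 bits)
      and widens them to 8 x F32 -- hence Iop_F16toF32x4 / Iop_F16toF32x8
      below. */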
17043 UChar modrm = getUChar(delta);
17044 UInt rG = gregOfRexRM(pfx, modrm);
17045 IRTemp srcE = newTemp(is256bit ? Ity_V128 : Ity_I64);
17047 if (epartIsReg(modrm)) {
17048 UInt rE = eregOfRexRM(pfx, modrm);
17049 assign(srcE, is256bit ? unop(Iop_V256toV128_0, getYMMReg(rE))
17050 : unop(Iop_V128to64, getXMMReg(rE)));
17051 delta += 1;
17052 DIP("vcvtph2ps %s,%s\n", nameXMMReg(rE),
17053 (is256bit ? nameYMMReg: nameXMMReg)(rG));
17054 } else {
17055 Int alen = 0;
17056 HChar dis_buf[50];
17057 IRTemp addr = disAMode(&alen, vbi, pfx, delta, dis_buf, 0);
17058       // I don't think we need an alignment check here (not 100% sure, though).
17059 assign(srcE, loadLE(is256bit ? Ity_V128 : Ity_I64, mkexpr(addr)));
17060 delta += alen;
17061 DIP( "vcvtph2ps %s,%s\n", dis_buf,
17062 (is256bit ? nameYMMReg: nameXMMReg)(rG));
17065 IRExpr* res = unop(is256bit ? Iop_F16toF32x8 : Iop_F16toF32x4, mkexpr(srcE));
17066 (is256bit ? putYMMReg : putYMMRegLoAndZU)(rG, res);
17068 return delta;
17072 /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
17073 static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx,
17074 Long delta, Bool isAvx, Bool xIsZ )
17076 IRTemp addr = IRTemp_INVALID;
17077 Int alen = 0;
17078 HChar dis_buf[50];
17079 IRTemp srcVec = newTemp(Ity_V128);
17080 UChar modrm = getUChar(delta);
17081 const HChar* mbV = isAvx ? "v" : "";
17082 const HChar how = xIsZ ? 'z' : 's';
17083 UInt rG = gregOfRexRM(pfx, modrm);
17084 if ( epartIsReg(modrm) ) {
17085 UInt rE = eregOfRexRM(pfx, modrm);
17086 assign( srcVec, getXMMReg(rE) );
17087 delta += 1;
17088 DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17089 } else {
17090 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17091 assign( srcVec,
17092 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17093 delta += alen;
17094 DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17097 IRExpr* res
17098 = xIsZ /* do math for either zero or sign extend */
17099 ? binop( Iop_InterleaveLO8x16,
17100 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
17101 : binop( Iop_SarN16x8,
17102 binop( Iop_ShlN16x8,
17103 binop( Iop_InterleaveLO8x16,
17104 IRExpr_Const( IRConst_V128(0) ),
17105 mkexpr(srcVec) ),
17106 mkU8(8) ),
17107 mkU8(8) );
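   /* Here InterleaveLO8x16 with a zero vector places each source byte in the
      low half of a 16-bit lane, i.e. a zero extension; the Shl-by-8 /
      Sar-by-8 pair then turns that into a sign extension when needed. */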
17109 (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17111 return delta;
17115 /* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
17116 static Long dis_PMOVxXBW_256 ( const VexAbiInfo* vbi, Prefix pfx,
17117 Long delta, Bool xIsZ )
17119 IRTemp addr = IRTemp_INVALID;
17120 Int alen = 0;
17121 HChar dis_buf[50];
17122 IRTemp srcVec = newTemp(Ity_V128);
17123 UChar modrm = getUChar(delta);
17124 UChar how = xIsZ ? 'z' : 's';
17125 UInt rG = gregOfRexRM(pfx, modrm);
17126 if ( epartIsReg(modrm) ) {
17127 UInt rE = eregOfRexRM(pfx, modrm);
17128 assign( srcVec, getXMMReg(rE) );
17129 delta += 1;
17130 DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17131 } else {
17132 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17133 assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
17134 delta += alen;
17135 DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17138 /* First do zero extend. */
17139 IRExpr* res
17140 = binop( Iop_V128HLtoV256,
17141 binop( Iop_InterleaveHI8x16,
17142 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
17143 binop( Iop_InterleaveLO8x16,
17144 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
17145 /* And if needed sign extension as well. */
17146 if (!xIsZ)
17147 res = binop( Iop_SarN16x16,
17148 binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
17150 putYMMReg ( rG, res );
17152 return delta;
17156 static Long dis_PMOVxXWD_128 ( const VexAbiInfo* vbi, Prefix pfx,
17157 Long delta, Bool isAvx, Bool xIsZ )
17159 IRTemp addr = IRTemp_INVALID;
17160 Int alen = 0;
17161 HChar dis_buf[50];
17162 IRTemp srcVec = newTemp(Ity_V128);
17163 UChar modrm = getUChar(delta);
17164 const HChar* mbV = isAvx ? "v" : "";
17165 const HChar how = xIsZ ? 'z' : 's';
17166 UInt rG = gregOfRexRM(pfx, modrm);
17168 if ( epartIsReg(modrm) ) {
17169 UInt rE = eregOfRexRM(pfx, modrm);
17170 assign( srcVec, getXMMReg(rE) );
17171 delta += 1;
17172 DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17173 } else {
17174 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17175 assign( srcVec,
17176 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17177 delta += alen;
17178 DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17181 IRExpr* res
17182 = binop( Iop_InterleaveLO16x8,
17183 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
17184 if (!xIsZ)
17185 res = binop(Iop_SarN32x4,
17186 binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
17188 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17189 ( gregOfRexRM(pfx, modrm), res );
17191 return delta;
17195 static Long dis_PMOVxXWD_256 ( const VexAbiInfo* vbi, Prefix pfx,
17196 Long delta, Bool xIsZ )
17198 IRTemp addr = IRTemp_INVALID;
17199 Int alen = 0;
17200 HChar dis_buf[50];
17201 IRTemp srcVec = newTemp(Ity_V128);
17202 UChar modrm = getUChar(delta);
17203 UChar how = xIsZ ? 'z' : 's';
17204 UInt rG = gregOfRexRM(pfx, modrm);
17206 if ( epartIsReg(modrm) ) {
17207 UInt rE = eregOfRexRM(pfx, modrm);
17208 assign( srcVec, getXMMReg(rE) );
17209 delta += 1;
17210 DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17211 } else {
17212 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17213 assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
17214 delta += alen;
17215 DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17218 IRExpr* res
17219 = binop( Iop_V128HLtoV256,
17220 binop( Iop_InterleaveHI16x8,
17221 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
17222 binop( Iop_InterleaveLO16x8,
17223 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
17224 if (!xIsZ)
17225 res = binop(Iop_SarN32x8,
17226 binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
17228 putYMMReg ( rG, res );
17230 return delta;
17234 static Long dis_PMOVSXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17235 Long delta, Bool isAvx )
17237 IRTemp addr = IRTemp_INVALID;
17238 Int alen = 0;
17239 HChar dis_buf[50];
17240 IRTemp srcBytes = newTemp(Ity_I32);
17241 UChar modrm = getUChar(delta);
17242 const HChar* mbV = isAvx ? "v" : "";
17243 UInt rG = gregOfRexRM(pfx, modrm);
17245 if ( epartIsReg( modrm ) ) {
17246 UInt rE = eregOfRexRM(pfx, modrm);
17247 assign( srcBytes, getXMMRegLane32( rE, 0 ) );
17248 delta += 1;
17249 DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17250 } else {
17251 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17252 assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
17253 delta += alen;
17254 DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17257 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17258 ( rG, binop( Iop_64HLtoV128,
17259 unop( Iop_16Sto64,
17260 unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
17261 unop( Iop_16Sto64,
17262 unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
17263 return delta;
17267 static Long dis_PMOVSXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
17269 IRTemp addr = IRTemp_INVALID;
17270 Int alen = 0;
17271 HChar dis_buf[50];
17272 IRTemp srcBytes = newTemp(Ity_I64);
17273 UChar modrm = getUChar(delta);
17274 UInt rG = gregOfRexRM(pfx, modrm);
17275 IRTemp s3, s2, s1, s0;
17276 s3 = s2 = s1 = s0 = IRTemp_INVALID;
17278 if ( epartIsReg( modrm ) ) {
17279 UInt rE = eregOfRexRM(pfx, modrm);
17280 assign( srcBytes, getXMMRegLane64( rE, 0 ) );
17281 delta += 1;
17282 DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17283 } else {
17284 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17285 assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
17286 delta += alen;
17287 DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
17290 breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
17291 putYMMReg( rG, binop( Iop_V128HLtoV256,
17292 binop( Iop_64HLtoV128,
17293 unop( Iop_16Sto64, mkexpr(s3) ),
17294 unop( Iop_16Sto64, mkexpr(s2) ) ),
17295 binop( Iop_64HLtoV128,
17296 unop( Iop_16Sto64, mkexpr(s1) ),
17297 unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
17298 return delta;
17302 static Long dis_PMOVZXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17303 Long delta, Bool isAvx )
17305 IRTemp addr = IRTemp_INVALID;
17306 Int alen = 0;
17307 HChar dis_buf[50];
17308 IRTemp srcVec = newTemp(Ity_V128);
17309 UChar modrm = getUChar(delta);
17310 const HChar* mbV = isAvx ? "v" : "";
17311 UInt rG = gregOfRexRM(pfx, modrm);
17313 if ( epartIsReg( modrm ) ) {
17314 UInt rE = eregOfRexRM(pfx, modrm);
17315 assign( srcVec, getXMMReg(rE) );
17316 delta += 1;
17317 DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17318 } else {
17319 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17320 assign( srcVec,
17321 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
17322 delta += alen;
17323 DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17326 IRTemp zeroVec = newTemp( Ity_V128 );
17327 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17329 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17330 ( rG, binop( Iop_InterleaveLO16x8,
17331 mkexpr(zeroVec),
17332 binop( Iop_InterleaveLO16x8,
17333 mkexpr(zeroVec), mkexpr(srcVec) ) ) );
17334 return delta;
17338 static Long dis_PMOVZXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17339 Long delta )
17341 IRTemp addr = IRTemp_INVALID;
17342 Int alen = 0;
17343 HChar dis_buf[50];
17344 IRTemp srcVec = newTemp(Ity_V128);
17345 UChar modrm = getUChar(delta);
17346 UInt rG = gregOfRexRM(pfx, modrm);
17348 if ( epartIsReg( modrm ) ) {
17349 UInt rE = eregOfRexRM(pfx, modrm);
17350 assign( srcVec, getXMMReg(rE) );
17351 delta += 1;
17352 DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17353 } else {
17354 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17355 assign( srcVec,
17356 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17357 delta += alen;
17358 DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
17361 IRTemp zeroVec = newTemp( Ity_V128 );
17362 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17364 putYMMReg( rG, binop( Iop_V128HLtoV256,
17365 binop( Iop_InterleaveHI16x8,
17366 mkexpr(zeroVec),
17367 binop( Iop_InterleaveLO16x8,
17368 mkexpr(zeroVec), mkexpr(srcVec) ) ),
17369 binop( Iop_InterleaveLO16x8,
17370 mkexpr(zeroVec),
17371 binop( Iop_InterleaveLO16x8,
17372 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
17373 return delta;
17377 /* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
17378 static Long dis_PMOVxXDQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17379 Long delta, Bool isAvx, Bool xIsZ )
17381 IRTemp addr = IRTemp_INVALID;
17382 Int alen = 0;
17383 HChar dis_buf[50];
17384 IRTemp srcI64 = newTemp(Ity_I64);
17385 IRTemp srcVec = newTemp(Ity_V128);
17386 UChar modrm = getUChar(delta);
17387 const HChar* mbV = isAvx ? "v" : "";
17388 const HChar how = xIsZ ? 'z' : 's';
17389 UInt rG = gregOfRexRM(pfx, modrm);
17390 /* Compute both srcI64 -- the value to expand -- and srcVec -- same
17391 thing in a V128, with arbitrary junk in the top 64 bits. Use
17392 one or both of them and let iropt clean up afterwards (as
17393 usual). */
17394 if ( epartIsReg(modrm) ) {
17395 UInt rE = eregOfRexRM(pfx, modrm);
17396 assign( srcVec, getXMMReg(rE) );
17397 assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
17398 delta += 1;
17399 DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17400 } else {
17401 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17402 assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
17403 assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
17404 delta += alen;
17405 DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17408 IRExpr* res
17409 = xIsZ /* do math for either zero or sign extend */
17410 ? binop( Iop_InterleaveLO32x4,
17411 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
17412 : binop( Iop_64HLtoV128,
17413 unop( Iop_32Sto64,
17414 unop( Iop_64HIto32, mkexpr(srcI64) ) ),
17415 unop( Iop_32Sto64,
17416 unop( Iop_64to32, mkexpr(srcI64) ) ) );
17418 (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17420 return delta;
17424 /* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. */
17425 static Long dis_PMOVxXDQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17426 Long delta, Bool xIsZ )
17428 IRTemp addr = IRTemp_INVALID;
17429 Int alen = 0;
17430 HChar dis_buf[50];
17431 IRTemp srcVec = newTemp(Ity_V128);
17432 UChar modrm = getUChar(delta);
17433 UChar how = xIsZ ? 'z' : 's';
17434 UInt rG = gregOfRexRM(pfx, modrm);
17435    /* Compute srcVec, the V128 value to expand.  (Unlike the 128-bit
17436       case there is no separate srcI64 here.)  Use it below and let
17437       iropt clean up afterwards (as usual). */
17439 if ( epartIsReg(modrm) ) {
17440 UInt rE = eregOfRexRM(pfx, modrm);
17441 assign( srcVec, getXMMReg(rE) );
17442 delta += 1;
17443 DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17444 } else {
17445 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17446 assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
17447 delta += alen;
17448 DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17451 IRExpr* res;
17452 if (xIsZ)
17453 res = binop( Iop_V128HLtoV256,
17454 binop( Iop_InterleaveHI32x4,
17455 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
17456 binop( Iop_InterleaveLO32x4,
17457 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
17458 else {
17459 IRTemp s3, s2, s1, s0;
17460 s3 = s2 = s1 = s0 = IRTemp_INVALID;
17461 breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
17462 res = binop( Iop_V128HLtoV256,
17463 binop( Iop_64HLtoV128,
17464 unop( Iop_32Sto64, mkexpr(s3) ),
17465 unop( Iop_32Sto64, mkexpr(s2) ) ),
17466 binop( Iop_64HLtoV128,
17467 unop( Iop_32Sto64, mkexpr(s1) ),
17468 unop( Iop_32Sto64, mkexpr(s0) ) ) );
17471 putYMMReg ( rG, res );
17473 return delta;
17477 /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
17478 static Long dis_PMOVxXBD_128 ( const VexAbiInfo* vbi, Prefix pfx,
17479 Long delta, Bool isAvx, Bool xIsZ )
17481 IRTemp addr = IRTemp_INVALID;
17482 Int alen = 0;
17483 HChar dis_buf[50];
17484 IRTemp srcVec = newTemp(Ity_V128);
17485 UChar modrm = getUChar(delta);
17486 const HChar* mbV = isAvx ? "v" : "";
17487 const HChar how = xIsZ ? 'z' : 's';
17488 UInt rG = gregOfRexRM(pfx, modrm);
17489 if ( epartIsReg(modrm) ) {
17490 UInt rE = eregOfRexRM(pfx, modrm);
17491 assign( srcVec, getXMMReg(rE) );
17492 delta += 1;
17493 DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17494 } else {
17495 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17496 assign( srcVec,
17497 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
17498 delta += alen;
17499 DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17502 IRTemp zeroVec = newTemp(Ity_V128);
17503 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17505 IRExpr* res
17506 = binop(Iop_InterleaveLO8x16,
17507 mkexpr(zeroVec),
17508 binop(Iop_InterleaveLO8x16,
17509 mkexpr(zeroVec), mkexpr(srcVec)));
17510 if (!xIsZ)
17511 res = binop(Iop_SarN32x4,
17512 binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
17514 (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17516 return delta;
17520 /* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */
17521 static Long dis_PMOVxXBD_256 ( const VexAbiInfo* vbi, Prefix pfx,
17522 Long delta, Bool xIsZ )
17524 IRTemp addr = IRTemp_INVALID;
17525 Int alen = 0;
17526 HChar dis_buf[50];
17527 IRTemp srcVec = newTemp(Ity_V128);
17528 UChar modrm = getUChar(delta);
17529 UChar how = xIsZ ? 'z' : 's';
17530 UInt rG = gregOfRexRM(pfx, modrm);
17531 if ( epartIsReg(modrm) ) {
17532 UInt rE = eregOfRexRM(pfx, modrm);
17533 assign( srcVec, getXMMReg(rE) );
17534 delta += 1;
17535 DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17536 } else {
17537 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17538 assign( srcVec,
17539 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17540 delta += alen;
17541 DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17544 IRTemp zeroVec = newTemp(Ity_V128);
17545 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17547 IRExpr* res
17548 = binop( Iop_V128HLtoV256,
17549 binop(Iop_InterleaveHI8x16,
17550 mkexpr(zeroVec),
17551 binop(Iop_InterleaveLO8x16,
17552 mkexpr(zeroVec), mkexpr(srcVec)) ),
17553 binop(Iop_InterleaveLO8x16,
17554 mkexpr(zeroVec),
17555 binop(Iop_InterleaveLO8x16,
17556 mkexpr(zeroVec), mkexpr(srcVec)) ) );
17557 if (!xIsZ)
17558 res = binop(Iop_SarN32x8,
17559 binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));
17561 putYMMReg ( rG, res );
17563 return delta;
17567 /* Handles 128 bit versions of PMOVSXBQ. */
17568 static Long dis_PMOVSXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17569 Long delta, Bool isAvx )
17571 IRTemp addr = IRTemp_INVALID;
17572 Int alen = 0;
17573 HChar dis_buf[50];
17574 IRTemp srcBytes = newTemp(Ity_I16);
17575 UChar modrm = getUChar(delta);
17576 const HChar* mbV = isAvx ? "v" : "";
17577 UInt rG = gregOfRexRM(pfx, modrm);
17578 if ( epartIsReg(modrm) ) {
17579 UInt rE = eregOfRexRM(pfx, modrm);
17580 assign( srcBytes, getXMMRegLane16( rE, 0 ) );
17581 delta += 1;
17582 DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17583 } else {
17584 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17585 assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
17586 delta += alen;
17587 DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17590 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17591 ( rG, binop( Iop_64HLtoV128,
17592 unop( Iop_8Sto64,
17593 unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
17594 unop( Iop_8Sto64,
17595 unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
17596 return delta;
17600 /* Handles 256 bit versions of PMOVSXBQ. */
17601 static Long dis_PMOVSXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17602 Long delta )
17604 IRTemp addr = IRTemp_INVALID;
17605 Int alen = 0;
17606 HChar dis_buf[50];
17607 IRTemp srcBytes = newTemp(Ity_I32);
17608 UChar modrm = getUChar(delta);
17609 UInt rG = gregOfRexRM(pfx, modrm);
17610 if ( epartIsReg(modrm) ) {
17611 UInt rE = eregOfRexRM(pfx, modrm);
17612 assign( srcBytes, getXMMRegLane32( rE, 0 ) );
17613 delta += 1;
17614 DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17615 } else {
17616 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17617 assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
17618 delta += alen;
17619 DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
17622 putYMMReg
17623 ( rG, binop( Iop_V128HLtoV256,
17624 binop( Iop_64HLtoV128,
17625 unop( Iop_8Sto64,
17626 unop( Iop_16HIto8,
17627 unop( Iop_32HIto16,
17628 mkexpr(srcBytes) ) ) ),
17629 unop( Iop_8Sto64,
17630 unop( Iop_16to8,
17631 unop( Iop_32HIto16,
17632 mkexpr(srcBytes) ) ) ) ),
17633 binop( Iop_64HLtoV128,
17634 unop( Iop_8Sto64,
17635 unop( Iop_16HIto8,
17636 unop( Iop_32to16,
17637 mkexpr(srcBytes) ) ) ),
17638 unop( Iop_8Sto64,
17639 unop( Iop_16to8,
17640 unop( Iop_32to16,
17641 mkexpr(srcBytes) ) ) ) ) ) );
17642 return delta;
17646 /* Handles 128 bit versions of PMOVZXBQ. */
17647 static Long dis_PMOVZXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17648 Long delta, Bool isAvx )
17650 IRTemp addr = IRTemp_INVALID;
17651 Int alen = 0;
17652 HChar dis_buf[50];
17653 IRTemp srcVec = newTemp(Ity_V128);
17654 UChar modrm = getUChar(delta);
17655 const HChar* mbV = isAvx ? "v" : "";
17656 UInt rG = gregOfRexRM(pfx, modrm);
17657 if ( epartIsReg(modrm) ) {
17658 UInt rE = eregOfRexRM(pfx, modrm);
17659 assign( srcVec, getXMMReg(rE) );
17660 delta += 1;
17661 DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17662 } else {
17663 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17664 assign( srcVec,
17665 unop( Iop_32UtoV128,
17666 unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
17667 delta += alen;
17668 DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17671 IRTemp zeroVec = newTemp(Ity_V128);
17672 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
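   /* Three rounds of interleaving with zero widen each of the two source
      bytes step by step, 8 -> 16 -> 32 -> 64 bits, giving the zero-extended
      quadwords. */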
17674 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17675 ( rG, binop( Iop_InterleaveLO8x16,
17676 mkexpr(zeroVec),
17677 binop( Iop_InterleaveLO8x16,
17678 mkexpr(zeroVec),
17679 binop( Iop_InterleaveLO8x16,
17680 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
17681 return delta;
17685 /* Handles 256 bit versions of PMOVZXBQ. */
17686 static Long dis_PMOVZXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17687 Long delta )
17689 IRTemp addr = IRTemp_INVALID;
17690 Int alen = 0;
17691 HChar dis_buf[50];
17692 IRTemp srcVec = newTemp(Ity_V128);
17693 UChar modrm = getUChar(delta);
17694 UInt rG = gregOfRexRM(pfx, modrm);
17695 if ( epartIsReg(modrm) ) {
17696 UInt rE = eregOfRexRM(pfx, modrm);
17697 assign( srcVec, getXMMReg(rE) );
17698 delta += 1;
17699 DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17700 } else {
17701 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17702 assign( srcVec,
17703 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
17704 delta += alen;
17705 DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
17708 IRTemp zeroVec = newTemp(Ity_V128);
17709 assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17711 putYMMReg
17712 ( rG, binop( Iop_V128HLtoV256,
17713 binop( Iop_InterleaveHI8x16,
17714 mkexpr(zeroVec),
17715 binop( Iop_InterleaveLO8x16,
17716 mkexpr(zeroVec),
17717 binop( Iop_InterleaveLO8x16,
17718 mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
17719 binop( Iop_InterleaveLO8x16,
17720 mkexpr(zeroVec),
17721 binop( Iop_InterleaveLO8x16,
17722 mkexpr(zeroVec),
17723 binop( Iop_InterleaveLO8x16,
17724 mkexpr(zeroVec), mkexpr(srcVec) ) ) )
17725 ) );
17726 return delta;
17730 static Long dis_PHMINPOSUW_128 ( const VexAbiInfo* vbi, Prefix pfx,
17731 Long delta, Bool isAvx )
17733 IRTemp addr = IRTemp_INVALID;
17734 Int alen = 0;
17735 HChar dis_buf[50];
17736 UChar modrm = getUChar(delta);
17737 const HChar* mbV = isAvx ? "v" : "";
17738 IRTemp sV = newTemp(Ity_V128);
17739 IRTemp sHi = newTemp(Ity_I64);
17740 IRTemp sLo = newTemp(Ity_I64);
17741 IRTemp dLo = newTemp(Ity_I64);
17742 UInt rG = gregOfRexRM(pfx,modrm);
17743 if (epartIsReg(modrm)) {
17744 UInt rE = eregOfRexRM(pfx,modrm);
17745 assign( sV, getXMMReg(rE) );
17746 delta += 1;
17747 DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
17748 } else {
17749 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
17750 if (!isAvx)
17751 gen_SIGNAL_if_not_16_aligned(vbi, addr);
17752 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
17753 delta += alen;
17754 DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
17756 assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
17757 assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
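   /* The clean helper computes the architected low 64 bits of the result:
      the minimum 16-bit value in lane 0 and its index in lane 1 (an
      assumption about the helper's packing, matching what PHMINPOSUW
      requires); Iop_64UtoV128 below then zeroes the upper lanes. */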
17758 assign( dLo, mkIRExprCCall(
17759 Ity_I64, 0/*regparms*/,
17760 "amd64g_calculate_sse_phminposuw",
17761 &amd64g_calculate_sse_phminposuw,
17762 mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
17764 (isAvx ? putYMMRegLoAndZU : putXMMReg)
17765 (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
17766 return delta;
17770 static Long dis_AESx ( const VexAbiInfo* vbi, Prefix pfx,
17771 Long delta, Bool isAvx, UChar opc )
17773 IRTemp addr = IRTemp_INVALID;
17774 Int alen = 0;
17775 HChar dis_buf[50];
17776 UChar modrm = getUChar(delta);
17777 UInt rG = gregOfRexRM(pfx, modrm);
17778 UInt regNoL = 0;
17779 UInt regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
17781 /* This is a nasty kludge. We need to pass 2 x V128 to the
17782 helper. Since we can't do that, use a dirty
17783 helper to compute the results directly from the XMM regs in
17784 the guest state. That means for the memory case, we need to
17785 move the left operand into a pseudo-register (XMM16, let's
17786 call it). */
17787 if (epartIsReg(modrm)) {
17788 regNoL = eregOfRexRM(pfx, modrm);
17789 delta += 1;
17790 } else {
17791 regNoL = 16; /* use XMM16 as an intermediary */
17792 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17793 /* alignment check needed ???? */
17794 stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
17795 delta += alen;
17798 void* fn = &amd64g_dirtyhelper_AES;
17799 const HChar* nm = "amd64g_dirtyhelper_AES";
17801 /* Round up the arguments. Note that this is a kludge -- the
17802 use of mkU64 rather than mkIRExpr_HWord implies the
17803 assumption that the host's word size is 64-bit. */
17804 UInt gstOffD = ymmGuestRegOffset(rG);
17805 UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
17806 UInt gstOffR = ymmGuestRegOffset(regNoR);
17807 IRExpr* opc4 = mkU64(opc);
17808 IRExpr* gstOffDe = mkU64(gstOffD);
17809 IRExpr* gstOffLe = mkU64(gstOffL);
17810 IRExpr* gstOffRe = mkU64(gstOffR);
17811 IRExpr** args
17812 = mkIRExprVec_5( IRExpr_GSPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
17814 IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
17815 /* It's not really a dirty call, but we can't use the clean helper
17816 mechanism here for the very lame reason that we can't pass 2 x
17817 V128s by value to a helper. Hence this roundabout scheme. */
17818 d->nFxState = 2;
17819 vex_bzero(&d->fxState, sizeof(d->fxState));
17820    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and write
17821       the second for !isAvx or the third for isAvx.
17822       AESIMC (0xDB) reads the first register, and writes the second. */
17823 d->fxState[0].fx = Ifx_Read;
17824 d->fxState[0].offset = gstOffL;
17825 d->fxState[0].size = sizeof(U128);
17826 d->fxState[1].offset = gstOffR;
17827 d->fxState[1].size = sizeof(U128);
17828 if (opc == 0xDB)
17829 d->fxState[1].fx = Ifx_Write;
17830 else if (!isAvx || rG == regNoR)
17831 d->fxState[1].fx = Ifx_Modify;
17832 else {
17833 d->fxState[1].fx = Ifx_Read;
17834 d->nFxState++;
17835 d->fxState[2].fx = Ifx_Write;
17836 d->fxState[2].offset = gstOffD;
17837 d->fxState[2].size = sizeof(U128);
17840 stmt( IRStmt_Dirty(d) );
17842 const HChar* opsuf;
17843 switch (opc) {
17844 case 0xDC: opsuf = "enc"; break;
17845      case 0xDD: opsuf = "enclast"; break;
17846 case 0xDE: opsuf = "dec"; break;
17847 case 0xDF: opsuf = "declast"; break;
17848 case 0xDB: opsuf = "imc"; break;
17849 default: vassert(0);
17851 DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
17852 (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
17853 nameXMMReg(regNoR),
17854 (isAvx && opc != 0xDB) ? "," : "",
17855 (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
17857 if (isAvx)
17858 putYMMRegLane128( rG, 1, mkV128(0) );
17859 return delta;
17862 static Long dis_AESKEYGENASSIST ( const VexAbiInfo* vbi, Prefix pfx,
17863 Long delta, Bool isAvx )
17865 IRTemp addr = IRTemp_INVALID;
17866 Int alen = 0;
17867 HChar dis_buf[50];
17868 UChar modrm = getUChar(delta);
17869 UInt regNoL = 0;
17870 UInt regNoR = gregOfRexRM(pfx, modrm);
17871 UChar imm = 0;
17873 /* This is a nasty kludge. See AESENC et al. instructions. */
17874 modrm = getUChar(delta);
17875 if (epartIsReg(modrm)) {
17876 regNoL = eregOfRexRM(pfx, modrm);
17877 imm = getUChar(delta+1);
17878 delta += 1+1;
17879 } else {
17880 regNoL = 16; /* use XMM16 as an intermediary */
17881 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17882 /* alignment check ???? . */
17883 stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
17884 imm = getUChar(delta+alen);
17885 delta += alen+1;
17888 /* Who ya gonna call? Presumably not Ghostbusters. */
17889 void* fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
17890 const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
17892 /* Round up the arguments. Note that this is a kludge -- the
17893 use of mkU64 rather than mkIRExpr_HWord implies the
17894 assumption that the host's word size is 64-bit. */
17895 UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
17896 UInt gstOffR = ymmGuestRegOffset(regNoR);
17898 IRExpr* imme = mkU64(imm & 0xFF);
17899 IRExpr* gstOffLe = mkU64(gstOffL);
17900 IRExpr* gstOffRe = mkU64(gstOffR);
17901 IRExpr** args
17902 = mkIRExprVec_4( IRExpr_GSPTR(), imme, gstOffLe, gstOffRe );
17904 IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
17905 /* It's not really a dirty call, but we can't use the clean helper
17906 mechanism here for the very lame reason that we can't pass 2 x
17907 V128s by value to a helper. Hence this roundabout scheme. */
17908 d->nFxState = 2;
17909 vex_bzero(&d->fxState, sizeof(d->fxState));
17910 d->fxState[0].fx = Ifx_Read;
17911 d->fxState[0].offset = gstOffL;
17912 d->fxState[0].size = sizeof(U128);
17913 d->fxState[1].fx = Ifx_Write;
17914 d->fxState[1].offset = gstOffR;
17915 d->fxState[1].size = sizeof(U128);
17916 stmt( IRStmt_Dirty(d) );
17918 DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
17919 (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
17920 nameXMMReg(regNoR));
17921 if (isAvx)
17922 putYMMRegLane128( regNoR, 1, mkV128(0) );
17923 return delta;
17927 __attribute__((noinline))
17928 static
17929 Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
17930 const VexAbiInfo* vbi,
17931 Prefix pfx, Int sz, Long deltaIN )
17933 IRTemp addr = IRTemp_INVALID;
17934 UChar modrm = 0;
17935 Int alen = 0;
17936 HChar dis_buf[50];
17938 *decode_OK = False;
17940 Long delta = deltaIN;
17941 UChar opc = getUChar(delta);
17942 delta++;
17943 switch (opc) {
17945 case 0x10:
17946 case 0x14:
17947 case 0x15:
17948 /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128 (byte gran)
17949 66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128 (float gran)
17950 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128 (double gran)
17951 Blend at various granularities, with XMM0 (implicit operand)
17952 providing the controlling mask.
17954 if (have66noF2noF3(pfx) && sz == 2) {
17955 modrm = getUChar(delta);
17957 const HChar* nm = NULL;
17958 UInt gran = 0;
17959 IROp opSAR = Iop_INVALID;
17960 switch (opc) {
17961 case 0x10:
17962 nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
17963 break;
17964 case 0x14:
17965 nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
17966 break;
17967 case 0x15:
17968 nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
17969 break;
17971 vassert(nm);
17973 IRTemp vecE = newTemp(Ity_V128);
17974 IRTemp vecG = newTemp(Ity_V128);
17975 IRTemp vec0 = newTemp(Ity_V128);
17977 if ( epartIsReg(modrm) ) {
17978 assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
17979 delta += 1;
17980 DIP( "%s %s,%s\n", nm,
17981 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17982 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17983 } else {
17984 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17985 gen_SIGNAL_if_not_16_aligned( vbi, addr );
17986 assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
17987 delta += alen;
17988 DIP( "%s %s,%s\n", nm,
17989 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17992 assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
17993 assign(vec0, getXMMReg(0));
17995 IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
17996 putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
17998 goto decode_success;
18000 break;
18002 case 0x17:
18003 /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
18004 Logical compare (set ZF and CF from AND/ANDN of the operands) */
18005 if (have66noF2noF3(pfx)
18006 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
18007 delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
18008 goto decode_success;
18010 break;
18012 case 0x20:
18013 /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
18014 Packed Move with Sign Extend from Byte to Word (XMM) */
18015 if (have66noF2noF3(pfx) && sz == 2) {
18016 delta = dis_PMOVxXBW_128( vbi, pfx, delta,
18017 False/*!isAvx*/, False/*!xIsZ*/ );
18018 goto decode_success;
18020 break;
18022 case 0x21:
18023 /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
18024 Packed Move with Sign Extend from Byte to DWord (XMM) */
18025 if (have66noF2noF3(pfx) && sz == 2) {
18026 delta = dis_PMOVxXBD_128( vbi, pfx, delta,
18027 False/*!isAvx*/, False/*!xIsZ*/ );
18028 goto decode_success;
18030 break;
18032 case 0x22:
18033 /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
18034 Packed Move with Sign Extend from Byte to QWord (XMM) */
18035 if (have66noF2noF3(pfx) && sz == 2) {
18036 delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
18037 goto decode_success;
18039 break;
18041 case 0x23:
18042 /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
18043 Packed Move with Sign Extend from Word to DWord (XMM) */
18044 if (have66noF2noF3(pfx) && sz == 2) {
18045 delta = dis_PMOVxXWD_128(vbi, pfx, delta,
18046 False/*!isAvx*/, False/*!xIsZ*/);
18047 goto decode_success;
18049 break;
18051 case 0x24:
18052 /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
18053 Packed Move with Sign Extend from Word to QWord (XMM) */
18054 if (have66noF2noF3(pfx) && sz == 2) {
18055 delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
18056 goto decode_success;
18058 break;
18060 case 0x25:
18061 /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
18062 Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
18063 if (have66noF2noF3(pfx) && sz == 2) {
18064 delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
18065 False/*!isAvx*/, False/*!xIsZ*/ );
18066 goto decode_success;
18068 break;
18070 case 0x28:
18071      /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-bit
18072         lanes: lane 0 x lane 0 forms the lower 64-bit half and lane 2
18073         x lane 2 the upper 64-bit half */
18074 /* This is a really poor translation -- could be improved if
18075 performance critical. It's a copy-paste of PMULUDQ, too. */
18076 if (have66noF2noF3(pfx) && sz == 2) {
18077 IRTemp sV = newTemp(Ity_V128);
18078 IRTemp dV = newTemp(Ity_V128);
18079 modrm = getUChar(delta);
18080 UInt rG = gregOfRexRM(pfx,modrm);
18081 assign( dV, getXMMReg(rG) );
18082 if (epartIsReg(modrm)) {
18083 UInt rE = eregOfRexRM(pfx,modrm);
18084 assign( sV, getXMMReg(rE) );
18085 delta += 1;
18086 DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
18087 } else {
18088 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18089 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
18090 delta += alen;
18091 DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
18094 putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
18095 goto decode_success;
18097 break;
18099 case 0x29:
18100 /* 66 0F 38 29 = PCMPEQQ
18101 64x2 equality comparison */
18102 if (have66noF2noF3(pfx) && sz == 2) {
18103 /* FIXME: this needs an alignment check */
18104 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
18105 "pcmpeqq", Iop_CmpEQ64x2, False );
18106 goto decode_success;
18108 break;
18110 case 0x2A:
18111 /* 66 0F 38 2A = MOVNTDQA
18112 "non-temporal" "streaming" load
18113 Handle like MOVDQA but only memory operand is allowed */
18114 if (have66noF2noF3(pfx) && sz == 2) {
18115 modrm = getUChar(delta);
18116 if (!epartIsReg(modrm)) {
18117 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18118 gen_SIGNAL_if_not_16_aligned( vbi, addr );
18119 putXMMReg( gregOfRexRM(pfx,modrm),
18120 loadLE(Ity_V128, mkexpr(addr)) );
18121 DIP("movntdqa %s,%s\n", dis_buf,
18122 nameXMMReg(gregOfRexRM(pfx,modrm)));
18123 delta += alen;
18124 goto decode_success;
18127 break;
18129 case 0x2B:
18130 /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
18131 2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
18132 if (have66noF2noF3(pfx) && sz == 2) {
18134 modrm = getUChar(delta);
18136 IRTemp argL = newTemp(Ity_V128);
18137 IRTemp argR = newTemp(Ity_V128);
18139 if ( epartIsReg(modrm) ) {
18140 assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
18141 delta += 1;
18142 DIP( "packusdw %s,%s\n",
18143 nameXMMReg( eregOfRexRM(pfx, modrm) ),
18144 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
18145 } else {
18146 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
18147 gen_SIGNAL_if_not_16_aligned( vbi, addr );
18148 assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
18149 delta += alen;
18150 DIP( "packusdw %s,%s\n",
18151 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
18154 assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
18156 putXMMReg( gregOfRexRM(pfx, modrm),
18157 binop( Iop_QNarrowBin32Sto16Ux8,
18158 mkexpr(argL), mkexpr(argR)) );
18160 goto decode_success;
18162 break;
18164 case 0x30:
18165 /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
18166 Packed Move with Zero Extend from Byte to Word (XMM) */
18167 if (have66noF2noF3(pfx) && sz == 2) {
18168 delta = dis_PMOVxXBW_128( vbi, pfx, delta,
18169 False/*!isAvx*/, True/*xIsZ*/ );
18170 goto decode_success;
18172 break;
18174 case 0x31:
18175 /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
18176 Packed Move with Zero Extend from Byte to DWord (XMM) */
18177 if (have66noF2noF3(pfx) && sz == 2) {
18178 delta = dis_PMOVxXBD_128( vbi, pfx, delta,
18179 False/*!isAvx*/, True/*xIsZ*/ );
18180 goto decode_success;
18182 break;
18184 case 0x32:
18185 /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
18186 Packed Move with Zero Extend from Byte to QWord (XMM) */
18187 if (have66noF2noF3(pfx) && sz == 2) {
18188 delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
18189 goto decode_success;
18191 break;
18193 case 0x33:
18194 /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
18195 Packed Move with Zero Extend from Word to DWord (XMM) */
18196 if (have66noF2noF3(pfx) && sz == 2) {
18197 delta = dis_PMOVxXWD_128( vbi, pfx, delta,
18198 False/*!isAvx*/, True/*xIsZ*/ );
18199 goto decode_success;
18201 break;
18203 case 0x34:
18204 /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
18205 Packed Move with Zero Extend from Word to QWord (XMM) */
18206 if (have66noF2noF3(pfx) && sz == 2) {
18207 delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
18208 goto decode_success;
18210 break;
18212 case 0x35:
18213 /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
18214 Packed Move with Zero Extend from DWord to QWord (XMM) */
18215 if (have66noF2noF3(pfx) && sz == 2) {
18216 delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
18217 False/*!isAvx*/, True/*xIsZ*/ );
18218 goto decode_success;
18220 break;
18222 case 0x37:
18223 /* 66 0F 38 37 = PCMPGTQ
18224 64x2 comparison (signed, presumably; the Intel docs don't say :-)
18226 if (have66noF2noF3(pfx) && sz == 2) {
18227 /* FIXME: this needs an alignment check */
18228 delta = dis_SSEint_E_to_G( vbi, pfx, delta,
18229 "pcmpgtq", Iop_CmpGT64Sx2, False );
18230 goto decode_success;
18232 break;
18234 case 0x38:
18235 case 0x3C:
18236 /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128 8Sx16 (signed) min
18237 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128 8Sx16 (signed) max
18239 if (have66noF2noF3(pfx) && sz == 2) {
18240 /* FIXME: this needs an alignment check */
18241 Bool isMAX = opc == 0x3C;
18242 delta = dis_SSEint_E_to_G(
18243 vbi, pfx, delta,
18244 isMAX ? "pmaxsb" : "pminsb",
18245 isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
18246 False
18248 goto decode_success;
18250 break;
18252 case 0x39:
18253 case 0x3D:
18254 /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
18255 Minimum of Packed Signed Double Word Integers (XMM)
18256 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
18257 Maximum of Packed Signed Double Word Integers (XMM)
18259 if (have66noF2noF3(pfx) && sz == 2) {
18260 /* FIXME: this needs an alignment check */
18261 Bool isMAX = opc == 0x3D;
18262 delta = dis_SSEint_E_to_G(
18263 vbi, pfx, delta,
18264 isMAX ? "pmaxsd" : "pminsd",
18265 isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
18266 False
18268 goto decode_success;
18270 break;
18272 case 0x3A:
18273 case 0x3E:
18274 /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
18275 Minimum of Packed Unsigned Word Integers (XMM)
18276 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
18277 Maximum of Packed Unsigned Word Integers (XMM)
18279 if (have66noF2noF3(pfx) && sz == 2) {
18280 /* FIXME: this needs an alignment check */
18281 Bool isMAX = opc == 0x3E;
18282 delta = dis_SSEint_E_to_G(
18283 vbi, pfx, delta,
18284 isMAX ? "pmaxuw" : "pminuw",
18285 isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
18286 False
18288 goto decode_success;
18290 break;
18292 case 0x3B:
18293 case 0x3F:
18294 /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
18295 Minimum of Packed Unsigned Doubleword Integers (XMM)
18296 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
18297 Maximum of Packed Unsigned Doubleword Integers (XMM)
18299 if (have66noF2noF3(pfx) && sz == 2) {
18300 /* FIXME: this needs an alignment check */
18301 Bool isMAX = opc == 0x3F;
18302 delta = dis_SSEint_E_to_G(
18303 vbi, pfx, delta,
18304 isMAX ? "pmaxud" : "pminud",
18305 isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
18306 False
18308 goto decode_success;
18310 break;
18312 case 0x40:
18313 /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
18314 32x4 integer multiply from xmm2/m128 to xmm1 */
18315 if (have66noF2noF3(pfx) && sz == 2) {
18317 modrm = getUChar(delta);
18319 IRTemp argL = newTemp(Ity_V128);
18320 IRTemp argR = newTemp(Ity_V128);
18322 if ( epartIsReg(modrm) ) {
18323 assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
18324 delta += 1;
18325 DIP( "pmulld %s,%s\n",
18326 nameXMMReg( eregOfRexRM(pfx, modrm) ),
18327 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
18328 } else {
18329 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
18330 gen_SIGNAL_if_not_16_aligned( vbi, addr );
18331 assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
18332 delta += alen;
18333 DIP( "pmulld %s,%s\n",
18334 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
18337 assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
18339 putXMMReg( gregOfRexRM(pfx, modrm),
18340 binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
18342 goto decode_success;
18344 break;
18346 case 0x41:
18347 /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
18348 Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
18349 if (have66noF2noF3(pfx) && sz == 2) {
18350 delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
18351 goto decode_success;
18353 break;
18355 case 0xDC:
18356 case 0xDD:
18357 case 0xDE:
18358 case 0xDF:
18359 case 0xDB:
18360 /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
18361 DD /r = AESENCLAST xmm1, xmm2/m128
18362 DE /r = AESDEC xmm1, xmm2/m128
18363 DF /r = AESDECLAST xmm1, xmm2/m128
18365 DB /r = AESIMC xmm1, xmm2/m128 */
18366 if (have66noF2noF3(pfx) && sz == 2) {
18367 delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
18368 goto decode_success;
18370 break;
18372 case 0xF0:
18373 case 0xF1:
18374 /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
18375 F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
18376         The decoding on this is a bit unusual: F0 always takes an 8-bit
              source regardless of prefixes, whereas F1 takes a 16-, 32- or
              64-bit source depending on the 66 prefix and REX.W.
18378 if (haveF2noF3(pfx)
18379 && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
18380 modrm = getUChar(delta);
18382 if (opc == 0xF0)
18383 sz = 1;
18384 else
18385 vassert(sz == 2 || sz == 4 || sz == 8);
18387 IRType tyE = szToITy(sz);
18388 IRTemp valE = newTemp(tyE);
18390 if (epartIsReg(modrm)) {
18391 assign(valE, getIRegE(sz, pfx, modrm));
18392 delta += 1;
18393 DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
18394 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
18395 } else {
18396 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
18397 assign(valE, loadLE(tyE, mkexpr(addr)));
18398 delta += alen;
18399 DIP("crc32b %s,%s\n", dis_buf,
18400 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
18403 /* Somewhat funny getting/putting of the crc32 value, in order
18404 to ensure that it turns into 64-bit gets and puts. However,
18405 mask off the upper 32 bits so as to not get memcheck false
18406 +ves around the helper call. */
18407 IRTemp valG0 = newTemp(Ity_I64);
18408 assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
18409 mkU64(0xFFFFFFFF)));
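      /* Note: the SSE4.2 CRC32 instruction accumulates using the CRC-32C
         (Castagnoli) polynomial; the previous CRC value is passed to the
         helper in valG0 and the new data, widened to 64 bits, in valE. */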
18411 const HChar* nm = NULL;
18412 void* fn = NULL;
18413 switch (sz) {
18414 case 1: nm = "amd64g_calc_crc32b";
18415 fn = &amd64g_calc_crc32b; break;
18416 case 2: nm = "amd64g_calc_crc32w";
18417 fn = &amd64g_calc_crc32w; break;
18418 case 4: nm = "amd64g_calc_crc32l";
18419 fn = &amd64g_calc_crc32l; break;
18420 case 8: nm = "amd64g_calc_crc32q";
18421 fn = &amd64g_calc_crc32q; break;
18423 vassert(nm && fn);
18424 IRTemp valG1 = newTemp(Ity_I64);
18425 assign(valG1,
18426 mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
18427 mkIRExprVec_2(mkexpr(valG0),
18428 widenUto64(mkexpr(valE)))));
18430 putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
18431 goto decode_success;
18433 break;
18435 default:
18436 break;
18440 //decode_failure:
18441 *decode_OK = False;
18442 return deltaIN;
18444 decode_success:
18445 *decode_OK = True;
18446 return delta;
18450 /*------------------------------------------------------------*/
18451 /*--- ---*/
18452 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4 ---*/
18453 /*--- ---*/
18454 /*------------------------------------------------------------*/
18456 static Long dis_PEXTRW ( const VexAbiInfo* vbi, Prefix pfx,
18457 Long delta, Bool isAvx )
18459 IRTemp addr = IRTemp_INVALID;
18460 IRTemp t0 = IRTemp_INVALID;
18461 IRTemp t1 = IRTemp_INVALID;
18462 IRTemp t2 = IRTemp_INVALID;
18463 IRTemp t3 = IRTemp_INVALID;
18464 UChar modrm = getUChar(delta);
18465 Int alen = 0;
18466 HChar dis_buf[50];
18467 UInt rG = gregOfRexRM(pfx,modrm);
18468 Int imm8_20;
18469 IRTemp xmm_vec = newTemp(Ity_V128);
18470 IRTemp d16 = newTemp(Ity_I16);
18471 const HChar* mbV = isAvx ? "v" : "";
18473 vassert(0==getRexW(pfx)); /* ensured by caller */
18474 assign( xmm_vec, getXMMReg(rG) );
18475 breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
18477 if ( epartIsReg( modrm ) ) {
18478 imm8_20 = (Int)(getUChar(delta+1) & 7);
18479 } else {
18480 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
18481 imm8_20 = (Int)(getUChar(delta+alen) & 7);
18484 switch (imm8_20) {
18485 case 0: assign(d16, unop(Iop_32to16, mkexpr(t0))); break;
18486 case 1: assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
18487 case 2: assign(d16, unop(Iop_32to16, mkexpr(t1))); break;
18488 case 3: assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
18489 case 4: assign(d16, unop(Iop_32to16, mkexpr(t2))); break;
18490 case 5: assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
18491 case 6: assign(d16, unop(Iop_32to16, mkexpr(t3))); break;
18492 case 7: assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
18493 default: vassert(0);
18496 if ( epartIsReg( modrm ) ) {
18497 UInt rE = eregOfRexRM(pfx,modrm);
18498 putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
18499 delta += 1+1;
18500 DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
18501 nameXMMReg( rG ), nameIReg32( rE ) );
18502 } else {
18503 storeLE( mkexpr(addr), mkexpr(d16) );
18504 delta += alen+1;
18505 DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
18507 return delta;
18511 static Long dis_PEXTRD ( const VexAbiInfo* vbi, Prefix pfx,
18512 Long delta, Bool isAvx )
18514 IRTemp addr = IRTemp_INVALID;
18515 IRTemp t0 = IRTemp_INVALID;
18516 IRTemp t1 = IRTemp_INVALID;
18517 IRTemp t2 = IRTemp_INVALID;
18518 IRTemp t3 = IRTemp_INVALID;
18519 UChar modrm = 0;
18520 Int alen = 0;
18521 HChar dis_buf[50];
18523 Int imm8_10;
18524 IRTemp xmm_vec = newTemp(Ity_V128);
18525 IRTemp src_dword = newTemp(Ity_I32);
18526 const HChar* mbV = isAvx ? "v" : "";
18528 vassert(0==getRexW(pfx)); /* ensured by caller */
18529 modrm = getUChar(delta);
18530 assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
18531 breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
18533 if ( epartIsReg( modrm ) ) {
18534 imm8_10 = (Int)(getUChar(delta+1) & 3);
18535 } else {
18536 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
18537 imm8_10 = (Int)(getUChar(delta+alen) & 3);
18540 switch ( imm8_10 ) {
18541 case 0: assign( src_dword, mkexpr(t0) ); break;
18542 case 1: assign( src_dword, mkexpr(t1) ); break;
18543 case 2: assign( src_dword, mkexpr(t2) ); break;
18544 case 3: assign( src_dword, mkexpr(t3) ); break;
18545 default: vassert(0);
18548 if ( epartIsReg( modrm ) ) {
18549 putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
18550 delta += 1+1;
18551 DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
18552 nameXMMReg( gregOfRexRM(pfx, modrm) ),
18553 nameIReg32( eregOfRexRM(pfx, modrm) ) );
18554 } else {
18555 storeLE( mkexpr(addr), mkexpr(src_dword) );
18556 delta += alen+1;
18557 DIP( "%spextrd $%d, %s,%s\n", mbV,
18558 imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
18560 return delta;
18564 static Long dis_PEXTRQ ( const VexAbiInfo* vbi, Prefix pfx,
18565 Long delta, Bool isAvx )
18567 IRTemp addr = IRTemp_INVALID;
18568 UChar modrm = 0;
18569 Int alen = 0;
18570 HChar dis_buf[50];
18572 Int imm8_0;
18573 IRTemp xmm_vec = newTemp(Ity_V128);
18574 IRTemp src_qword = newTemp(Ity_I64);
18575 const HChar* mbV = isAvx ? "v" : "";
18577 vassert(1==getRexW(pfx)); /* ensured by caller */
18578 modrm = getUChar(delta);
18579 assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
18581 if ( epartIsReg( modrm ) ) {
18582 imm8_0 = (Int)(getUChar(delta+1) & 1);
18583 } else {
18584 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
18585 imm8_0 = (Int)(getUChar(delta+alen) & 1);
18588 switch ( imm8_0 ) {
18589 case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) );
18590 break;
18591 case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
18592 break;
18593 default: vassert(0);
18596 if ( epartIsReg( modrm ) ) {
18597 putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
18598 delta += 1+1;
18599 DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
18600 nameXMMReg( gregOfRexRM(pfx, modrm) ),
18601 nameIReg64( eregOfRexRM(pfx, modrm) ) );
18602 } else {
18603 storeLE( mkexpr(addr), mkexpr(src_qword) );
18604 delta += alen+1;
18605 DIP( "%spextrq $%d, %s,%s\n", mbV,
18606 imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
18608 return delta;
18611 static IRExpr* math_CTZ32(IRExpr *exp)
18613 /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
18614 return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
18617 static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
18618 Long delta, UChar opc, UChar imm,
18619 HChar dis_buf[])
18621 /* We only handle PCMPISTRI for now */
18622 vassert((opc & 0x03) == 0x03);
18623 /* And only an immediate byte of 0x38 or 0x3A */
18624 vassert((imm & ~0x02) == 0x38);
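/* For reference -- a sketch of the PCMPxSTRx imm8 field layout as given
   in the Intel SDM, not stated elsewhere in this file: bits 1:0 select
   the source data format (0x38/0x3A both mean byte elements), bits 3:2
   the aggregation operation (10b = "equal each"), bits 5:4 the polarity
   (11b = negate under the valid mask), and bit 6 whether the least or
   most significant index is returned (0 = least).  Hence 0x38 and 0x3A
   differ only in signed vs unsigned bytes, which is irrelevant for an
   equality comparison. */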
18626 /* FIXME: Is this correct when RegNoL == 16 ? */
18627 IRTemp argL = newTemp(Ity_V128);
18628 assign(argL, getXMMReg(regNoL));
18629 IRTemp argR = newTemp(Ity_V128);
18630 assign(argR, getXMMReg(regNoR));
18632 IRTemp zmaskL = newTemp(Ity_I32);
18633 assign(zmaskL, unop(Iop_16Uto32,
18634 unop(Iop_GetMSBs8x16,
18635 binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
18636 IRTemp zmaskR = newTemp(Ity_I32);
18637 assign(zmaskR, unop(Iop_16Uto32,
18638 unop(Iop_GetMSBs8x16,
18639 binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));
18641 /* We want validL = ~(zmaskL | -zmaskL)
18643 But this formulation kills memcheck's validity tracking when any
18644 bits above the first "1" are invalid. So reformulate as:
18646 validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1  */
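/* Illustrative example (not from the original source): if argL holds
   "hi\0" followed by non-zero garbage, then zmaskL = 0b100, ctz = 2, and
   validL = (1 << 2) - 1 = 0b011, i.e. exactly the two bytes before the
   terminating zero are marked valid.  If argL contains no zero byte at
   all, zmaskL = 0 and validL = 0 - 1 = all ones. */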
18649 IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));
18651 /* Generate a bool expression which is zero iff the original is
18652 zero. Do this carefully so memcheck can propagate validity bits
18653 correctly.
18655 IRTemp zmaskL_zero = newTemp(Ity_I1);
18656 assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));
18658 IRTemp validL = newTemp(Ity_I32);
18659 assign(validL, binop(Iop_Sub32,
18660 IRExpr_ITE(mkexpr(zmaskL_zero),
18661 binop(Iop_Shl32, mkU32(1), ctzL),
18662 mkU32(0)),
18663 mkU32(1)));
18665 /* And similarly for validR. */
18666 IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
18667 IRTemp zmaskR_zero = newTemp(Ity_I1);
18668 assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
18669 IRTemp validR = newTemp(Ity_I32);
18670 assign(validR, binop(Iop_Sub32,
18671 IRExpr_ITE(mkexpr(zmaskR_zero),
18672 binop(Iop_Shl32, mkU32(1), ctzR),
18673 mkU32(0)),
18674 mkU32(1)));
18676 /* Do the actual comparison. */
18677 IRExpr *boolResII = unop(Iop_16Uto32,
18678 unop(Iop_GetMSBs8x16,
18679 binop(Iop_CmpEQ8x16, mkexpr(argL),
18680 mkexpr(argR))));
18682 /* Compute boolresII & validL & validR (i.e., if both valid, use
18683 comparison result) */
18684 IRExpr *intRes1_a = binop(Iop_And32, boolResII,
18685 binop(Iop_And32,
18686 mkexpr(validL), mkexpr(validR)));
18688 /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
18689 IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
18690 mkexpr(validL), mkexpr(validR)));
18691 /* Otherwise, zero. */
18692 IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
18693 binop(Iop_Or32, intRes1_a, intRes1_b));
18695 /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
18696 result. */
18697 IRTemp intRes2 = newTemp(Ity_I32);
18698 assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
18699 binop(Iop_Xor32, intRes1, mkexpr(validL))));
18701 /* If the 0x40 bit were set in imm=0x3A, we would return the index
18702 of the msb. Since it is clear, we return the index of the
18703 lsb. */
18704 IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
18705 mkexpr(intRes2), mkU32(0x10000)));
18707 /* And that's our RCX. */
18708 putIReg32(R_RCX, newECX);
18710 /* Now for the condition codes... */
18712 /* C == 0 iff intRes2 == 0 */
18713 IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
18714 mkU32(0)),
18715 mkU32(1 << AMD64G_CC_SHIFT_C),
18716 mkU32(0));
18717 /* Z == 1 iff any in argL is 0 */
18718 IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
18719 mkU32(1 << AMD64G_CC_SHIFT_Z),
18720 mkU32(0));
18721 /* S == 1 iff any in argR is 0 */
18722 IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
18723 mkU32(1 << AMD64G_CC_SHIFT_S),
18724 mkU32(0));
18725 /* O == IntRes2[0] */
18726 IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
18727 mkU32(0x01)),
18728 mkU8(AMD64G_CC_SHIFT_O));
18730 /* Put them all together */
18731 IRTemp cc = newTemp(Ity_I64);
18732 assign(cc, widenUto64(binop(Iop_Or32,
18733 binop(Iop_Or32, c_bit, z_bit),
18734 binop(Iop_Or32, s_bit, o_bit))));
18735 stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
18736 stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
18737 stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
18738 stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));
18740 return delta;
18743 /* This can fail, in which case it returns the original (unchanged)
18744 delta. */
18745 static Long dis_PCMPxSTRx ( const VexAbiInfo* vbi, Prefix pfx,
18746 Long delta, Bool isAvx, UChar opc )
18748 Long delta0 = delta;
18749 UInt isISTRx = opc & 2;
18750 UInt isxSTRM = (opc & 1) ^ 1;
18751 UInt regNoL = 0;
18752 UInt regNoR = 0;
18753 UChar imm = 0;
18754 IRTemp addr = IRTemp_INVALID;
18755 Int alen = 0;
18756 HChar dis_buf[50];
18758 /* This is a nasty kludge. Ideally we would pass the 2 x V128 operands
18759 to a clean helper. Since we can't do that, use a dirty helper to
18760 compute the results directly from the XMM regs in the guest
18761 state. That means for the memory case, we need to move the left
18762 operand into a pseudo-register (XMM16, let's call it). */
18763 UChar modrm = getUChar(delta);
18764 if (epartIsReg(modrm)) {
18765 regNoL = eregOfRexRM(pfx, modrm);
18766 regNoR = gregOfRexRM(pfx, modrm);
18767 imm = getUChar(delta+1);
18768 delta += 1+1;
18769 } else {
18770 regNoL = 16; /* use XMM16 as an intermediary */
18771 regNoR = gregOfRexRM(pfx, modrm);
18772 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
18773 /* No alignment check; I guess that makes sense, given that
18774 these insns are for dealing with C-style strings. */
18775 stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
18776 imm = getUChar(delta+alen);
18777 delta += alen+1;
18780 /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
18781 itself. */
18782 if (regNoL == 16) {
18783 DIP("%spcmp%cstr%c $%x,%s,%s\n",
18784 isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
18785 (UInt)imm, dis_buf, nameXMMReg(regNoR));
18786 } else {
18787 DIP("%spcmp%cstr%c $%x,%s,%s\n",
18788 isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
18789 (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
18792 /* Handle special case(s). */
18793 if (imm == 0x3A && isISTRx && !isxSTRM) {
18794 return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
18795 opc, imm, dis_buf);
18798 /* Now we know the XMM reg numbers for the operands, and the
18799 immediate byte. Is it one we can actually handle? Throw out any
18800 cases for which the helper function has not been verified. */
18801 switch (imm) {
18802 case 0x00: case 0x02:
18803 case 0x08: case 0x0A: case 0x0C: case 0x0E:
18804 case 0x10: case 0x12: case 0x14:
18805 case 0x18: case 0x1A:
18806 case 0x30: case 0x34:
18807 case 0x38: case 0x3A:
18808 case 0x40: case 0x42: case 0x44: case 0x46:
18809 case 0x4A:
18810 case 0x62:
18811 case 0x70: case 0x72:
18812 break;
18813 // the 16-bit character versions of the above
18814 case 0x01: case 0x03:
18815 case 0x09: case 0x0B: case 0x0D:
18816 case 0x13:
18817 case 0x19: case 0x1B:
18818 case 0x39: case 0x3B:
18819 case 0x41: case 0x45:
18820 case 0x4B:
18821 break;
18822 default:
18823 return delta0; /*FAIL*/
18826 /* Who ya gonna call? Presumably not Ghostbusters. */
18827 void* fn = &amd64g_dirtyhelper_PCMPxSTRx;
18828 const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";
18830 /* Round up the arguments. Note that this is a kludge -- the use
18831 of mkU64 rather than mkIRExpr_HWord implies the assumption that
18832 the host's word size is 64-bit. */
18833 UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
18834 UInt gstOffR = ymmGuestRegOffset(regNoR);
18836 IRExpr* opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
18837 IRExpr* gstOffLe = mkU64(gstOffL);
18838 IRExpr* gstOffRe = mkU64(gstOffR);
18839 IRExpr* edxIN = isISTRx ? mkU64(0) : getIRegRDX(8);
18840 IRExpr* eaxIN = isISTRx ? mkU64(0) : getIRegRAX(8);
18841 IRExpr** args
18842 = mkIRExprVec_6( IRExpr_GSPTR(),
18843 opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );
18845 IRTemp resT = newTemp(Ity_I64);
18846 IRDirty* d = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
18847 /* It's not really a dirty call, but we can't use the clean helper
18848 mechanism here for the very lame reason that we can't pass 2 x
18849 V128s by value to a helper. Hence this roundabout scheme. */
18850 d->nFxState = 2;
18851 vex_bzero(&d->fxState, sizeof(d->fxState));
18852 d->fxState[0].fx = Ifx_Read;
18853 d->fxState[0].offset = gstOffL;
18854 d->fxState[0].size = sizeof(U128);
18855 d->fxState[1].fx = Ifx_Read;
18856 d->fxState[1].offset = gstOffR;
18857 d->fxState[1].size = sizeof(U128);
18858 if (isxSTRM) {
18859 /* Declare that the helper writes XMM0. */
18860 d->nFxState = 3;
18861 d->fxState[2].fx = Ifx_Write;
18862 d->fxState[2].offset = ymmGuestRegOffset(0);
18863 d->fxState[2].size = sizeof(U128);
18866 stmt( IRStmt_Dirty(d) );
18868 /* Now resT[15:0] holds the new OSZACP values, so the condition
18869 codes must be updated. And for a xSTRI case, resT[31:16] holds
18870 the new ECX value, so stash that too. */
18871 if (!isxSTRM) {
18872 putIReg64(R_RCX, binop(Iop_And64,
18873 binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
18874 mkU64(0xFFFF)));
18877 /* Zap the upper half of the dest reg as per AVX conventions. */
18878 if (isxSTRM && isAvx)
18879 putYMMRegLane128(/*YMM*/0, 1, mkV128(0));
18881 stmt( IRStmt_Put(
18882 OFFB_CC_DEP1,
18883 binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
18885 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
18886 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
18887 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
18889 return delta;
18893 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
18895 vassert(imm8 >= 0 && imm8 <= 15);
18897 // Create a V128 value which has the selected byte in the
18898 // specified lane, and zeroes everywhere else.
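// Worked example (illustrative, not from the original source): for
// imm8 == 9, halfshift is u8 << 8 and is placed in the upper 64-bit
// half, so the byte lands in lane 9 of tmp128; mask becomes
// ~(1 << 9) = 0xFDFF, a per-byte V128 mask that clears only lane 9 of
// the original vector before the OR below.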
18899 IRTemp tmp128 = newTemp(Ity_V128);
18900 IRTemp halfshift = newTemp(Ity_I64);
18901 assign(halfshift, binop(Iop_Shl64,
18902 unop(Iop_8Uto64, mkexpr(u8)),
18903 mkU8(8 * (imm8 & 7))));
18904 if (imm8 < 8) {
18905 assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
18906 } else {
18907 assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
18910 UShort mask = ~(1 << imm8);
18911 IRTemp res = newTemp(Ity_V128);
18912 assign( res, binop(Iop_OrV128,
18913 mkexpr(tmp128),
18914 binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
18915 return res;
18919 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
18921 IRTemp z32 = newTemp(Ity_I32);
18922 assign(z32, mkU32(0));
18924 /* Surround u32 with zeroes as per imm, giving us something we can
18925 OR into a suitably masked-out v128.*/
18926 IRTemp withZs = newTemp(Ity_V128);
18927 UShort mask = 0;
18928 switch (imm8) {
18929 case 3: mask = 0x0FFF;
18930 assign(withZs, mkV128from32s(u32, z32, z32, z32));
18931 break;
18932 case 2: mask = 0xF0FF;
18933 assign(withZs, mkV128from32s(z32, u32, z32, z32));
18934 break;
18935 case 1: mask = 0xFF0F;
18936 assign(withZs, mkV128from32s(z32, z32, u32, z32));
18937 break;
18938 case 0: mask = 0xFFF0;
18939 assign(withZs, mkV128from32s(z32, z32, z32, u32));
18940 break;
18941 default: vassert(0);
18944 IRTemp res = newTemp(Ity_V128);
18945 assign(res, binop( Iop_OrV128,
18946 mkexpr(withZs),
18947 binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
18948 return res;
18952 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
18954 /* Surround u64 with zeroes as per imm, giving us something we can
18955 OR into a suitably masked-out v128.*/
18956 IRTemp withZs = newTemp(Ity_V128);
18957 UShort mask = 0;
18958 if (imm8 == 0) {
18959 mask = 0xFF00;
18960 assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
18961 } else {
18962 vassert(imm8 == 1);
18963 mask = 0x00FF;
18964 assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
18967 IRTemp res = newTemp(Ity_V128);
18968 assign( res, binop( Iop_OrV128,
18969 mkexpr(withZs),
18970 binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
18971 return res;
18975 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
18977 const IRTemp inval = IRTemp_INVALID;
18978 IRTemp dstDs[4] = { inval, inval, inval, inval };
18979 breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
18981 vassert(imm8 <= 255);
18982 dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
18984 UInt imm8_zmask = (imm8 & 15);
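/* For reference -- a sketch of the INSERTPS imm8 layout per the Intel
   SDM, not stated elsewhere in this file: bits 7:6 select the source
   lane (handled by the caller when the source is a register), bits 5:4
   select the destination lane ("count_d", used just above), and bits
   3:0 form the zero mask applied below. */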
18985 IRTemp zero_32 = newTemp(Ity_I32);
18986 assign( zero_32, mkU32(0) );
18987 IRTemp resV = newTemp(Ity_V128);
18988 assign( resV, mkV128from32s(
18989 ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
18990 ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
18991 ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
18992 ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
18993 return resV;
18997 static Long dis_PEXTRB_128_GtoE ( const VexAbiInfo* vbi, Prefix pfx,
18998 Long delta, Bool isAvx )
19000 IRTemp addr = IRTemp_INVALID;
19001 Int alen = 0;
19002 HChar dis_buf[50];
19003 IRTemp xmm_vec = newTemp(Ity_V128);
19004 IRTemp sel_lane = newTemp(Ity_I32);
19005 IRTemp shr_lane = newTemp(Ity_I32);
19006 const HChar* mbV = isAvx ? "v" : "";
19007 UChar modrm = getUChar(delta);
19008 IRTemp t3, t2, t1, t0;
19009 Int imm8;
19010 assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
19011 t3 = t2 = t1 = t0 = IRTemp_INVALID;
19012 breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
19014 if ( epartIsReg( modrm ) ) {
19015 imm8 = (Int)getUChar(delta+1);
19016 } else {
19017 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19018 imm8 = (Int)getUChar(delta+alen);
19020 switch ( (imm8 >> 2) & 3 ) {
19021 case 0: assign( sel_lane, mkexpr(t0) ); break;
19022 case 1: assign( sel_lane, mkexpr(t1) ); break;
19023 case 2: assign( sel_lane, mkexpr(t2) ); break;
19024 case 3: assign( sel_lane, mkexpr(t3) ); break;
19025 default: vassert(0);
19027 assign( shr_lane,
19028 binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
19030 if ( epartIsReg( modrm ) ) {
19031 putIReg64( eregOfRexRM(pfx,modrm),
19032 unop( Iop_32Uto64,
19033 binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
19034 delta += 1+1;
19035 DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
19036 nameXMMReg( gregOfRexRM(pfx, modrm) ),
19037 nameIReg64( eregOfRexRM(pfx, modrm) ) );
19038 } else {
19039 storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
19040 delta += alen+1;
19041 DIP( "%spextrb $%d,%s,%s\n", mbV,
19042 imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
19045 return delta;
19049 static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
19051 vassert(imm8 < 256);
19052 UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
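/* Note (illustrative, not from the original source): each imm8_perms
   entry is a per-byte V128 mask covering the two 64-bit lanes, indexed
   by a 2-bit field of imm8.  Bits 5:4 of imm8 choose which products
   take part in the sum (the AND just below); bits 1:0 choose which
   result lanes receive that sum (the final AND). */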
19053 IRTemp and_vec = newTemp(Ity_V128);
19054 IRTemp sum_vec = newTemp(Ity_V128);
19055 IRTemp rm = newTemp(Ity_I32);
19056 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
19057 assign( and_vec, binop( Iop_AndV128,
19058 triop( Iop_Mul64Fx2,
19059 mkexpr(rm),
19060 mkexpr(dst_vec), mkexpr(src_vec) ),
19061 mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
19063 assign( sum_vec, binop( Iop_Add64F0x2,
19064 binop( Iop_InterleaveHI64x2,
19065 mkexpr(and_vec), mkexpr(and_vec) ),
19066 binop( Iop_InterleaveLO64x2,
19067 mkexpr(and_vec), mkexpr(and_vec) ) ) );
19068 IRTemp res = newTemp(Ity_V128);
19069 assign(res, binop( Iop_AndV128,
19070 binop( Iop_InterleaveLO64x2,
19071 mkexpr(sum_vec), mkexpr(sum_vec) ),
19072 mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
19073 return res;
19077 static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
19079 vassert(imm8 < 256);
19080 IRTemp tmp_prod_vec = newTemp(Ity_V128);
19081 IRTemp prod_vec = newTemp(Ity_V128);
19082 IRTemp sum_vec = newTemp(Ity_V128);
19083 IRTemp rm = newTemp(Ity_I32);
19084 IRTemp v3, v2, v1, v0;
19085 v3 = v2 = v1 = v0 = IRTemp_INVALID;
19086 UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
19087 0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
19088 0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
19089 0xFFFF };
19091 assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
19092 assign( tmp_prod_vec,
19093 binop( Iop_AndV128,
19094 triop( Iop_Mul32Fx4,
19095 mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
19096 mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
19097 breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
19098 assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );
19100 assign( sum_vec, triop( Iop_Add32Fx4,
19101 mkexpr(rm),
19102 binop( Iop_InterleaveHI32x4,
19103 mkexpr(prod_vec), mkexpr(prod_vec) ),
19104 binop( Iop_InterleaveLO32x4,
19105 mkexpr(prod_vec), mkexpr(prod_vec) ) ) );
19107 IRTemp res = newTemp(Ity_V128);
19108 assign( res, binop( Iop_AndV128,
19109 triop( Iop_Add32Fx4,
19110 mkexpr(rm),
19111 binop( Iop_InterleaveHI32x4,
19112 mkexpr(sum_vec), mkexpr(sum_vec) ),
19113 binop( Iop_InterleaveLO32x4,
19114 mkexpr(sum_vec), mkexpr(sum_vec) ) ),
19115 mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
19116 return res;
19120 static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
19122 /* Mask out bits of the operands we don't need. This isn't
19123 strictly necessary, but it does ensure Memcheck doesn't
19124 give us any false uninitialised value errors as a
19125 result. */
19126 UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
19127 UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
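/* Illustrative note (not from the original source): MPSADBW only reads
   a 4-byte block of the source at byte offset 4*(imm8 & 3) and an
   11-byte block of the destination at byte offset 4*((imm8 >> 2) & 1);
   the per-byte masks above (e.g. dst_mask[0] = 0x07FF = bytes 0..10)
   keep exactly those bytes and zero everything else. */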
19129 IRTemp src_maskV = newTemp(Ity_V128);
19130 IRTemp dst_maskV = newTemp(Ity_V128);
19131 assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
19132 assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
19134 IRTemp src_masked = newTemp(Ity_V128);
19135 IRTemp dst_masked = newTemp(Ity_V128);
19136 assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
19137 assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
19139 /* Generate 4 64 bit values that we can hand to a clean helper */
19140 IRTemp sHi = newTemp(Ity_I64);
19141 IRTemp sLo = newTemp(Ity_I64);
19142 assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
19143 assign( sLo, unop(Iop_V128to64, mkexpr(src_masked)) );
19145 IRTemp dHi = newTemp(Ity_I64);
19146 IRTemp dLo = newTemp(Ity_I64);
19147 assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
19148 assign( dLo, unop(Iop_V128to64, mkexpr(dst_masked)) );
19150 /* Compute halves of the result separately */
19151 IRTemp resHi = newTemp(Ity_I64);
19152 IRTemp resLo = newTemp(Ity_I64);
19154 IRExpr** argsHi
19155 = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
19156 mkU64( 0x80 | (imm8 & 7) ));
19157 IRExpr** argsLo
19158 = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
19159 mkU64( 0x00 | (imm8 & 7) ));
19161 assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
19162 "amd64g_calc_mpsadbw",
19163 &amd64g_calc_mpsadbw, argsHi ));
19164 assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
19165 "amd64g_calc_mpsadbw",
19166 &amd64g_calc_mpsadbw, argsLo ));
19168 IRTemp res = newTemp(Ity_V128);
19169 assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
19170 return res;
19173 static Long dis_EXTRACTPS ( const VexAbiInfo* vbi, Prefix pfx,
19174 Long delta, Bool isAvx )
19176 IRTemp addr = IRTemp_INVALID;
19177 Int alen = 0;
19178 HChar dis_buf[50];
19179 UChar modrm = getUChar(delta);
19180 Int imm8_10;
19181 IRTemp xmm_vec = newTemp(Ity_V128);
19182 IRTemp src_dword = newTemp(Ity_I32);
19183 UInt rG = gregOfRexRM(pfx,modrm);
19184 IRTemp t3, t2, t1, t0;
19185 t3 = t2 = t1 = t0 = IRTemp_INVALID;
19187 assign( xmm_vec, getXMMReg( rG ) );
19188 breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
19190 if ( epartIsReg( modrm ) ) {
19191 imm8_10 = (Int)(getUChar(delta+1) & 3);
19192 } else {
19193 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19194 imm8_10 = (Int)(getUChar(delta+alen) & 3);
19197 switch ( imm8_10 ) {
19198 case 0: assign( src_dword, mkexpr(t0) ); break;
19199 case 1: assign( src_dword, mkexpr(t1) ); break;
19200 case 2: assign( src_dword, mkexpr(t2) ); break;
19201 case 3: assign( src_dword, mkexpr(t3) ); break;
19202 default: vassert(0);
19205 if ( epartIsReg( modrm ) ) {
19206 UInt rE = eregOfRexRM(pfx,modrm);
19207 putIReg32( rE, mkexpr(src_dword) );
19208 delta += 1+1;
19209 DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
19210 nameXMMReg( rG ), nameIReg32( rE ) );
19211 } else {
19212 storeLE( mkexpr(addr), mkexpr(src_dword) );
19213 delta += alen+1;
19214 DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
19215 nameXMMReg( rG ), dis_buf );
19218 return delta;
19222 static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
19224 IRTemp t0 = newTemp(Ity_I64);
19225 IRTemp t1 = newTemp(Ity_I64);
19226 assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
19227 mkexpr(dV)));
19228 assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
19229 mkexpr(sV)));
19231 IRTemp t2 = newTemp(Ity_I64);
19232 IRTemp t3 = newTemp(Ity_I64);
19234 IRExpr** args;
19236 args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
19237 assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
19238 &amd64g_calculate_pclmul, args));
19239 args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
19240 assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
19241 &amd64g_calculate_pclmul, args));
19243 IRTemp res = newTemp(Ity_V128);
19244 assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
19245 return res;
19249 __attribute__((noinline))
19250 static
19251 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
19252 const VexAbiInfo* vbi,
19253 Prefix pfx, Int sz, Long deltaIN )
19255 IRTemp addr = IRTemp_INVALID;
19256 UChar modrm = 0;
19257 Int alen = 0;
19258 HChar dis_buf[50];
19260 *decode_OK = False;
19262 Long delta = deltaIN;
19263 UChar opc = getUChar(delta);
19264 delta++;
19265 switch (opc) {
19267 case 0x08:
19268 /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
19269 if (have66noF2noF3(pfx) && sz == 2) {
19271 IRTemp src0 = newTemp(Ity_F32);
19272 IRTemp src1 = newTemp(Ity_F32);
19273 IRTemp src2 = newTemp(Ity_F32);
19274 IRTemp src3 = newTemp(Ity_F32);
19275 IRTemp res0 = newTemp(Ity_F32);
19276 IRTemp res1 = newTemp(Ity_F32);
19277 IRTemp res2 = newTemp(Ity_F32);
19278 IRTemp res3 = newTemp(Ity_F32);
19279 IRTemp rm = newTemp(Ity_I32);
19280 Int imm = 0;
19282 modrm = getUChar(delta);
19284 if (epartIsReg(modrm)) {
19285 assign( src0,
19286 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
19287 assign( src1,
19288 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
19289 assign( src2,
19290 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
19291 assign( src3,
19292 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
19293 imm = getUChar(delta+1);
19294 if (imm & ~15) goto decode_failure;
19295 delta += 1+1;
19296 DIP( "roundps $%d,%s,%s\n",
19297 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19298 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19299 } else {
19300 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19301 gen_SIGNAL_if_not_16_aligned(vbi, addr);
19302 assign( src0, loadLE(Ity_F32,
19303 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
19304 assign( src1, loadLE(Ity_F32,
19305 binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
19306 assign( src2, loadLE(Ity_F32,
19307 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
19308 assign( src3, loadLE(Ity_F32,
19309 binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
19310 imm = getUChar(delta+alen);
19311 if (imm & ~15) goto decode_failure;
19312 delta += alen+1;
19313 DIP( "roundps $%d,%s,%s\n",
19314 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19317 /* (imm & 3) contains an Intel-encoded rounding mode. Because
19318 that encoding is the same as the encoding for IRRoundingMode,
19319 we can use that value directly in the IR as a rounding
19320 mode. */
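/* Concretely (an illustrative note, not in the original source): the
   encoding is 0 = nearest even, 1 = toward -inf, 2 = toward +inf,
   3 = toward zero, for both the ROUNDPS immediate and IRRoundingMode,
   which is why (imm & 3) can be used directly.  If bit 2 of the
   immediate is set, the current SSE (MXCSR) rounding mode is used
   instead, as the (imm & 4) test below shows. */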
19321 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
19323 assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
19324 assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
19325 assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
19326 assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
19328 putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
19329 putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
19330 putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
19331 putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
19333 goto decode_success;
19335 break;
19337 case 0x09:
19338 /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
19339 if (have66noF2noF3(pfx) && sz == 2) {
19341 IRTemp src0 = newTemp(Ity_F64);
19342 IRTemp src1 = newTemp(Ity_F64);
19343 IRTemp res0 = newTemp(Ity_F64);
19344 IRTemp res1 = newTemp(Ity_F64);
19345 IRTemp rm = newTemp(Ity_I32);
19346 Int imm = 0;
19348 modrm = getUChar(delta);
19350 if (epartIsReg(modrm)) {
19351 assign( src0,
19352 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
19353 assign( src1,
19354 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
19355 imm = getUChar(delta+1);
19356 if (imm & ~15) goto decode_failure;
19357 delta += 1+1;
19358 DIP( "roundpd $%d,%s,%s\n",
19359 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19360 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19361 } else {
19362 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19363 gen_SIGNAL_if_not_16_aligned(vbi, addr);
19364 assign( src0, loadLE(Ity_F64,
19365 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
19366 assign( src1, loadLE(Ity_F64,
19367 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
19368 imm = getUChar(delta+alen);
19369 if (imm & ~15) goto decode_failure;
19370 delta += alen+1;
19371 DIP( "roundpd $%d,%s,%s\n",
19372 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19375 /* (imm & 3) contains an Intel-encoded rounding mode. Because
19376 that encoding is the same as the encoding for IRRoundingMode,
19377 we can use that value directly in the IR as a rounding
19378 mode. */
19379 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
19381 assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
19382 assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
19384 putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
19385 putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
19387 goto decode_success;
19389 break;
19391 case 0x0A:
19392 case 0x0B:
19393 /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
19394 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1  */
19396 if (have66noF2noF3(pfx) && sz == 2) {
19398 Bool isD = opc == 0x0B;
19399 IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
19400 IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
19401 Int imm = 0;
19403 modrm = getUChar(delta);
19405 if (epartIsReg(modrm)) {
19406 assign( src,
19407 isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
19408 : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
19409 imm = getUChar(delta+1);
19410 if (imm & ~15) goto decode_failure;
19411 delta += 1+1;
19412 DIP( "rounds%c $%d,%s,%s\n",
19413 isD ? 'd' : 's',
19414 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19415 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19416 } else {
19417 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19418 assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
19419 imm = getUChar(delta+alen);
19420 if (imm & ~15) goto decode_failure;
19421 delta += alen+1;
19422 DIP( "rounds%c $%d,%s,%s\n",
19423 isD ? 'd' : 's',
19424 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19427 /* (imm & 3) contains an Intel-encoded rounding mode. Because
19428 that encoding is the same as the encoding for IRRoundingMode,
19429 we can use that value directly in the IR as a rounding
19430 mode. */
19431 assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
19432 (imm & 4) ? get_sse_roundingmode()
19433 : mkU32(imm & 3),
19434 mkexpr(src)) );
19436 if (isD)
19437 putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
19438 else
19439 putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
19441 goto decode_success;
19443 break;
19445 case 0x0C:
19446 /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
19447 Blend Packed Single Precision Floating-Point Values (XMM) */
19448 if (have66noF2noF3(pfx) && sz == 2) {
19450 Int imm8;
19451 IRTemp dst_vec = newTemp(Ity_V128);
19452 IRTemp src_vec = newTemp(Ity_V128);
19454 modrm = getUChar(delta);
19456 assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19458 if ( epartIsReg( modrm ) ) {
19459 imm8 = (Int)getUChar(delta+1);
19460 assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19461 delta += 1+1;
19462 DIP( "blendps $%d, %s,%s\n", imm8,
19463 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19464 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19465 } else {
19466 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19467 1/* imm8 is 1 byte after the amode */ );
19468 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19469 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19470 imm8 = (Int)getUChar(delta+alen);
19471 delta += alen+1;
19472 DIP( "blendps $%d, %s,%s\n",
19473 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19476 putXMMReg( gregOfRexRM(pfx, modrm),
19477 mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
19478 goto decode_success;
19480 break;
19482 case 0x0D:
19483 /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
19484 Blend Packed Double Precision Floating-Point Values (XMM) */
19485 if (have66noF2noF3(pfx) && sz == 2) {
19487 Int imm8;
19488 IRTemp dst_vec = newTemp(Ity_V128);
19489 IRTemp src_vec = newTemp(Ity_V128);
19491 modrm = getUChar(delta);
19492 assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19494 if ( epartIsReg( modrm ) ) {
19495 imm8 = (Int)getUChar(delta+1);
19496 assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19497 delta += 1+1;
19498 DIP( "blendpd $%d, %s,%s\n", imm8,
19499 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19500 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19501 } else {
19502 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19503 1/* imm8 is 1 byte after the amode */ );
19504 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19505 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19506 imm8 = (Int)getUChar(delta+alen);
19507 delta += alen+1;
19508 DIP( "blendpd $%d, %s,%s\n",
19509 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19512 putXMMReg( gregOfRexRM(pfx, modrm),
19513 mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
19514 goto decode_success;
19516 break;
19518 case 0x0E:
19519 /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
19520 Blend Packed Words (XMM) */
19521 if (have66noF2noF3(pfx) && sz == 2) {
19523 Int imm8;
19524 IRTemp dst_vec = newTemp(Ity_V128);
19525 IRTemp src_vec = newTemp(Ity_V128);
19527 modrm = getUChar(delta);
19529 assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19531 if ( epartIsReg( modrm ) ) {
19532 imm8 = (Int)getUChar(delta+1);
19533 assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19534 delta += 1+1;
19535 DIP( "pblendw $%d, %s,%s\n", imm8,
19536 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19537 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19538 } else {
19539 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19540 1/* imm8 is 1 byte after the amode */ );
19541 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19542 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19543 imm8 = (Int)getUChar(delta+alen);
19544 delta += alen+1;
19545 DIP( "pblendw $%d, %s,%s\n",
19546 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19549 putXMMReg( gregOfRexRM(pfx, modrm),
19550 mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
19551 goto decode_success;
19553 break;
19555 case 0x14:
19556 /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
19557 Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
19558 (XMM) */
19559 if (have66noF2noF3(pfx) && sz == 2) {
19560 delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
19561 goto decode_success;
19563 break;
19565 case 0x15:
19566 /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
19567 Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
19568 (XMM) */
19569 if (have66noF2noF3(pfx) && sz == 2) {
19570 delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
19571 goto decode_success;
19573 break;
19575 case 0x16:
19576 /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
19577 Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
19578 Note that this insn has the same opcodes as PEXTRQ, but
19579 here the REX.W bit is _not_ present */
19580 if (have66noF2noF3(pfx)
19581 && sz == 2 /* REX.W is _not_ present */) {
19582 delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
19583 goto decode_success;
19585 /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
19586 Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
19587 Note that this insn has the same opcodes as PEXTRD, but
19588 here the REX.W bit is present */
19589 if (have66noF2noF3(pfx)
19590 && sz == 8 /* REX.W is present */) {
19591 delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
19592 goto decode_success;
19594 break;
19596 case 0x17:
19597 /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
19598 float from xmm reg and store in gen.reg or mem. This is
19599 identical to PEXTRD, except that REX.W appears to be ignored. */
19601 if (have66noF2noF3(pfx)
19602 && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
19603 delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
19604 goto decode_success;
19606 break;
19608 case 0x20:
19609 /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
19610 Extract byte from r32/m8 and insert into xmm1 */
19611 if (have66noF2noF3(pfx) && sz == 2) {
19612 Int imm8;
19613 IRTemp new8 = newTemp(Ity_I8);
19614 modrm = getUChar(delta);
19615 UInt rG = gregOfRexRM(pfx, modrm);
19616 if ( epartIsReg( modrm ) ) {
19617 UInt rE = eregOfRexRM(pfx,modrm);
19618 imm8 = (Int)(getUChar(delta+1) & 0xF);
19619 assign( new8, unop(Iop_32to8, getIReg32(rE)) );
19620 delta += 1+1;
19621 DIP( "pinsrb $%d,%s,%s\n", imm8,
19622 nameIReg32(rE), nameXMMReg(rG) );
19623 } else {
19624 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19625 imm8 = (Int)(getUChar(delta+alen) & 0xF);
19626 assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
19627 delta += alen+1;
19628 DIP( "pinsrb $%d,%s,%s\n",
19629 imm8, dis_buf, nameXMMReg(rG) );
19631 IRTemp src_vec = newTemp(Ity_V128);
19632 assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
19633 IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
19634 putXMMReg( rG, mkexpr(res) );
19635 goto decode_success;
19637 break;
19639 case 0x21:
19640 /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
19641 Insert Packed Single Precision Floating-Point Value (XMM) */
19642 if (have66noF2noF3(pfx) && sz == 2) {
19643 UInt imm8;
19644 IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
19645 const IRTemp inval = IRTemp_INVALID;
19647 modrm = getUChar(delta);
19648 UInt rG = gregOfRexRM(pfx, modrm);
19650 if ( epartIsReg( modrm ) ) {
19651 UInt rE = eregOfRexRM(pfx, modrm);
19652 IRTemp vE = newTemp(Ity_V128);
19653 assign( vE, getXMMReg(rE) );
19654 IRTemp dsE[4] = { inval, inval, inval, inval };
19655 breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
19656 imm8 = getUChar(delta+1);
19657 d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
19658 delta += 1+1;
19659 DIP( "insertps $%u, %s,%s\n",
19660 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19661 } else {
19662 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19663 assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
19664 imm8 = getUChar(delta+alen);
19665 delta += alen+1;
19666 DIP( "insertps $%u, %s,%s\n",
19667 imm8, dis_buf, nameXMMReg(rG) );
19670 IRTemp vG = newTemp(Ity_V128);
19671 assign( vG, getXMMReg(rG) );
19673 putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
19674 goto decode_success;
19676 break;
19678 case 0x22:
19679 /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
19680 Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
19681 if (have66noF2noF3(pfx)
19682 && sz == 2 /* REX.W is NOT present */) {
19683 Int imm8_10;
19684 IRTemp src_u32 = newTemp(Ity_I32);
19685 modrm = getUChar(delta);
19686 UInt rG = gregOfRexRM(pfx, modrm);
19688 if ( epartIsReg( modrm ) ) {
19689 UInt rE = eregOfRexRM(pfx,modrm);
19690 imm8_10 = (Int)(getUChar(delta+1) & 3);
19691 assign( src_u32, getIReg32( rE ) );
19692 delta += 1+1;
19693 DIP( "pinsrd $%d, %s,%s\n",
19694 imm8_10, nameIReg32(rE), nameXMMReg(rG) );
19695 } else {
19696 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19697 imm8_10 = (Int)(getUChar(delta+alen) & 3);
19698 assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
19699 delta += alen+1;
19700 DIP( "pinsrd $%d, %s,%s\n",
19701 imm8_10, dis_buf, nameXMMReg(rG) );
19704 IRTemp src_vec = newTemp(Ity_V128);
19705 assign(src_vec, getXMMReg( rG ));
19706 IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
19707 putXMMReg( rG, mkexpr(res_vec) );
19708 goto decode_success;
19710 /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
19711 Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
19712 if (have66noF2noF3(pfx)
19713 && sz == 8 /* REX.W is present */) {
19714 Int imm8_0;
19715 IRTemp src_u64 = newTemp(Ity_I64);
19716 modrm = getUChar(delta);
19717 UInt rG = gregOfRexRM(pfx, modrm);
19719 if ( epartIsReg( modrm ) ) {
19720 UInt rE = eregOfRexRM(pfx,modrm);
19721 imm8_0 = (Int)(getUChar(delta+1) & 1);
19722 assign( src_u64, getIReg64( rE ) );
19723 delta += 1+1;
19724 DIP( "pinsrq $%d, %s,%s\n",
19725 imm8_0, nameIReg64(rE), nameXMMReg(rG) );
19726 } else {
19727 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19728 imm8_0 = (Int)(getUChar(delta+alen) & 1);
19729 assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
19730 delta += alen+1;
19731 DIP( "pinsrq $%d, %s,%s\n",
19732 imm8_0, dis_buf, nameXMMReg(rG) );
19735 IRTemp src_vec = newTemp(Ity_V128);
19736 assign(src_vec, getXMMReg( rG ));
19737 IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
19738 putXMMReg( rG, mkexpr(res_vec) );
19739 goto decode_success;
19741 break;
19743 case 0x40:
19744 /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
19745 Dot Product of Packed Single Precision Floating-Point Values (XMM) */
19746 if (have66noF2noF3(pfx) && sz == 2) {
19747 modrm = getUChar(delta);
19748 Int imm8;
19749 IRTemp src_vec = newTemp(Ity_V128);
19750 IRTemp dst_vec = newTemp(Ity_V128);
19751 UInt rG = gregOfRexRM(pfx, modrm);
19752 assign( dst_vec, getXMMReg( rG ) );
19753 if ( epartIsReg( modrm ) ) {
19754 UInt rE = eregOfRexRM(pfx, modrm);
19755 imm8 = (Int)getUChar(delta+1);
19756 assign( src_vec, getXMMReg(rE) );
19757 delta += 1+1;
19758 DIP( "dpps $%d, %s,%s\n",
19759 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19760 } else {
19761 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19762 1/* imm8 is 1 byte after the amode */ );
19763 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19764 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19765 imm8 = (Int)getUChar(delta+alen);
19766 delta += alen+1;
19767 DIP( "dpps $%d, %s,%s\n",
19768 imm8, dis_buf, nameXMMReg(rG) );
19770 IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
19771 putXMMReg( rG, mkexpr(res) );
19772 goto decode_success;
19774 break;
19776 case 0x41:
19777 /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
19778 Dot Product of Packed Double Precision Floating-Point Values (XMM) */
19779 if (have66noF2noF3(pfx) && sz == 2) {
19780 modrm = getUChar(delta);
19781 Int imm8;
19782 IRTemp src_vec = newTemp(Ity_V128);
19783 IRTemp dst_vec = newTemp(Ity_V128);
19784 UInt rG = gregOfRexRM(pfx, modrm);
19785 assign( dst_vec, getXMMReg( rG ) );
19786 if ( epartIsReg( modrm ) ) {
19787 UInt rE = eregOfRexRM(pfx, modrm);
19788 imm8 = (Int)getUChar(delta+1);
19789 assign( src_vec, getXMMReg(rE) );
19790 delta += 1+1;
19791 DIP( "dppd $%d, %s,%s\n",
19792 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19793 } else {
19794 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19795 1/* imm8 is 1 byte after the amode */ );
19796 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19797 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19798 imm8 = (Int)getUChar(delta+alen);
19799 delta += alen+1;
19800 DIP( "dppd $%d, %s,%s\n",
19801 imm8, dis_buf, nameXMMReg(rG) );
19803 IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
19804 putXMMReg( rG, mkexpr(res) );
19805 goto decode_success;
19807 break;
19809 case 0x42:
19810 /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
19811 Multiple Packed Sums of Absolute Differences (XMM) */
19812 if (have66noF2noF3(pfx) && sz == 2) {
19813 Int imm8;
19814 IRTemp src_vec = newTemp(Ity_V128);
19815 IRTemp dst_vec = newTemp(Ity_V128);
19816 modrm = getUChar(delta);
19817 UInt rG = gregOfRexRM(pfx, modrm);
19819 assign( dst_vec, getXMMReg(rG) );
19821 if ( epartIsReg( modrm ) ) {
19822 UInt rE = eregOfRexRM(pfx, modrm);
19824 imm8 = (Int)getUChar(delta+1);
19825 assign( src_vec, getXMMReg(rE) );
19826 delta += 1+1;
19827 DIP( "mpsadbw $%d, %s,%s\n", imm8,
19828 nameXMMReg(rE), nameXMMReg(rG) );
19829 } else {
19830 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19831 1/* imm8 is 1 byte after the amode */ );
19832 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19833 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19834 imm8 = (Int)getUChar(delta+alen);
19835 delta += alen+1;
19836 DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
19839 putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
19840 goto decode_success;
19842 break;
19844 case 0x44:
19845 /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
19846 * Carry-less multiplication of selected XMM quadwords into XMM
19847 * registers (a.k.a. multiplication of polynomials over GF(2))
19849 if (have66noF2noF3(pfx) && sz == 2) {
19851 Int imm8;
19852 IRTemp svec = newTemp(Ity_V128);
19853 IRTemp dvec = newTemp(Ity_V128);
19854 modrm = getUChar(delta);
19855 UInt rG = gregOfRexRM(pfx, modrm);
19857 assign( dvec, getXMMReg(rG) );
19859 if ( epartIsReg( modrm ) ) {
19860 UInt rE = eregOfRexRM(pfx, modrm);
19861 imm8 = (Int)getUChar(delta+1);
19862 assign( svec, getXMMReg(rE) );
19863 delta += 1+1;
19864 DIP( "pclmulqdq $%d, %s,%s\n", imm8,
19865 nameXMMReg(rE), nameXMMReg(rG) );
19866 } else {
19867 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19868 1/* imm8 is 1 byte after the amode */ );
19869 gen_SIGNAL_if_not_16_aligned( vbi, addr );
19870 assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
19871 imm8 = (Int)getUChar(delta+alen);
19872 delta += alen+1;
19873 DIP( "pclmulqdq $%d, %s,%s\n",
19874 imm8, dis_buf, nameXMMReg(rG) );
19877 putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
19878 goto decode_success;
19880 break;
19882 case 0x60:
19883 case 0x61:
19884 case 0x62:
19885 case 0x63:
19886 /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
19887 66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
19888 66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
19889 66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
19890 (selected special cases that actually occur in glibc,
19891 not by any means a complete implementation.)
19893 if (have66noF2noF3(pfx) && sz == 2) {
19894 Long delta0 = delta;
19895 delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
19896 if (delta > delta0) goto decode_success;
19897 /* else fall through; dis_PCMPxSTRx failed to decode it */
19899 break;
19901 case 0xDF:
19902 /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
19903 if (have66noF2noF3(pfx) && sz == 2) {
19904 delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
19905 goto decode_success;
19907 break;
19909 default:
19910 break;
19914 decode_failure:
19915 *decode_OK = False;
19916 return deltaIN;
19918 decode_success:
19919 *decode_OK = True;
19920 return delta;
19924 /*------------------------------------------------------------*/
19925 /*--- ---*/
19926 /*--- Top-level post-escape decoders: dis_ESC_NONE ---*/
19927 /*--- ---*/
19928 /*------------------------------------------------------------*/
19930 __attribute__((noinline))
19931 static
19932 Long dis_ESC_NONE (
19933 /*MB_OUT*/DisResult* dres,
19934 /*MB_OUT*/Bool* expect_CAS,
19935 const VexArchInfo* archinfo,
19936 const VexAbiInfo* vbi,
19937 Prefix pfx, Int sz, Long deltaIN
19940 Long d64 = 0;
19941 UChar abyte = 0;
19942 IRTemp addr = IRTemp_INVALID;
19943 IRTemp t1 = IRTemp_INVALID;
19944 IRTemp t2 = IRTemp_INVALID;
19945 IRTemp t3 = IRTemp_INVALID;
19946 IRTemp t4 = IRTemp_INVALID;
19947 IRTemp t5 = IRTemp_INVALID;
19948 IRType ty = Ity_INVALID;
19949 UChar modrm = 0;
19950 Int am_sz = 0;
19951 Int d_sz = 0;
19952 Int alen = 0;
19953 HChar dis_buf[50];
19955 Long delta = deltaIN;
19956 UChar opc = getUChar(delta); delta++;
19958 /* delta now points at the modrm byte. In most of the cases that
19959 follow, neither the F2 nor F3 prefixes are allowed. However,
19960 for some basic arithmetic operations we have to allow F2/XACQ or
19961 F3/XREL in the case where the destination is memory and the LOCK
19962 prefix is also present. Do this check by looking at the modrm
19963 byte but not advancing delta over it. */
19964 /* By default, F2 and F3 are not allowed, so let's start off with
19965 that setting. */
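/* Illustrative example (not from the original source): an encoding such
   as F2 F0 01 07 ("xacquire lock add %eax,(%rdi)") carries an F2 prefix
   that must be accepted here, but only because the destination is
   memory and LOCK is present; the same F2 on a register-to-register
   add would be rejected below. */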
19966 Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
19967 { UChar tmp_modrm = getUChar(delta);
19968 switch (opc) {
19969 case 0x00: /* ADD Gb,Eb */ case 0x01: /* ADD Gv,Ev */
19970 case 0x08: /* OR Gb,Eb */ case 0x09: /* OR Gv,Ev */
19971 case 0x10: /* ADC Gb,Eb */ case 0x11: /* ADC Gv,Ev */
19972 case 0x18: /* SBB Gb,Eb */ case 0x19: /* SBB Gv,Ev */
19973 case 0x20: /* AND Gb,Eb */ case 0x21: /* AND Gv,Ev */
19974 case 0x28: /* SUB Gb,Eb */ case 0x29: /* SUB Gv,Ev */
19975 case 0x30: /* XOR Gb,Eb */ case 0x31: /* XOR Gv,Ev */
19976 if (!epartIsReg(tmp_modrm)
19977 && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
19978 /* dst is mem, and we have F2 or F3 but not both */
19979 validF2orF3 = True;
19981 break;
19982 default:
19983 break;
19987 /* Now, in the switch below, for the opc values examined by the
19988 switch above, use validF2orF3 rather than looking at pfx
19989 directly. */
19990 switch (opc) {
19992 case 0x00: /* ADD Gb,Eb */
19993 if (!validF2orF3) goto decode_failure;
19994 delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
19995 return delta;
19996 case 0x01: /* ADD Gv,Ev */
19997 if (!validF2orF3) goto decode_failure;
19998 delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
19999 return delta;
20001 case 0x02: /* ADD Eb,Gb */
20002 if (haveF2orF3(pfx)) goto decode_failure;
20003 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
20004 return delta;
20005 case 0x03: /* ADD Ev,Gv */
20006 if (haveF2orF3(pfx)) goto decode_failure;
20007 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
20008 return delta;
20010 case 0x04: /* ADD Ib, AL */
20011 if (haveF2orF3(pfx)) goto decode_failure;
20012 delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
20013 return delta;
20014 case 0x05: /* ADD Iv, eAX */
20015 if (haveF2orF3(pfx)) goto decode_failure;
20016 delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
20017 return delta;
20019 case 0x08: /* OR Gb,Eb */
20020 if (!validF2orF3) goto decode_failure;
20021 delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
20022 return delta;
20023 case 0x09: /* OR Gv,Ev */
20024 if (!validF2orF3) goto decode_failure;
20025 delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
20026 return delta;
20028 case 0x0A: /* OR Eb,Gb */
20029 if (haveF2orF3(pfx)) goto decode_failure;
20030 delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
20031 return delta;
20032 case 0x0B: /* OR Ev,Gv */
20033 if (haveF2orF3(pfx)) goto decode_failure;
20034 delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
20035 return delta;
20037 case 0x0C: /* OR Ib, AL */
20038 if (haveF2orF3(pfx)) goto decode_failure;
20039 delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
20040 return delta;
20041 case 0x0D: /* OR Iv, eAX */
20042 if (haveF2orF3(pfx)) goto decode_failure;
20043 delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
20044 return delta;
20046 case 0x10: /* ADC Gb,Eb */
20047 if (!validF2orF3) goto decode_failure;
20048 delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
20049 return delta;
20050 case 0x11: /* ADC Gv,Ev */
20051 if (!validF2orF3) goto decode_failure;
20052 delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
20053 return delta;
20055 case 0x12: /* ADC Eb,Gb */
20056 if (haveF2orF3(pfx)) goto decode_failure;
20057 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
20058 return delta;
20059 case 0x13: /* ADC Ev,Gv */
20060 if (haveF2orF3(pfx)) goto decode_failure;
20061 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
20062 return delta;
20064 case 0x14: /* ADC Ib, AL */
20065 if (haveF2orF3(pfx)) goto decode_failure;
20066 delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
20067 return delta;
20068 case 0x15: /* ADC Iv, eAX */
20069 if (haveF2orF3(pfx)) goto decode_failure;
20070 delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
20071 return delta;
20073 case 0x18: /* SBB Gb,Eb */
20074 if (!validF2orF3) goto decode_failure;
20075 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
20076 return delta;
20077 case 0x19: /* SBB Gv,Ev */
20078 if (!validF2orF3) goto decode_failure;
20079 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
20080 return delta;
20082 case 0x1A: /* SBB Eb,Gb */
20083 if (haveF2orF3(pfx)) goto decode_failure;
20084 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
20085 return delta;
20086 case 0x1B: /* SBB Ev,Gv */
20087 if (haveF2orF3(pfx)) goto decode_failure;
20088 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
20089 return delta;
20091 case 0x1C: /* SBB Ib, AL */
20092 if (haveF2orF3(pfx)) goto decode_failure;
20093 delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
20094 return delta;
20095 case 0x1D: /* SBB Iv, eAX */
20096 if (haveF2orF3(pfx)) goto decode_failure;
20097 delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
20098 return delta;
20100 case 0x20: /* AND Gb,Eb */
20101 if (!validF2orF3) goto decode_failure;
20102 delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
20103 return delta;
20104 case 0x21: /* AND Gv,Ev */
20105 if (!validF2orF3) goto decode_failure;
20106 delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
20107 return delta;
20109 case 0x22: /* AND Eb,Gb */
20110 if (haveF2orF3(pfx)) goto decode_failure;
20111 delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
20112 return delta;
20113 case 0x23: /* AND Ev,Gv */
20114 if (haveF2orF3(pfx)) goto decode_failure;
20115 delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
20116 return delta;
20118 case 0x24: /* AND Ib, AL */
20119 if (haveF2orF3(pfx)) goto decode_failure;
20120 delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
20121 return delta;
20122 case 0x25: /* AND Iv, eAX */
20123 if (haveF2orF3(pfx)) goto decode_failure;
20124 delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
20125 return delta;
20127 case 0x28: /* SUB Gb,Eb */
20128 if (!validF2orF3) goto decode_failure;
20129 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
20130 return delta;
20131 case 0x29: /* SUB Gv,Ev */
20132 if (!validF2orF3) goto decode_failure;
20133 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
20134 return delta;
20136 case 0x2A: /* SUB Eb,Gb */
20137 if (haveF2orF3(pfx)) goto decode_failure;
20138 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
20139 return delta;
20140 case 0x2B: /* SUB Ev,Gv */
20141 if (haveF2orF3(pfx)) goto decode_failure;
20142 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
20143 return delta;
20145 case 0x2C: /* SUB Ib, AL */
20146 if (haveF2orF3(pfx)) goto decode_failure;
20147 delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
20148 return delta;
20149 case 0x2D: /* SUB Iv, eAX */
20150 if (haveF2orF3(pfx)) goto decode_failure;
20151 delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
20152 return delta;
20154 case 0x30: /* XOR Gb,Eb */
20155 if (!validF2orF3) goto decode_failure;
20156 delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
20157 return delta;
20158 case 0x31: /* XOR Gv,Ev */
20159 if (!validF2orF3) goto decode_failure;
20160 delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
20161 return delta;
20163 case 0x32: /* XOR Eb,Gb */
20164 if (haveF2orF3(pfx)) goto decode_failure;
20165 delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
20166 return delta;
20167 case 0x33: /* XOR Ev,Gv */
20168 if (haveF2orF3(pfx)) goto decode_failure;
20169 delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
20170 return delta;
20172 case 0x34: /* XOR Ib, AL */
20173 if (haveF2orF3(pfx)) goto decode_failure;
20174 delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
20175 return delta;
20176 case 0x35: /* XOR Iv, eAX */
20177 if (haveF2orF3(pfx)) goto decode_failure;
20178 delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
20179 return delta;
20181 case 0x38: /* CMP Gb,Eb */
20182 if (haveF2orF3(pfx)) goto decode_failure;
20183 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
20184 return delta;
20185 case 0x39: /* CMP Gv,Ev */
20186 if (haveF2orF3(pfx)) goto decode_failure;
20187 delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
20188 return delta;
20190 case 0x3A: /* CMP Eb,Gb */
20191 if (haveF2orF3(pfx)) goto decode_failure;
20192 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
20193 return delta;
20194 case 0x3B: /* CMP Ev,Gv */
20195 if (haveF2orF3(pfx)) goto decode_failure;
20196 delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
20197 return delta;
20199 case 0x3C: /* CMP Ib, AL */
20200 if (haveF2orF3(pfx)) goto decode_failure;
20201 delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
20202 return delta;
20203 case 0x3D: /* CMP Iv, eAX */
20204 if (haveF2orF3(pfx)) goto decode_failure;
20205 delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
20206 return delta;
20208 case 0x50: /* PUSH eAX */
20209 case 0x51: /* PUSH eCX */
20210 case 0x52: /* PUSH eDX */
20211 case 0x53: /* PUSH eBX */
20212 case 0x55: /* PUSH eBP */
20213 case 0x56: /* PUSH eSI */
20214 case 0x57: /* PUSH eDI */
20215 case 0x54: /* PUSH eSP */
20216 /* This is the Right Way, in that the value to be pushed is
20217 established before %rsp is changed, so that pushq %rsp
20218 correctly pushes the old value. */
20219 if (haveF2orF3(pfx)) goto decode_failure;
20220 vassert(sz == 2 || sz == 4 || sz == 8);
20221 if (sz == 4)
20222 sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
20223 ty = sz==2 ? Ity_I16 : Ity_I64;
20224 t1 = newTemp(ty);
20225 t2 = newTemp(Ity_I64);
20226 assign(t1, getIRegRexB(sz, pfx, opc-0x50));
20227 assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
20228 putIReg64(R_RSP, mkexpr(t2) );
20229 storeLE(mkexpr(t2),mkexpr(t1));
20230 DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
20231 return delta;
20233 case 0x58: /* POP eAX */
20234 case 0x59: /* POP eCX */
20235 case 0x5A: /* POP eDX */
20236 case 0x5B: /* POP eBX */
20237 case 0x5D: /* POP eBP */
20238 case 0x5E: /* POP eSI */
20239 case 0x5F: /* POP eDI */
20240 case 0x5C: /* POP eSP */
20241 if (haveF2orF3(pfx)) goto decode_failure;
20242 vassert(sz == 2 || sz == 4 || sz == 8);
20243 if (sz == 4)
20244 sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
20245 t1 = newTemp(szToITy(sz));
20246 t2 = newTemp(Ity_I64);
20247 assign(t2, getIReg64(R_RSP));
20248 assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
20249 putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
20250 putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
20251 DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
20252 return delta;
20254 case 0x63: /* MOVSX */
20255 if (haveF2orF3(pfx)) goto decode_failure;
20256 if (haveREX(pfx) && 1==getRexW(pfx)) {
20257 vassert(sz == 8);
20258 /* movsx r/m32 to r64 */
20259 modrm = getUChar(delta);
20260 if (epartIsReg(modrm)) {
20261 delta++;
20262 putIRegG(8, pfx, modrm,
20263 unop(Iop_32Sto64,
20264 getIRegE(4, pfx, modrm)));
20265 DIP("movslq %s,%s\n",
20266 nameIRegE(4, pfx, modrm),
20267 nameIRegG(8, pfx, modrm));
20268 return delta;
20269 } else {
20270 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20271 delta += alen;
20272 putIRegG(8, pfx, modrm,
20273 unop(Iop_32Sto64,
20274 loadLE(Ity_I32, mkexpr(addr))));
20275 DIP("movslq %s,%s\n", dis_buf,
20276 nameIRegG(8, pfx, modrm));
20277 return delta;
20279 } else {
20280 goto decode_failure;
20283 case 0x68: /* PUSH Iv */
20284 if (haveF2orF3(pfx)) goto decode_failure;
20285 /* Note, sz==4 is not possible in 64-bit mode. Hence ... */
20286 if (sz == 4) sz = 8;
20287 d64 = getSDisp(imin(4,sz),delta);
20288 delta += imin(4,sz);
20289 goto do_push_I;
20291 case 0x69: /* IMUL Iv, Ev, Gv */
20292 if (haveF2orF3(pfx)) goto decode_failure;
20293 delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
20294 return delta;
20296 case 0x6A: /* PUSH Ib, sign-extended to sz */
20297 if (haveF2orF3(pfx)) goto decode_failure;
20298 /* Note, sz==4 is not possible in 64-bit mode. Hence ... */
20299 if (sz == 4) sz = 8;
20300 d64 = getSDisp8(delta); delta += 1;
20301 goto do_push_I;
20302 do_push_I:
20303 ty = szToITy(sz);
20304 t1 = newTemp(Ity_I64);
20305 t2 = newTemp(ty);
20306 assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
20307 putIReg64(R_RSP, mkexpr(t1) );
20308 /* stop mkU16 asserting if d64 is a negative 16-bit number
20309 (bug #132813) */
20310 if (ty == Ity_I16)
20311 d64 &= 0xFFFF;
20312 storeLE( mkexpr(t1), mkU(ty,d64) );
20313 DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
20314 return delta;
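/* Illustrative sketch, not part of the translator: the masking above keeps
   a sign-extended immediate representable as an unsigned 16-bit constant
   (two's-complement assumed), e.g.

      long long d64 = -1;                                      // sign-extended imm8
      unsigned short imm16 = (unsigned short)(d64 & 0xFFFF);   // 0xFFFF, no assert
*/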
20316 case 0x6B: /* IMUL Ib, Ev, Gv */
20317 delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
20318 return delta;
20320 case 0x70:
20321 case 0x71:
20322 case 0x72: /* JBb/JNAEb (jump below) */
20323 case 0x73: /* JNBb/JAEb (jump not below) */
20324 case 0x74: /* JZb/JEb (jump zero) */
20325 case 0x75: /* JNZb/JNEb (jump not zero) */
20326 case 0x76: /* JBEb/JNAb (jump below or equal) */
20327 case 0x77: /* JNBEb/JAb (jump not below or equal) */
20328 case 0x78: /* JSb (jump negative) */
20329 case 0x79: /* JNSb (jump not negative) */
20330 case 0x7A: /* JP (jump parity even) */
20331 case 0x7B: /* JNP/JPO (jump parity odd) */
20332 case 0x7C: /* JLb/JNGEb (jump less) */
20333 case 0x7D: /* JGEb/JNLb (jump greater or equal) */
20334 case 0x7E: /* JLEb/JNGb (jump less or equal) */
20335 case 0x7F: { /* JGb/JNLEb (jump greater) */
20336 Long jmpDelta;
20337 const HChar* comment = "";
20338 if (haveF3(pfx)) goto decode_failure;
20339 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20340 jmpDelta = getSDisp8(delta);
20341 vassert(-128 <= jmpDelta && jmpDelta < 128);
20342 d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
20343 delta++;
20344 /* End the block at this point. */
20345 jcc_01( dres, (AMD64Condcode)(opc - 0x70),
20346 guest_RIP_bbstart+delta, d64 );
20347 vassert(dres->whatNext == Dis_StopHere);
20348 DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), (ULong)d64,
20349 comment);
20350 return delta;
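/* Illustrative sketch, not part of the translator: the branch target
   computed above is "address of the next instruction plus the signed 8-bit
   displacement".  In plain C (names hypothetical):

      unsigned long long jcc8_target(unsigned long long bbstart,
                                     unsigned long long delta_of_disp8,
                                     signed char disp8)
      {
         unsigned long long next_insn = bbstart + delta_of_disp8 + 1;
         return next_insn + (long long)disp8;
      }
*/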
20353 case 0x80: /* Grp1 Ib,Eb */
20354 modrm = getUChar(delta);
20355 /* Disallow F2/XACQ and F3/XREL for the non-mem case. Allow
20356 just one for the mem case and also require LOCK in this case.
20357 Note that this erroneously allows XACQ/XREL on CMP since we
20358 don't check the subopcode here. No big deal. */
20359 if (epartIsReg(modrm) && haveF2orF3(pfx))
20360 goto decode_failure;
20361 if (!epartIsReg(modrm) && haveF2andF3(pfx))
20362 goto decode_failure;
20363 if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
20364 goto decode_failure;
20365 am_sz = lengthAMode(pfx,delta);
20366 sz = 1;
20367 d_sz = 1;
20368 d64 = getSDisp8(delta + am_sz);
20369 delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20370 return delta;
20372 case 0x81: /* Grp1 Iv,Ev */
20373 modrm = getUChar(delta);
20374 /* Same comment as for case 0x80 just above. */
20375 if (epartIsReg(modrm) && haveF2orF3(pfx))
20376 goto decode_failure;
20377 if (!epartIsReg(modrm) && haveF2andF3(pfx))
20378 goto decode_failure;
20379 if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
20380 goto decode_failure;
20381 am_sz = lengthAMode(pfx,delta);
20382 d_sz = imin(sz,4);
20383 d64 = getSDisp(d_sz, delta + am_sz);
20384 delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20385 return delta;
20387 case 0x83: /* Grp1 Ib,Ev */
20388 if (haveF2orF3(pfx)) goto decode_failure;
20389 modrm = getUChar(delta);
20390 am_sz = lengthAMode(pfx,delta);
20391 d_sz = 1;
20392 d64 = getSDisp8(delta + am_sz);
20393 delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20394 return delta;
20396 case 0x84: /* TEST Eb,Gb */
20397 if (haveF2orF3(pfx)) goto decode_failure;
20398 delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
20399 1, delta, "test" );
20400 return delta;
20402 case 0x85: /* TEST Ev,Gv */
20403 if (haveF2orF3(pfx)) goto decode_failure;
20404 delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
20405 sz, delta, "test" );
20406 return delta;
20408 /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
20409 prefix. Therefore, generate CAS regardless of the presence or
20410 otherwise of a LOCK prefix. */
20411 case 0x86: /* XCHG Gb,Eb */
20412 sz = 1;
20413 /* Fall through ... */
20414 case 0x87: /* XCHG Gv,Ev */
20415 modrm = getUChar(delta);
20416 /* Check whether F2 or F3 are allowable. For the mem case, one
20417 or the other but not both are. We don't care about the
20418 presence of LOCK in this case -- XCHG is unusual in this
20419 respect. */
20420 if (haveF2orF3(pfx)) {
20421 if (epartIsReg(modrm)) {
20422 goto decode_failure;
20423 } else {
20424 if (haveF2andF3(pfx))
20425 goto decode_failure;
20428 ty = szToITy(sz);
20429 t1 = newTemp(ty); t2 = newTemp(ty);
20430 if (epartIsReg(modrm)) {
20431 assign(t1, getIRegE(sz, pfx, modrm));
20432 assign(t2, getIRegG(sz, pfx, modrm));
20433 putIRegG(sz, pfx, modrm, mkexpr(t1));
20434 putIRegE(sz, pfx, modrm, mkexpr(t2));
20435 delta++;
20436 DIP("xchg%c %s, %s\n",
20437 nameISize(sz), nameIRegG(sz, pfx, modrm),
20438 nameIRegE(sz, pfx, modrm));
20439 } else {
20440 *expect_CAS = True;
20441 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20442 assign( t1, loadLE(ty, mkexpr(addr)) );
20443 assign( t2, getIRegG(sz, pfx, modrm) );
20444 casLE( mkexpr(addr),
20445 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
20446 putIRegG( sz, pfx, modrm, mkexpr(t1) );
20447 delta += alen;
20448 DIP("xchg%c %s, %s\n", nameISize(sz),
20449 nameIRegG(sz, pfx, modrm), dis_buf);
20451 return delta;
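/* Illustrative sketch, not part of the translator: for the memory form, the
   load + casLE pair above models the implicit LOCK# of XCHG; the
   guest-visible effect is an atomic exchange, roughly (C11 atomics,
   illustration only, 64-bit case):

      #include <stdatomic.h>
      unsigned long long xchg64(_Atomic unsigned long long *p,
                                unsigned long long newval)
      {
         return atomic_exchange(p, newval);   // old value lands in the G reg
      }
*/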
20453 case 0x88: { /* MOV Gb,Eb */
20454 /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
20455 Bool ok = True;
20456 delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
20457 if (!ok) goto decode_failure;
20458 return delta;
20461 case 0x89: { /* MOV Gv,Ev */
20462 /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
20463 Bool ok = True;
20464 delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
20465 if (!ok) goto decode_failure;
20466 return delta;
20469 case 0x8A: /* MOV Eb,Gb */
20470 if (haveF2orF3(pfx)) goto decode_failure;
20471 delta = dis_mov_E_G(vbi, pfx, 1, delta);
20472 return delta;
20474 case 0x8B: /* MOV Ev,Gv */
20475 if (haveF2orF3(pfx)) goto decode_failure;
20476 delta = dis_mov_E_G(vbi, pfx, sz, delta);
20477 return delta;
20479 case 0x8C: /* MOV S,E -- MOV from a SEGMENT REGISTER */
20480 if (haveF2orF3(pfx)) goto decode_failure;
20481 delta = dis_mov_S_E(vbi, pfx, sz, delta);
20482 return delta;
20484 case 0x8D: /* LEA M,Gv */
20485 if (haveF2orF3(pfx)) goto decode_failure;
20486 if (sz != 4 && sz != 8)
20487 goto decode_failure;
20488 modrm = getUChar(delta);
20489 if (epartIsReg(modrm))
20490 goto decode_failure;
20491 /* NOTE! this is the one place where a segment override prefix
20492 has no effect on the address calculation. Therefore we clear
20493 any segment override bits in pfx. */
20494 addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
20495 delta += alen;
20496 /* This is a hack. But it isn't clear that doing the
20497 calculation at 32 bits is really worth it. Hence for leal,
20498 do the full 64-bit calculation and then truncate it. */
20499 putIRegG( sz, pfx, modrm,
20500 sz == 4
20501 ? unop(Iop_64to32, mkexpr(addr))
20502 : mkexpr(addr)
20504 DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
20505 nameIRegG(sz,pfx,modrm));
20506 return delta;
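/* Illustrative sketch, not part of the translator: for "leal" the address
   is formed at 64 bits and then truncated, e.g. lea 4(%rdi,%rsi,2),%eax
   behaves like

      unsigned int lea32_example(unsigned long long rdi, unsigned long long rsi)
      {
         return (unsigned int)(rdi + 2*rsi + 4);   // 64-bit calc, then Iop_64to32
      }
*/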
20508 case 0x8F: { /* POPQ m64 / POPW m16 */
20509 Int len;
20510 UChar rm;
20511 /* There is no encoding for 32-bit pop in 64-bit mode.
20512 So sz==4 actually means sz==8. */
20513 if (haveF2orF3(pfx)) goto decode_failure;
20514 vassert(sz == 2 || sz == 4
20515 || /* tolerate redundant REX.W, see #210481 */ sz == 8);
20516 if (sz == 4) sz = 8;
20517 if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20519 rm = getUChar(delta);
20521 /* make sure this instruction is correct POP */
20522 if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
20523 goto decode_failure;
20524 /* and has correct size */
20525 vassert(sz == 8);
20527 t1 = newTemp(Ity_I64);
20528 t3 = newTemp(Ity_I64);
20529 assign( t1, getIReg64(R_RSP) );
20530 assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
20532 /* Increase RSP; must be done before the STORE. Intel manual
20533 says: If the RSP register is used as a base register for
20534 addressing a destination operand in memory, the POP
20535 instruction computes the effective address of the operand
20536 after it increments the RSP register. */
20537 putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
20539 addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
20540 storeLE( mkexpr(addr), mkexpr(t3) );
20542 DIP("popq %s\n", dis_buf);
20544 delta += len;
20545 return delta;
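/* Illustrative sketch, not part of the translator (load64/store64 are
   hypothetical stand-ins for loadLE/storeLE): the ordering implemented
   above, shown for the corner case "popq 8(%rsp)":

      unsigned long long sp = RSP;
      unsigned long long v  = load64(sp);   // t3
      RSP = sp + 8;                         // RSP bumped first ...
      store64(RSP + 8, v);                  // ... destination EA uses the new RSP
*/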
20548 case 0x90: /* XCHG eAX,eAX */
20549 /* detect and handle F3 90 (rep nop) specially */
20550 if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
20551 DIP("rep nop (P4 pause)\n");
20552 /* "observe" the hint. The Vex client needs to be careful not
20553 to cause very long delays as a result, though. */
20554 jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
20555 vassert(dres->whatNext == Dis_StopHere);
20556 return delta;
20558 /* detect and handle NOPs specially */
20559 if (/* F2/F3 probably change meaning completely */
20560 !haveF2orF3(pfx)
20561 /* If REX.B is 1, we're not exchanging rAX with itself */
20562 && getRexB(pfx)==0 ) {
20563 DIP("nop\n");
20564 return delta;
20566 /* else fall through to normal case. */
20567 case 0x91: /* XCHG rAX,rCX */
20568 case 0x92: /* XCHG rAX,rDX */
20569 case 0x93: /* XCHG rAX,rBX */
20570 case 0x94: /* XCHG rAX,rSP */
20571 case 0x95: /* XCHG rAX,rBP */
20572 case 0x96: /* XCHG rAX,rSI */
20573 case 0x97: /* XCHG rAX,rDI */
20574 /* guard against mutancy */
20575 if (haveF2orF3(pfx)) goto decode_failure;
20576 codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
20577 return delta;
20579 case 0x98: /* CBW */
20580 if (haveF2orF3(pfx)) goto decode_failure;
20581 if (sz == 8) {
20582 putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
20583 DIP(/*"cdqe\n"*/"cltq\n");
20584 return delta;
20586 if (sz == 4) {
20587 putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
20588 DIP("cwtl\n");
20589 return delta;
20591 if (sz == 2) {
20592 putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
20593 DIP("cbw\n");
20594 return delta;
20596 goto decode_failure;
20598 case 0x99: /* CWD/CDQ/CQO */
20599 if (haveF2orF3(pfx)) goto decode_failure;
20600 vassert(sz == 2 || sz == 4 || sz == 8);
20601 ty = szToITy(sz);
20602 putIRegRDX( sz,
20603 binop(mkSizedOp(ty,Iop_Sar8),
20604 getIRegRAX(sz),
20605 mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
20606 DIP(sz == 2 ? "cwd\n"
20607 : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
20608 : "cqo\n"));
20609 return delta;
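/* Illustrative sketch, not part of the translator: the sign-fill above is
   an arithmetic right shift of rAX by (width-1) bits, e.g. for CQO
   (assuming the compiler's >> on a signed value is arithmetic):

      long long cqo_rdx(long long rax) { return rax >> 63; }   // 0 or -1

   and likewise >>31 for CDQ and >>15 for CWD. */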
20611 case 0x9B: /* FWAIT (X87 insn) */
20612 /* ignore? */
20613 DIP("fwait\n");
20614 return delta;
20616 case 0x9C: /* PUSHF */ {
20617 /* Note. There is no encoding for a 32-bit pushf in 64-bit
20618 mode. So sz==4 actually means sz==8. */
20619 /* 24 July 06: has also been seen with a redundant REX prefix,
20620 so must also allow sz==8. */
20621 if (haveF2orF3(pfx)) goto decode_failure;
20622 vassert(sz == 2 || sz == 4 || sz == 8);
20623 if (sz == 4) sz = 8;
20624 if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20626 t1 = newTemp(Ity_I64);
20627 assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
20628 putIReg64(R_RSP, mkexpr(t1) );
20630 t2 = newTemp(Ity_I64);
20631 assign( t2, mk_amd64g_calculate_rflags_all() );
20633 /* Patch in the D flag. This can simply be a copy of bit 10 of
20634 baseBlock[OFFB_DFLAG]. */
20635 t3 = newTemp(Ity_I64);
20636 assign( t3, binop(Iop_Or64,
20637 mkexpr(t2),
20638 binop(Iop_And64,
20639 IRExpr_Get(OFFB_DFLAG,Ity_I64),
20640 mkU64(1<<10)))
20643 /* And patch in the ID flag. */
20644 t4 = newTemp(Ity_I64);
20645 assign( t4, binop(Iop_Or64,
20646 mkexpr(t3),
20647 binop(Iop_And64,
20648 binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
20649 mkU8(21)),
20650 mkU64(1<<21)))
20653 /* And patch in the AC flag too. */
20654 t5 = newTemp(Ity_I64);
20655 assign( t5, binop(Iop_Or64,
20656 mkexpr(t4),
20657 binop(Iop_And64,
20658 binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
20659 mkU8(18)),
20660 mkU64(1<<18)))
20663 /* if sz==2, the stored value needs to be narrowed. */
20664 if (sz == 2)
20665 storeLE( mkexpr(t1), unop(Iop_32to16,
20666 unop(Iop_64to32,mkexpr(t5))) );
20667 else
20668 storeLE( mkexpr(t1), mkexpr(t5) );
20670 DIP("pushf%c\n", nameISize(sz));
20671 return delta;
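/* Illustrative sketch, not part of the translator: the pushed value is the
   lazily-computed OSZACP bits with D, ID and AC patched in from their
   separate guest-state fields, roughly (function name hypothetical):

      unsigned long long rflags_image(unsigned long long oszacp,
                                      unsigned long long dflag,   // 1 or -1
                                      unsigned long long idflag,  // 0 or 1
                                      unsigned long long acflag)  // 0 or 1
      {
         unsigned long long r = oszacp;
         r |= dflag & (1ULL << 10);            // DF: bit 10 set iff dflag == -1
         r |= (idflag << 21) & (1ULL << 21);   // ID: bit 21
         r |= (acflag << 18) & (1ULL << 18);   // AC: bit 18
         return r;
      }
*/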
20674 case 0x9D: /* POPF */
20675 /* Note. There is no encoding for a 32-bit popf in 64-bit mode.
20676 So sz==4 actually means sz==8. */
20677 if (haveF2orF3(pfx)) goto decode_failure;
20678 vassert(sz == 2 || sz == 4 || sz == 8);
20679 if (sz == 4) sz = 8;
20680 if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20681 t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
20682 assign(t2, getIReg64(R_RSP));
20683 assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
20684 putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
20685 /* t1 is the flag word. Mask out everything except OSZACP and
20686 set the flags thunk to AMD64G_CC_OP_COPY. */
20687 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
20688 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
20689 stmt( IRStmt_Put( OFFB_CC_DEP1,
20690 binop(Iop_And64,
20691 mkexpr(t1),
20692 mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
20693 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
20694 | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
20698 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
20700 /* Also need to set the D flag, which is held in bit 10 of t1.
20701 If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
20702 stmt( IRStmt_Put(
20703 OFFB_DFLAG,
20704 IRExpr_ITE(
20705 unop(Iop_64to1,
20706 binop(Iop_And64,
20707 binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
20708 mkU64(1))),
20709 mkU64(0xFFFFFFFFFFFFFFFFULL),
20710 mkU64(1)))
20713 /* And set the ID flag */
20714 stmt( IRStmt_Put(
20715 OFFB_IDFLAG,
20716 IRExpr_ITE(
20717 unop(Iop_64to1,
20718 binop(Iop_And64,
20719 binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
20720 mkU64(1))),
20721 mkU64(1),
20722 mkU64(0)))
20725 /* And set the AC flag too */
20726 stmt( IRStmt_Put(
20727 OFFB_ACFLAG,
20728 IRExpr_ITE(
20729 unop(Iop_64to1,
20730 binop(Iop_And64,
20731 binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
20732 mkU64(1))),
20733 mkU64(1),
20734 mkU64(0)))
20737 DIP("popf%c\n", nameISize(sz));
20738 return delta;
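/* Illustrative sketch, not part of the translator (OSZACP_MASK is a
   hypothetical stand-in for the OR of the six AMD64G_CC_MASK_* constants
   used above): the guest-state fields recovered from the popped word t1 are

      unsigned long long cc_dep1 = t1 & OSZACP_MASK;              // COPY thunk
      long long          dflag   = ((t1 >> 10) & 1) ? -1LL : 1LL;
      unsigned long long idflag  =  (t1 >> 21) & 1;
      unsigned long long acflag  =  (t1 >> 18) & 1;
*/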
20740 case 0x9E: /* SAHF */
20741 codegen_SAHF();
20742 DIP("sahf\n");
20743 return delta;
20745 case 0x9F: /* LAHF */
20746 codegen_LAHF();
20747 DIP("lahf\n");
20748 return delta;
20750 case 0xA0: /* MOV Ob,AL */
20751 if (have66orF2orF3(pfx)) goto decode_failure;
20752 sz = 1;
20753 /* Fall through ... */
20754 case 0xA1: /* MOV Ov,eAX */
20755 if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20756 goto decode_failure;
20757 d64 = getDisp64(delta);
20758 delta += 8;
20759 ty = szToITy(sz);
20760 addr = newTemp(Ity_I64);
20761 assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20762 putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
20763 DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
20764 segRegTxt(pfx), (ULong)d64,
20765 nameIRegRAX(sz));
20766 return delta;
20768 case 0xA2: /* MOV AL,Ob */
20769 if (have66orF2orF3(pfx)) goto decode_failure;
20770 sz = 1;
20771 /* Fall through ... */
20772 case 0xA3: /* MOV eAX,Ov */
20773 if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20774 goto decode_failure;
20775 d64 = getDisp64(delta);
20776 delta += 8;
20777 ty = szToITy(sz);
20778 addr = newTemp(Ity_I64);
20779 assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20780 storeLE( mkexpr(addr), getIRegRAX(sz) );
20781 DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
20782 segRegTxt(pfx), (ULong)d64);
20783 return delta;
20785 case 0xA4:
20786 case 0xA5:
20787 /* F3 A4/A5: rep movsb / rep movs{w,l,q} */
20788 if (haveF3(pfx) && !haveF2(pfx)) {
20789 if (opc == 0xA4)
20790 sz = 1;
20791 dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
20792 guest_RIP_curr_instr,
20793 guest_RIP_bbstart+delta, "rep movs", pfx );
20794 dres->whatNext = Dis_StopHere;
20795 return delta;
20797 /* A4/A5: movsb / movs{w,l,q} */
20798 if (!haveF3(pfx) && !haveF2(pfx)) {
20799 if (opc == 0xA4)
20800 sz = 1;
20801 dis_string_op( dis_MOVS, sz, "movs", pfx );
20802 return delta;
20804 goto decode_failure;
20806 case 0xA6:
20807 case 0xA7:
20808 /* F3 A6/A7: repe cmpsb / repe cmps{w,l,q} */
20809 if (haveF3(pfx) && !haveF2(pfx)) {
20810 if (opc == 0xA6)
20811 sz = 1;
20812 dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
20813 guest_RIP_curr_instr,
20814 guest_RIP_bbstart+delta, "repe cmps", pfx );
20815 dres->whatNext = Dis_StopHere;
20816 return delta;
20818 goto decode_failure;
20820 case 0xAA:
20821 case 0xAB:
20822 /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
20823 if (haveF3(pfx) && !haveF2(pfx)) {
20824 if (opc == 0xAA)
20825 sz = 1;
20826 dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
20827 guest_RIP_curr_instr,
20828 guest_RIP_bbstart+delta, "rep stos", pfx );
20829 vassert(dres->whatNext == Dis_StopHere);
20830 return delta;
20832 /* AA/AB: stosb/stos{w,l,q} */
20833 if (!haveF3(pfx) && !haveF2(pfx)) {
20834 if (opc == 0xAA)
20835 sz = 1;
20836 dis_string_op( dis_STOS, sz, "stos", pfx );
20837 return delta;
20839 goto decode_failure;
20841 case 0xA8: /* TEST Ib, AL */
20842 if (haveF2orF3(pfx)) goto decode_failure;
20843 delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
20844 return delta;
20845 case 0xA9: /* TEST Iv, eAX */
20846 if (haveF2orF3(pfx)) goto decode_failure;
20847 delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
20848 return delta;
20850 case 0xAC: /* LODS, no REP prefix */
20851 case 0xAD:
20852 dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
20853 return delta;
20855 case 0xAE:
20856 case 0xAF:
20857 /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
20858 if (haveF2(pfx) && !haveF3(pfx)) {
20859 if (opc == 0xAE)
20860 sz = 1;
20861 dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
20862 guest_RIP_curr_instr,
20863 guest_RIP_bbstart+delta, "repne scas", pfx );
20864 vassert(dres->whatNext == Dis_StopHere);
20865 return delta;
20867 /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
20868 if (!haveF2(pfx) && haveF3(pfx)) {
20869 if (opc == 0xAE)
20870 sz = 1;
20871 dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
20872 guest_RIP_curr_instr,
20873 guest_RIP_bbstart+delta, "repe scas", pfx );
20874 vassert(dres->whatNext == Dis_StopHere);
20875 return delta;
20877 /* AE/AF: scasb/scas{w,l,q} */
20878 if (!haveF2(pfx) && !haveF3(pfx)) {
20879 if (opc == 0xAE)
20880 sz = 1;
20881 dis_string_op( dis_SCAS, sz, "scas", pfx );
20882 return delta;
20884 goto decode_failure;
20886 /* XXXX be careful here with moves to AH/BH/CH/DH */
20887 case 0xB0: /* MOV imm,AL */
20888 case 0xB1: /* MOV imm,CL */
20889 case 0xB2: /* MOV imm,DL */
20890 case 0xB3: /* MOV imm,BL */
20891 case 0xB4: /* MOV imm,AH */
20892 case 0xB5: /* MOV imm,CH */
20893 case 0xB6: /* MOV imm,DH */
20894 case 0xB7: /* MOV imm,BH */
20895 if (haveF2orF3(pfx)) goto decode_failure;
20896 d64 = getUChar(delta);
20897 delta += 1;
20898 putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
20899 DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
20900 return delta;
20902 case 0xB8: /* MOV imm,eAX */
20903 case 0xB9: /* MOV imm,eCX */
20904 case 0xBA: /* MOV imm,eDX */
20905 case 0xBB: /* MOV imm,eBX */
20906 case 0xBC: /* MOV imm,eSP */
20907 case 0xBD: /* MOV imm,eBP */
20908 case 0xBE: /* MOV imm,eSI */
20909 case 0xBF: /* MOV imm,eDI */
20910 /* This is the one-and-only place where 64-bit literals are
20911 allowed in the instruction stream. */
20912 if (haveF2orF3(pfx)) goto decode_failure;
20913 if (sz == 8) {
20914 d64 = getDisp64(delta);
20915 delta += 8;
20916 putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
20917 DIP("movabsq $%lld,%s\n", (Long)d64,
20918 nameIRegRexB(8,pfx,opc-0xB8));
20919 } else {
20920 d64 = getSDisp(imin(4,sz),delta);
20921 delta += imin(4,sz);
20922 putIRegRexB(sz, pfx, opc-0xB8,
20923 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20924 DIP("mov%c $%lld,%s\n", nameISize(sz),
20925 (Long)d64,
20926 nameIRegRexB(sz,pfx,opc-0xB8));
20928 return delta;
20930 case 0xC0: { /* Grp2 Ib,Eb */
20931 Bool decode_OK = True;
20932 if (haveF2orF3(pfx)) goto decode_failure;
20933 modrm = getUChar(delta);
20934 am_sz = lengthAMode(pfx,delta);
20935 d_sz = 1;
20936 d64 = getUChar(delta + am_sz);
20937 sz = 1;
20938 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20939 mkU8(d64 & 0xFF), NULL, &decode_OK );
20940 if (!decode_OK) goto decode_failure;
20941 return delta;
20944 case 0xC1: { /* Grp2 Ib,Ev */
20945 Bool decode_OK = True;
20946 if (haveF2orF3(pfx)) goto decode_failure;
20947 modrm = getUChar(delta);
20948 am_sz = lengthAMode(pfx,delta);
20949 d_sz = 1;
20950 d64 = getUChar(delta + am_sz);
20951 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20952 mkU8(d64 & 0xFF), NULL, &decode_OK );
20953 if (!decode_OK) goto decode_failure;
20954 return delta;
20957 case 0xC2: /* RET imm16 */
20958 if (have66orF3(pfx)) goto decode_failure;
20959 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20960 d64 = getUDisp16(delta);
20961 delta += 2;
20962 dis_ret(dres, vbi, d64);
20963 DIP("ret $%lld\n", d64);
20964 return delta;
20966 case 0xC3: /* RET */
20967 if (have66(pfx)) goto decode_failure;
20968 /* F3 is acceptable on AMD. */
20969 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20970 dis_ret(dres, vbi, 0);
20971 DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
20972 return delta;
20974 case 0xC6: /* C6 /0 = MOV Ib,Eb */
20975 sz = 1;
20976 goto maybe_do_Mov_I_E;
20977 case 0xC7: /* C7 /0 = MOV Iv,Ev */
20978 goto maybe_do_Mov_I_E;
20979 maybe_do_Mov_I_E:
20980 modrm = getUChar(delta);
20981 if (gregLO3ofRM(modrm) == 0) {
20982 if (epartIsReg(modrm)) {
20983 /* Neither F2 nor F3 are allowable. */
20984 if (haveF2orF3(pfx)) goto decode_failure;
20985 delta++; /* mod/rm byte */
20986 d64 = getSDisp(imin(4,sz),delta);
20987 delta += imin(4,sz);
20988 putIRegE(sz, pfx, modrm,
20989 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20990 DIP("mov%c $%lld, %s\n", nameISize(sz),
20991 (Long)d64,
20992 nameIRegE(sz,pfx,modrm));
20993 } else {
20994 if (haveF2(pfx)) goto decode_failure;
20995 /* F3(XRELEASE) is allowable here */
20996 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
20997 /*xtra*/imin(4,sz) );
20998 delta += alen;
20999 d64 = getSDisp(imin(4,sz),delta);
21000 delta += imin(4,sz);
21001 storeLE(mkexpr(addr),
21002 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
21003 DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
21005 return delta;
21007 /* BEGIN HACKY SUPPORT FOR xbegin */
21008 if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
21009 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21010 delta++; /* mod/rm byte */
21011 d64 = getSDisp(4,delta);
21012 delta += 4;
21013 guest_RIP_next_mustcheck = True;
21014 guest_RIP_next_assumed = guest_RIP_bbstart + delta;
21015 Addr64 failAddr = guest_RIP_bbstart + delta + d64;
21016 /* EAX contains the failure status code. Bit 3 is "Set if an
21017 internal buffer overflowed", which seems like the
21018 least-bogus choice we can make here. */
21019 putIRegRAX(4, mkU32(1<<3));
21020 /* And jump to the fail address. */
21021 jmp_lit(dres, Ijk_Boring, failAddr);
21022 vassert(dres->whatNext == Dis_StopHere);
21023 DIP("xbeginq 0x%llx\n", failAddr);
21024 return delta;
21026 /* END HACKY SUPPORT FOR xbegin */
21027 /* BEGIN HACKY SUPPORT FOR xabort */
21028 if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
21029 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21030 delta++; /* mod/rm byte */
21031 abyte = getUChar(delta); delta++;
21032 /* There is never a real transaction in progress, so do nothing. */
21033 DIP("xabort $%d\n", (Int)abyte);
21034 return delta;
21036 /* END HACKY SUPPORT FOR xabort */
21037 goto decode_failure;
21039 case 0xC8: /* ENTER */
21040 /* Same comments re operand size as for LEAVE below apply.
21041 Also, only handles the case "enter $imm16, $0"; other cases
21042 for the second operand (nesting depth) are not handled. */
21043 if (sz != 4)
21044 goto decode_failure;
21045 d64 = getUDisp16(delta);
21046 delta += 2;
21047 vassert(d64 >= 0 && d64 <= 0xFFFF);
21048 if (getUChar(delta) != 0)
21049 goto decode_failure;
21050 delta++;
21051 /* Intel docs seem to suggest:
21052 push rbp
21053 temp = rsp
21054 rbp = temp
21055 rsp = rsp - imm16
21057 t1 = newTemp(Ity_I64);
21058 assign(t1, getIReg64(R_RBP));
21059 t2 = newTemp(Ity_I64);
21060 assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
21061 putIReg64(R_RSP, mkexpr(t2));
21062 storeLE(mkexpr(t2), mkexpr(t1));
21063 putIReg64(R_RBP, mkexpr(t2));
21064 if (d64 > 0) {
21065 putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
21067 DIP("enter $%u, $0\n", (UInt)d64);
21068 return delta;
21070 case 0xC9: /* LEAVE */
21071 /* In 64-bit mode this defaults to a 64-bit operand size. There
21072 is no way to encode a 32-bit variant. Hence sz==4 but we do
21073 it as if sz=8. */
21074 if (sz != 4)
21075 goto decode_failure;
21076 t1 = newTemp(Ity_I64);
21077 t2 = newTemp(Ity_I64);
21078 assign(t1, getIReg64(R_RBP));
21079 /* First PUT RSP looks redundant, but need it because RSP must
21080 always be up-to-date for Memcheck to work... */
21081 putIReg64(R_RSP, mkexpr(t1));
21082 assign(t2, loadLE(Ity_I64,mkexpr(t1)));
21083 putIReg64(R_RBP, mkexpr(t2));
21084 putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
21085 DIP("leave\n");
21086 return delta;
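/* Illustrative sketch, not part of the translator: LEAVE above is simply

      movq %rbp, %rsp
      popq %rbp

   i.e. RSP = RBP; RBP = load64(RSP); RSP += 8; with the intermediate PUT
   of RSP kept so Memcheck always sees an up-to-date stack pointer. */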
21088 case 0xCC: /* INT 3 */
21089 jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
21090 vassert(dres->whatNext == Dis_StopHere);
21091 DIP("int $0x3\n");
21092 return delta;
21094 case 0xCD: /* INT imm8 */
21095 d64 = getUChar(delta); delta++;
21097 /* Handle int $0xD2 (Solaris fasttrap syscalls). */
21098 if (d64 == 0xD2) {
21099 jmp_lit(dres, Ijk_Sys_int210, guest_RIP_bbstart + delta);
21100 vassert(dres->whatNext == Dis_StopHere);
21101 DIP("int $0xD2\n");
21102 return delta;
21104 goto decode_failure;
21106 case 0xCF: /* IRET */
21107 /* Note, this is an extremely kludgey and limited implementation of iret
21108 based on the extremely kludgey and limited implementation of iret for x86
21109 popq %RIP; popl %CS; popq %RFLAGS; popq %RSP; popl %SS
21110 %CS and %SS are ignored */
21111 if (sz != 8 || have66orF2orF3(pfx)) goto decode_failure;
21113 t1 = newTemp(Ity_I64); /* RSP */
21114 t2 = newTemp(Ity_I64); /* new RIP */
21115 /* t3 = newTemp(Ity_I32); new CS */
21116 t4 = newTemp(Ity_I64); /* new RFLAGS */
21117 t5 = newTemp(Ity_I64); /* new RSP */
21118 /* t6 = newTemp(Ity_I32); new SS */
21120 assign(t1, getIReg64(R_RSP));
21121 assign(t2, loadLE(Ity_I64, binop(Iop_Add64,mkexpr(t1),mkU64(0))));
21122 /* assign(t3, loadLE(Ity_I32, binop(Iop_Add64,mkexpr(t1),mkU64(8)))); */
21123 assign(t4, loadLE(Ity_I64, binop(Iop_Add64,mkexpr(t1),mkU64(16))));
21124 assign(t5, loadLE(Ity_I64, binop(Iop_Add64,mkexpr(t1),mkU64(24))));
21125 /* assign(t6, loadLE(Ity_I32, binop(Iop_Add64,mkexpr(t1),mkU64(32)))); */
21127 /* set %RFLAGS */
21128 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
21129 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21130 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21131 stmt( IRStmt_Put( OFFB_CC_DEP1,
21132 binop(Iop_And64,
21133 mkexpr(t4),
21134 mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
21135 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
21136 | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
21141 /* Also need to set the D flag, which is held in bit 10 of t4.
21142 If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
21143 stmt( IRStmt_Put(
21144 OFFB_DFLAG,
21145 IRExpr_ITE(
21146 unop(Iop_64to1,
21147 binop(Iop_And64,
21148 binop(Iop_Shr64, mkexpr(t4), mkU8(10)),
21149 mkU64(1))),
21150 mkU64(0xFFFFFFFFFFFFFFFFULL),
21151 mkU64(1)))
21154 /* And set the ID flag */
21155 stmt( IRStmt_Put(
21156 OFFB_IDFLAG,
21157 IRExpr_ITE(
21158 unop(Iop_64to1,
21159 binop(Iop_And64,
21160 binop(Iop_Shr64, mkexpr(t4), mkU8(21)),
21161 mkU64(1))),
21162 mkU64(1),
21163 mkU64(0)))
21166 /* And set the AC flag too */
21167 stmt( IRStmt_Put(
21168 OFFB_ACFLAG,
21169 IRExpr_ITE(
21170 unop(Iop_64to1,
21171 binop(Iop_And64,
21172 binop(Iop_Shr64, mkexpr(t4), mkU8(18)),
21173 mkU64(1))),
21174 mkU64(1),
21175 mkU64(0)))
21179 /* set new stack */
21180 putIReg64(R_RSP, mkexpr(t5));
21182 /* goto new RIP value */
21183 jmp_treg(dres, Ijk_Ret, t2);
21184 DIP("iret (very kludgey)\n");
21185 return delta;
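/* Illustrative sketch, not part of the translator: the stack image consumed
   by the kludgey iret above, as a hypothetical little-endian C struct laid
   out upwards from the incoming RSP:

      struct iret_frame {
         unsigned long long rip;      // +0,  t2: jump target
         unsigned long long cs;       // +8,  ignored
         unsigned long long rflags;   // +16, t4: copied into the flags thunk
         unsigned long long rsp;      // +24, t5: becomes the new RSP
         unsigned long long ss;       // +32, ignored
      };
*/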
21187 case 0xD0: { /* Grp2 1,Eb */
21188 Bool decode_OK = True;
21189 if (haveF2orF3(pfx)) goto decode_failure;
21190 modrm = getUChar(delta);
21191 am_sz = lengthAMode(pfx,delta);
21192 d_sz = 0;
21193 d64 = 1;
21194 sz = 1;
21195 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21196 mkU8(d64), NULL, &decode_OK );
21197 if (!decode_OK) goto decode_failure;
21198 return delta;
21201 case 0xD1: { /* Grp2 1,Ev */
21202 Bool decode_OK = True;
21203 if (haveF2orF3(pfx)) goto decode_failure;
21204 modrm = getUChar(delta);
21205 am_sz = lengthAMode(pfx,delta);
21206 d_sz = 0;
21207 d64 = 1;
21208 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21209 mkU8(d64), NULL, &decode_OK );
21210 if (!decode_OK) goto decode_failure;
21211 return delta;
21214 case 0xD2: { /* Grp2 CL,Eb */
21215 Bool decode_OK = True;
21216 if (haveF2orF3(pfx)) goto decode_failure;
21217 modrm = getUChar(delta);
21218 am_sz = lengthAMode(pfx,delta);
21219 d_sz = 0;
21220 sz = 1;
21221 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21222 getIRegCL(), "%cl", &decode_OK );
21223 if (!decode_OK) goto decode_failure;
21224 return delta;
21227 case 0xD3: { /* Grp2 CL,Ev */
21228 Bool decode_OK = True;
21229 if (haveF2orF3(pfx)) goto decode_failure;
21230 modrm = getUChar(delta);
21231 am_sz = lengthAMode(pfx,delta);
21232 d_sz = 0;
21233 delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21234 getIRegCL(), "%cl", &decode_OK );
21235 if (!decode_OK) goto decode_failure;
21236 return delta;
21239 case 0xD8: /* X87 instructions */
21240 case 0xD9:
21241 case 0xDA:
21242 case 0xDB:
21243 case 0xDC:
21244 case 0xDD:
21245 case 0xDE:
21246 case 0xDF: {
21247 Bool redundantREXWok = False;
21249 if (haveF2orF3(pfx))
21250 goto decode_failure;
21252 /* kludge to tolerate redundant rex.w prefixes (should do this
21253 properly one day) */
21254 /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
21255 if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
21256 redundantREXWok = True;
21258 Bool size_OK = False;
21259 if ( sz == 4 )
21260 size_OK = True;
21261 else if ( sz == 8 )
21262 size_OK = redundantREXWok;
21263 else if ( sz == 2 ) {
21264 int mod_rm = getUChar(delta+0);
21265 int reg = gregLO3ofRM(mod_rm);
21266 /* The HotSpot JVM uses these */
21267 if ( (opc == 0xDD) && (reg == 0 /* FLDL */ ||
21268 reg == 4 /* FNSAVE */ ||
21269 reg == 6 /* FRSTOR */ ) )
21270 size_OK = True;
21272 /* AMD manual says 0x66 size override is ignored, except where
21273 it is meaningful */
21274 if (!size_OK)
21275 goto decode_failure;
21277 Bool decode_OK = False;
21278 delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
21279 if (!decode_OK)
21280 goto decode_failure;
21282 return delta;
21285 case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
21286 case 0xE1: /* LOOPE disp8: decrement count, jump if count != 0 && ZF==1 */
21287 case 0xE2: /* LOOP disp8: decrement count, jump if count != 0 */
21288 { /* The docs say this uses rCX as a count depending on the
21289 address size override, not the operand one. */
21290 IRExpr* zbit = NULL;
21291 IRExpr* count = NULL;
21292 IRExpr* cond = NULL;
21293 const HChar* xtra = NULL;
21295 if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
21296 /* So at this point we've rejected any variants which appear to
21297 be governed by the usual operand-size modifiers. Hence only
21298 the address size prefix can have an effect. It changes the
21299 size from 64 (default) to 32. */
21300 d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
21301 delta++;
21302 if (haveASO(pfx)) {
21303 /* 64to32 of 64-bit get is merely a get-put improvement
21304 trick. */
21305 putIReg32(R_RCX, binop(Iop_Sub32,
21306 unop(Iop_64to32, getIReg64(R_RCX)),
21307 mkU32(1)));
21308 } else {
21309 putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
21312 /* This is correct, both for 32- and 64-bit versions. If we're
21313 doing a 32-bit dec and the result is zero then the default
21314 zero extension rule will cause the upper 32 bits to be zero
21315 too. Hence a 64-bit check against zero is OK. */
21316 count = getIReg64(R_RCX);
21317 cond = binop(Iop_CmpNE64, count, mkU64(0));
21318 switch (opc) {
21319 case 0xE2:
21320 xtra = "";
21321 break;
21322 case 0xE1:
21323 xtra = "e";
21324 zbit = mk_amd64g_calculate_condition( AMD64CondZ );
21325 cond = mkAnd1(cond, zbit);
21326 break;
21327 case 0xE0:
21328 xtra = "ne";
21329 zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
21330 cond = mkAnd1(cond, zbit);
21331 break;
21332 default:
21333 vassert(0);
21335 stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );
21337 DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", (ULong)d64);
21338 return delta;
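/* Illustrative sketch, not part of the translator: the taken-branch
   condition built above, in plain C (count is the post-decrement rCX,
   zf the guest Z flag; names hypothetical):

      int loop_taken  (unsigned long long count)         { return count != 0; }
      int loope_taken (unsigned long long count, int zf) { return count != 0 &&  zf; }
      int loopne_taken(unsigned long long count, int zf) { return count != 0 && !zf; }
*/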
21341 case 0xE3:
21342 /* JRCXZ or JECXZ, depending on the address size override. */
21343 if (have66orF2orF3(pfx)) goto decode_failure;
21344 d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
21345 delta++;
21346 if (haveASO(pfx)) {
21347 /* 32-bit */
21348 stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
21349 unop(Iop_32Uto64, getIReg32(R_RCX)),
21350 mkU64(0)),
21351 Ijk_Boring,
21352 IRConst_U64(d64),
21353 OFFB_RIP
21355 DIP("jecxz 0x%llx\n", (ULong)d64);
21356 } else {
21357 /* 64-bit */
21358 stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
21359 getIReg64(R_RCX),
21360 mkU64(0)),
21361 Ijk_Boring,
21362 IRConst_U64(d64),
21363 OFFB_RIP
21365 DIP("jrcxz 0x%llx\n", (ULong)d64);
21367 return delta;
21369 case 0xE4: /* IN imm8, AL */
21370 sz = 1;
21371 t1 = newTemp(Ity_I64);
21372 abyte = getUChar(delta); delta++;
21373 assign(t1, mkU64( abyte & 0xFF ));
21374 DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
21375 goto do_IN;
21376 case 0xE5: /* IN imm8, eAX */
21377 if (!(sz == 2 || sz == 4)) goto decode_failure;
21378 t1 = newTemp(Ity_I64);
21379 abyte = getUChar(delta); delta++;
21380 assign(t1, mkU64( abyte & 0xFF ));
21381 DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
21382 goto do_IN;
21383 case 0xEC: /* IN %DX, AL */
21384 sz = 1;
21385 t1 = newTemp(Ity_I64);
21386 assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
21387 DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
21388 nameIRegRAX(sz));
21389 goto do_IN;
21390 case 0xED: /* IN %DX, eAX */
21391 if (!(sz == 2 || sz == 4)) goto decode_failure;
21392 t1 = newTemp(Ity_I64);
21393 assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
21394 DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
21395 nameIRegRAX(sz));
21396 goto do_IN;
21397 do_IN: {
21398 /* At this point, sz indicates the width, and t1 is a 64-bit
21399 value giving port number. */
21400 IRDirty* d;
21401 if (haveF2orF3(pfx)) goto decode_failure;
21402 vassert(sz == 1 || sz == 2 || sz == 4);
21403 ty = szToITy(sz);
21404 t2 = newTemp(Ity_I64);
21405 d = unsafeIRDirty_1_N(
21407 0/*regparms*/,
21408 "amd64g_dirtyhelper_IN",
21409 &amd64g_dirtyhelper_IN,
21410 mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
21412 /* do the call, dumping the result in t2. */
21413 stmt( IRStmt_Dirty(d) );
21414 putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
21415 return delta;
21418 case 0xE6: /* OUT AL, imm8 */
21419 sz = 1;
21420 t1 = newTemp(Ity_I64);
21421 abyte = getUChar(delta); delta++;
21422 assign( t1, mkU64( abyte & 0xFF ) );
21423 DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
21424 goto do_OUT;
21425 case 0xE7: /* OUT eAX, imm8 */
21426 if (!(sz == 2 || sz == 4)) goto decode_failure;
21427 t1 = newTemp(Ity_I64);
21428 abyte = getUChar(delta); delta++;
21429 assign( t1, mkU64( abyte & 0xFF ) );
21430 DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
21431 goto do_OUT;
21432 case 0xEE: /* OUT AL, %DX */
21433 sz = 1;
21434 t1 = newTemp(Ity_I64);
21435 assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
21436 DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
21437 nameIRegRDX(2));
21438 goto do_OUT;
21439 case 0xEF: /* OUT eAX, %DX */
21440 if (!(sz == 2 || sz == 4)) goto decode_failure;
21441 t1 = newTemp(Ity_I64);
21442 assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
21443 DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
21444 nameIRegRDX(2));
21445 goto do_OUT;
21446 do_OUT: {
21447 /* At this point, sz indicates the width, and t1 is a 64-bit
21448 value giving port number. */
21449 IRDirty* d;
21450 if (haveF2orF3(pfx)) goto decode_failure;
21451 vassert(sz == 1 || sz == 2 || sz == 4);
21452 ty = szToITy(sz);
21453 d = unsafeIRDirty_0_N(
21454 0/*regparms*/,
21455 "amd64g_dirtyhelper_OUT",
21456 &amd64g_dirtyhelper_OUT,
21457 mkIRExprVec_3( mkexpr(t1),
21458 widenUto64( getIRegRAX(sz) ),
21459 mkU64(sz) )
21461 stmt( IRStmt_Dirty(d) );
21462 return delta;
21465 case 0xE8: /* CALL J4 */
21466 if (haveF3(pfx)) goto decode_failure;
21467 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21468 d64 = getSDisp32(delta); delta += 4;
21469 d64 += (guest_RIP_bbstart+delta);
21470 /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
21471 t1 = newTemp(Ity_I64);
21472 assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
21473 putIReg64(R_RSP, mkexpr(t1));
21474 storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
21475 t2 = newTemp(Ity_I64);
21476 assign(t2, mkU64((Addr64)d64));
21477 make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
21478 jmp_lit(dres, Ijk_Call, d64);
21479 vassert(dres->whatNext == Dis_StopHere);
21480 DIP("call 0x%llx\n", (ULong)d64);
21481 return delta;
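/* Illustrative sketch, not part of the translator (store64 is a hypothetical
   stand-in for storeLE): the call above decomposes into

      unsigned long long ret_addr = guest_RIP_bbstart + delta;   // addr after the call
      RSP -= 8;
      store64(RSP, ret_addr);
      goto_target(ret_addr + disp32);                            // d64

   plus an AbiHint (make_redzone_AbiHint) describing the stack red zone
   below the new RSP to the core. */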
21483 case 0xE9: /* Jv (jump, 16/32 offset) */
21484 if (haveF3(pfx)) goto decode_failure;
21485 sz = 4; /* Prefixes that change operand size are ignored for this
21486 instruction. Operand size is forced to 32 bits. */
21487 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21488 d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
21489 delta += sz;
21490 jmp_lit(dres, Ijk_Boring, d64);
21491 vassert(dres->whatNext == Dis_StopHere);
21492 DIP("jmp 0x%llx\n", (ULong)d64);
21493 return delta;
21495 case 0xEB: /* Jb (jump, byte offset) */
21496 if (haveF3(pfx)) goto decode_failure;
21497 /* Prefixes that change operand size are ignored for this instruction. */
21498 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21499 d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
21500 delta++;
21501 jmp_lit(dres, Ijk_Boring, d64);
21502 vassert(dres->whatNext == Dis_StopHere);
21503 DIP("jmp-8 0x%llx\n", (ULong)d64);
21504 return delta;
21506 case 0xF5: /* CMC */
21507 case 0xF8: /* CLC */
21508 case 0xF9: /* STC */
21509 t1 = newTemp(Ity_I64);
21510 t2 = newTemp(Ity_I64);
21511 assign( t1, mk_amd64g_calculate_rflags_all() );
21512 switch (opc) {
21513 case 0xF5:
21514 assign( t2, binop(Iop_Xor64, mkexpr(t1),
21515 mkU64(AMD64G_CC_MASK_C)));
21516 DIP("cmc\n");
21517 break;
21518 case 0xF8:
21519 assign( t2, binop(Iop_And64, mkexpr(t1),
21520 mkU64(~AMD64G_CC_MASK_C)));
21521 DIP("clc\n");
21522 break;
21523 case 0xF9:
21524 assign( t2, binop(Iop_Or64, mkexpr(t1),
21525 mkU64(AMD64G_CC_MASK_C)));
21526 DIP("stc\n");
21527 break;
21528 default:
21529 vpanic("disInstr(x64)(cmc/clc/stc)");
21531 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
21532 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21533 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
21534 /* Set NDEP even though it isn't used. This makes redundant-PUT
21535 elimination of previous stores to this field work better. */
21536 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21537 return delta;
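/* Illustrative sketch, not part of the translator: with the flags forced
   into the COPY thunk, the three cases above are plain bit operations on
   the materialised rflags value (CARRY stands for AMD64G_CC_MASK_C):

      flags ^=  CARRY;   // cmc
      flags &= ~CARRY;   // clc
      flags |=  CARRY;   // stc
*/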
21539 case 0xF6: { /* Grp3 Eb */
21540 Bool decode_OK = True;
21541 /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21542 /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
21543 delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
21544 if (!decode_OK) goto decode_failure;
21545 return delta;
21548 case 0xF7: { /* Grp3 Ev */
21549 Bool decode_OK = True;
21550 /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21551 /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
21552 delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
21553 if (!decode_OK) goto decode_failure;
21554 return delta;
21557 case 0xFC: /* CLD */
21558 if (haveF2orF3(pfx)) goto decode_failure;
21559 stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
21560 DIP("cld\n");
21561 return delta;
21563 case 0xFD: /* STD */
21564 if (haveF2orF3(pfx)) goto decode_failure;
21565 stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
21566 DIP("std\n");
21567 return delta;
21569 case 0xFE: { /* Grp4 Eb */
21570 Bool decode_OK = True;
21571 /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21572 /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
21573 delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
21574 if (!decode_OK) goto decode_failure;
21575 return delta;
21578 case 0xFF: { /* Grp5 Ev */
21579 Bool decode_OK = True;
21580 /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21581 /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
21582 delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
21583 if (!decode_OK) goto decode_failure;
21584 return delta;
21587 default:
21588 break;
21592 decode_failure:
21593 return deltaIN; /* fail */
21597 /*------------------------------------------------------------*/
21598 /*--- ---*/
21599 /*--- Top-level post-escape decoders: dis_ESC_0F ---*/
21600 /*--- ---*/
21601 /*------------------------------------------------------------*/
21603 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
21605 IRTemp t2 = newTemp(ty);
21606 if (ty == Ity_I64) {
21607 IRTemp m8 = newTemp(Ity_I64);
21608 IRTemp s8 = newTemp(Ity_I64);
21609 IRTemp m16 = newTemp(Ity_I64);
21610 IRTemp s16 = newTemp(Ity_I64);
21611 IRTemp m32 = newTemp(Ity_I64);
21612 assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
21613 assign( s8,
21614 binop(Iop_Or64,
21615 binop(Iop_Shr64,
21616 binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
21617 mkU8(8)),
21618 binop(Iop_And64,
21619 binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
21620 mkexpr(m8))
21624 assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
21625 assign( s16,
21626 binop(Iop_Or64,
21627 binop(Iop_Shr64,
21628 binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
21629 mkU8(16)),
21630 binop(Iop_And64,
21631 binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
21632 mkexpr(m16))
21636 assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
21637 assign( t2,
21638 binop(Iop_Or64,
21639 binop(Iop_Shr64,
21640 binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
21641 mkU8(32)),
21642 binop(Iop_And64,
21643 binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
21644 mkexpr(m32))
21647 return t2;
21649 if (ty == Ity_I32) {
21650 assign( t2,
21651 binop(
21652 Iop_Or32,
21653 binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
21654 binop(
21655 Iop_Or32,
21656 binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
21657 mkU32(0x00FF0000)),
21658 binop(Iop_Or32,
21659 binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
21660 mkU32(0x0000FF00)),
21661 binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
21662 mkU32(0x000000FF) )
21665 return t2;
21667 if (ty == Ity_I16) {
21668 assign(t2,
21669 binop(Iop_Or16,
21670 binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
21671 binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
21672 return t2;
21674 vassert(0);
21675 /*NOTREACHED*/
21676 return IRTemp_INVALID;
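/* Illustrative sketch, not part of the translator: the Ity_I64 case above
   is the usual mask-and-shift byte swap; an equivalent plain-C version is

      unsigned long long bswap64(unsigned long long x)
      {
         x = ((x & 0xFF00FF00FF00FF00ULL) >> 8)  | ((x << 8)  & 0xFF00FF00FF00FF00ULL);
         x = ((x & 0xFFFF0000FFFF0000ULL) >> 16) | ((x << 16) & 0xFFFF0000FFFF0000ULL);
         x = ((x & 0xFFFFFFFF00000000ULL) >> 32) | ((x << 32) & 0xFFFFFFFF00000000ULL);
         return x;
      }
*/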
21680 __attribute__((noinline))
21681 static
21682 Long dis_ESC_0F (
21683 /*MB_OUT*/DisResult* dres,
21684 /*MB_OUT*/Bool* expect_CAS,
21685 const VexArchInfo* archinfo,
21686 const VexAbiInfo* vbi,
21687 Prefix pfx, Int sz, Long deltaIN
21690 Long d64 = 0;
21691 IRTemp addr = IRTemp_INVALID;
21692 IRTemp t1 = IRTemp_INVALID;
21693 IRTemp t2 = IRTemp_INVALID;
21694 UChar modrm = 0;
21695 Int am_sz = 0;
21696 Int alen = 0;
21697 HChar dis_buf[50];
21699 /* In the first switch, look for ordinary integer insns. */
21700 Long delta = deltaIN;
21701 UChar opc = getUChar(delta);
21702 delta++;
21703 switch (opc) { /* first switch */
21705 case 0x01:
21707 modrm = getUChar(delta);
21708 /* 0F 01 /0 -- SGDT */
21709 /* 0F 01 /1 -- SIDT */
21710 if (!epartIsReg(modrm)
21711 && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
21712 /* This is really revolting, but ... since each processor
21713 (core) only has one IDT and one GDT, just let the guest
21714 see it (pass-through semantics). I can't see any way to
21715 construct a faked-up value, so don't bother to try. */
21716 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21717 delta += alen;
21718 switch (gregLO3ofRM(modrm)) {
21719 case 0: DIP("sgdt %s\n", dis_buf); break;
21720 case 1: DIP("sidt %s\n", dis_buf); break;
21721 default: vassert(0); /*NOTREACHED*/
21723 IRDirty* d = unsafeIRDirty_0_N (
21724 0/*regparms*/,
21725 "amd64g_dirtyhelper_SxDT",
21726 &amd64g_dirtyhelper_SxDT,
21727 mkIRExprVec_2( mkexpr(addr),
21728 mkU64(gregLO3ofRM(modrm)) )
21730 /* declare we're writing memory */
21731 d->mFx = Ifx_Write;
21732 d->mAddr = mkexpr(addr);
21733 d->mSize = 6;
21734 stmt( IRStmt_Dirty(d) );
21735 return delta;
21737 /* 0F 01 D0 = XGETBV */
21738 if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21739 delta += 1;
21740 DIP("xgetbv\n");
21741 /* Fault (SEGV) if ECX isn't zero. Intel docs say #GP and I
21742 am not sure if that translates into SEGV or to something
21743 else, in user space. */
21744 t1 = newTemp(Ity_I32);
21745 assign( t1, getIReg32(R_RCX) );
21746 stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
21747 Ijk_SigSEGV,
21748 IRConst_U64(guest_RIP_curr_instr),
21749 OFFB_RIP
21751 putIRegRAX(4, mkU32(7));
21752 putIRegRDX(4, mkU32(0));
21753 return delta;
21755 /* BEGIN HACKY SUPPORT FOR xend */
21756 /* 0F 01 D5 = XEND */
21757 if (modrm == 0xD5 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21758 /* We are never in a transaction (xbegin immediately aborts).
21759 So this just always generates a General Protection Fault. */
21760 delta += 1;
21761 jmp_lit(dres, Ijk_SigSEGV, guest_RIP_bbstart + delta);
21762 vassert(dres->whatNext == Dis_StopHere);
21763 DIP("xend\n");
21764 return delta;
21766 /* END HACKY SUPPORT FOR xend */
21767 /* BEGIN HACKY SUPPORT FOR xtest */
21768 /* 0F 01 D6 = XTEST */
21769 if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21770 /* Sets ZF because there never is a transaction, and all
21771 CF, OF, SF, PF and AF are always cleared by xtest. */
21772 delta += 1;
21773 DIP("xtest\n");
21774 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
21775 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21776 stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
21777 /* Set NDEP even though it isn't used. This makes redundant-PUT
21778 elimination of previous stores to this field work better. */
21779 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21780 return delta;
21782 /* END HACKY SUPPORT FOR xtest */
21783 /* 0F 01 F9 = RDTSCP */
21784 if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
21785 delta += 1;
21786 /* Uses dirty helper:
21787 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
21788 declared to wr rax, rcx, rdx
21790 const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
21791 void* fAddr = &amd64g_dirtyhelper_RDTSCP;
21792 IRDirty* d
21793 = unsafeIRDirty_0_N ( 0/*regparms*/,
21794 fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
21795 /* declare guest state effects */
21796 d->nFxState = 3;
21797 vex_bzero(&d->fxState, sizeof(d->fxState));
21798 d->fxState[0].fx = Ifx_Write;
21799 d->fxState[0].offset = OFFB_RAX;
21800 d->fxState[0].size = 8;
21801 d->fxState[1].fx = Ifx_Write;
21802 d->fxState[1].offset = OFFB_RCX;
21803 d->fxState[1].size = 8;
21804 d->fxState[2].fx = Ifx_Write;
21805 d->fxState[2].offset = OFFB_RDX;
21806 d->fxState[2].size = 8;
21807 /* execute the dirty call, side-effecting guest state */
21808 stmt( IRStmt_Dirty(d) );
21809 /* RDTSCP is a serialising insn. So, just in case someone is
21810 using it as a memory fence ... */
21811 stmt( IRStmt_MBE(Imbe_Fence) );
21812 DIP("rdtscp\n");
21813 return delta;
21815 /* else decode failed */
21816 break;
21819 case 0x05: /* SYSCALL */
21820 guest_RIP_next_mustcheck = True;
21821 guest_RIP_next_assumed = guest_RIP_bbstart + delta;
21822 putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
21823 /* It's important that all guest state is up-to-date
21824 at this point. So we declare an end-of-block here, which
21825 forces any cached guest state to be flushed. */
21826 jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
21827 vassert(dres->whatNext == Dis_StopHere);
21828 DIP("syscall\n");
21829 return delta;
21831 case 0x0B: /* UD2 */
21832 stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
21833 jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
21834 vassert(dres->whatNext == Dis_StopHere);
21835 DIP("ud2\n");
21836 return delta;
21838 case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
21839 /* 0F 0D /1 -- prefetchw mem8 */
21840 if (have66orF2orF3(pfx)) goto decode_failure;
21841 modrm = getUChar(delta);
21842 if (epartIsReg(modrm)) goto decode_failure;
21843 if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
21844 goto decode_failure;
21845 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21846 delta += alen;
21847 switch (gregLO3ofRM(modrm)) {
21848 case 0: DIP("prefetch %s\n", dis_buf); break;
21849 case 1: DIP("prefetchw %s\n", dis_buf); break;
21850 default: vassert(0); /*NOTREACHED*/
21852 return delta;
21854 case 0x19:
21855 case 0x1C:
21856 case 0x1D:
21857 case 0x1E:
21858 case 0x1F:
21859 // Intel CET instructions can have any prefixes before NOPs
21860 // and can use any ModRM, SIB and disp
21861 modrm = getUChar(delta);
21862 if (epartIsReg(modrm)) {
21863 delta += 1;
21864 DIP("nop%c\n", nameISize(sz));
21865 } else {
21866 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21867 delta += alen;
21868 DIP("nop%c %s\n", nameISize(sz), dis_buf);
21870 return delta;
21872 case 0x31: { /* RDTSC */
21873 IRTemp val = newTemp(Ity_I64);
21874 IRExpr** args = mkIRExprVec_0();
21875 IRDirty* d = unsafeIRDirty_1_N (
21876 val,
21877 0/*regparms*/,
21878 "amd64g_dirtyhelper_RDTSC",
21879 &amd64g_dirtyhelper_RDTSC,
21880 args
21882 if (have66orF2orF3(pfx)) goto decode_failure;
21883 /* execute the dirty call, dumping the result in val. */
21884 stmt( IRStmt_Dirty(d) );
21885 putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
21886 putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
21887 DIP("rdtsc\n");
21888 return delta;
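/* Illustrative sketch, not part of the translator: the dirty helper returns
   the 64-bit time-stamp counter and the split above is simply

      unsigned int edx = (unsigned int)(val >> 32);   // Iop_64HIto32
      unsigned int eax = (unsigned int)(val);         // Iop_64to32

   with the usual amd64 rule that a 32-bit register write zeroes the
   upper half. */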
21891 case 0x40:
21892 case 0x41:
21893 case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
21894 case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
21895 case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
21896 case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
21897 case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
21898 case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
21899 case 0x48: /* CMOVSb (cmov negative) */
21900 case 0x49: /* CMOVNSb (cmov not negative) */
21901 case 0x4A: /* CMOVP (cmov parity even) */
21902 case 0x4B: /* CMOVNP (cmov parity odd) */
21903 case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
21904 case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
21905 case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
21906 case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
21907 if (haveF2orF3(pfx)) goto decode_failure;
21908 delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
21909 return delta;
21911 case 0x80:
21912 case 0x81:
21913 case 0x82: /* JBb/JNAEb (jump below) */
21914 case 0x83: /* JNBb/JAEb (jump not below) */
21915 case 0x84: /* JZb/JEb (jump zero) */
21916 case 0x85: /* JNZb/JNEb (jump not zero) */
21917 case 0x86: /* JBEb/JNAb (jump below or equal) */
21918 case 0x87: /* JNBEb/JAb (jump not below or equal) */
21919 case 0x88: /* JSb (jump negative) */
21920 case 0x89: /* JNSb (jump not negative) */
21921 case 0x8A: /* JP (jump parity even) */
21922 case 0x8B: /* JNP/JPO (jump parity odd) */
21923 case 0x8C: /* JLb/JNGEb (jump less) */
21924 case 0x8D: /* JGEb/JNLb (jump greater or equal) */
21925 case 0x8E: /* JLEb/JNGb (jump less or equal) */
21926 case 0x8F: { /* JGb/JNLEb (jump greater) */
21927 Long jmpDelta;
21928 const HChar* comment = "";
21929 if (haveF3(pfx)) goto decode_failure;
21930 if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21931 jmpDelta = getSDisp32(delta);
21932 d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
21933 delta += 4;
21934 /* End the block at this point. */
21935 jcc_01( dres, (AMD64Condcode)(opc - 0x80),
21936 guest_RIP_bbstart+delta, d64 );
21937 vassert(dres->whatNext == Dis_StopHere);
21938 DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), (ULong)d64,
21939 comment);
21940 return delta;
21943 case 0x90:
21944 case 0x91:
21945 case 0x92: /* set-Bb/set-NAEb (set if below) */
21946 case 0x93: /* set-NBb/set-AEb (set if not below) */
21947 case 0x94: /* set-Zb/set-Eb (set if zero) */
21948 case 0x95: /* set-NZb/set-NEb (set if not zero) */
21949 case 0x96: /* set-BEb/set-NAb (set if below or equal) */
21950 case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
21951 case 0x98: /* set-Sb (set if negative) */
21952 case 0x99: /* set-NSb (set if not negative) */
21953 case 0x9A: /* set-P (set if parity even) */
21954 case 0x9B: /* set-NP (set if parity odd) */
21955 case 0x9C: /* set-Lb/set-NGEb (set if less) */
21956 case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
21957 case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
21958 case 0x9F: /* set-Gb/set-NLEb (set if greater) */
21959 if (haveF2orF3(pfx)) goto decode_failure;
21960 t1 = newTemp(Ity_I8);
21961 assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
21962 modrm = getUChar(delta);
21963 if (epartIsReg(modrm)) {
21964 delta++;
21965 putIRegE(1, pfx, modrm, mkexpr(t1));
21966 DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
21967 nameIRegE(1,pfx,modrm));
21968 } else {
21969 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21970 delta += alen;
21971 storeLE( mkexpr(addr), mkexpr(t1) );
21972 DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
21974 return delta;
21976 case 0x1A:
21977 case 0x1B: { /* Future MPX instructions, currently NOPs.
21978 BNDMK b, m F3 0F 1B
21979 BNDCL b, r/m F3 0F 1A
21980 BNDCU b, r/m F2 0F 1A
21981 BNDCN b, r/m F2 0F 1B
21982 BNDMOV b, b/m 66 0F 1A
21983 BNDMOV b/m, b 66 0F 1B
21984 BNDLDX b, mib 0F 1A
21985 BNDSTX mib, b 0F 1B */
21987 /* All instructions have two operands. One operand is always the
21988 bnd register number (bnd0-bnd3, other register numbers are
21989 ignored when MPX isn't enabled, but should generate an
21990 exception if MPX is enabled) given by gregOfRexRM. The other
21991 operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
21992 address, all of which can be decoded by using either
21993 eregOfRexRM or disAMode. */
21995 modrm = getUChar(delta);
21996 int bnd = gregOfRexRM(pfx,modrm);
21997 const HChar *oper;
21998 if (epartIsReg(modrm)) {
21999 oper = nameIReg64 (eregOfRexRM(pfx,modrm));
22000 delta += 1;
22001 } else {
22002 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22003 delta += alen;
22004 oper = dis_buf;
22007 if (haveF3no66noF2 (pfx)) {
22008 if (opc == 0x1B) {
22009 DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
22010 } else /* opc == 0x1A */ {
22011 DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
22013 } else if (haveF2no66noF3 (pfx)) {
22014 if (opc == 0x1A) {
22015 DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
22016 } else /* opc == 0x1B */ {
22017 DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
22019 } else if (have66noF2noF3 (pfx)) {
22020 if (opc == 0x1A) {
22021 DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
22022 } else /* opc == 0x1B */ {
22023 DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
22025 } else if (haveNo66noF2noF3 (pfx)) {
22026 if (opc == 0x1A) {
22027 DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
22028 } else /* opc == 0x1B */ {
22029 DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
22031 } else goto decode_failure;
22033 return delta;
22036 case 0xA2: { /* CPUID */
22037 /* Uses dirty helper:
22038 void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
22039 declared to mod rax, wr rbx, rcx, rdx */
22041 IRDirty* d = NULL;
22042 const HChar* fName = NULL;
22043 void* fAddr = NULL;
22045 if (haveF2orF3(pfx)) goto decode_failure;
22047 /* This isn't entirely correct, CPUID should depend on the VEX
22048 capabilities, not on the underlying CPU. See bug #324882. */
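/* Selection strategy: choose the most capable CPUID personality that
the guest hwcaps permit. Each helper below reports a fixed,
real-CPU-like feature set; the per-case comments note which machine
each one mimics. */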
22049 if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
22050 (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
22051 (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
22052 fName = "amd64g_dirtyhelper_CPUID_avx2";
22053 fAddr = &amd64g_dirtyhelper_CPUID_avx2;
22054 /* This is a Core-i7-4910-like machine */
22056 else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
22057 (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
22058 (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22059 fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
22060 fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
22061 /* This is a Core-i5-2300-like machine */
22063 else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
22064 (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
22065 fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
22066 fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
22067 /* This is a Core-i5-670-like machine */
22069 else {
22070 /* Give a CPUID for at least a baseline machine, SSE2
22071 only, and no CX16 */
22072 fName = "amd64g_dirtyhelper_CPUID_baseline";
22073 fAddr = &amd64g_dirtyhelper_CPUID_baseline;
22076 vassert(fName); vassert(fAddr);
22077 IRExpr** args = NULL;
22078 if (fAddr == &amd64g_dirtyhelper_CPUID_avx2
22079 || fAddr == &amd64g_dirtyhelper_CPUID_avx_and_cx16) {
22080 Bool hasF16C = (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C) != 0;
22081 Bool hasRDRAND = (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDRAND) != 0;
22082 Bool hasRDSEED = (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDSEED) != 0;
22083 args = mkIRExprVec_4(IRExpr_GSPTR(),
22084 mkIRExpr_HWord(hasF16C ? 1 : 0),
22085 mkIRExpr_HWord(hasRDRAND ? 1 : 0),
22086 mkIRExpr_HWord(hasRDSEED ? 1 : 0));
22087 } else {
22088 args = mkIRExprVec_1(IRExpr_GSPTR());
22090 d = unsafeIRDirty_0_N ( 0/*regparms*/, fName, fAddr, args );
22092 /* Declare guest state effects. EAX, EBX, ECX and EDX are written. EAX
22093 is also read, hence is marked as Modified. ECX is sometimes also
22094 read, depending on the value in EAX; that much is obvious from
22095 inspection of the helper function.
22097 This is a bit of a problem: if we mark ECX as Modified -- hence, by
22098 implication, Read -- then we may get false positives from Memcheck in
22099 the case where ECX contains undefined bits, but the EAX value is such
22100 that the instruction wouldn't read ECX anyway. The obvious way out
22101 of this is to mark it as written only, but that means Memcheck will
22102 effectively ignore undefinedness in the incoming ECX value. That
22103 seems like a small loss to take to avoid false positives here,
22104 though. Fundamentally the problem exists because CPUID itself has
22105 conditional dataflow -- whether ECX is read depends on the value in
22106 EAX -- but the annotation mechanism for dirty helpers can't represent
22107 that conditionality.
22109 A fully-accurate solution might be to change the helpers so that the
22110 EAX and ECX values are passed as parameters. Then, for the ECX
22111 value, we can pass, effectively "if EAX is some value for which ECX
22112 is ignored { 0 } else { ECX }", and Memcheck will see and understand
22113 this conditionality. */
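/* Purely illustrative sketch, not what the code below does; the name
and signature are hypothetical. Such a helper might be declared as
void dirtyhelper_CPUID ( VexGuestAMD64State* st, ULong eax, ULong ecx )
and called with an ECX argument of the form
(EAX selects an ECX-dependent leaf) ? RCX : 0
so that Memcheck would track the definedness of ECX only when CPUID
actually consumes it. */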
22114 d->nFxState = 4;
22115 vex_bzero(&d->fxState, sizeof(d->fxState));
22116 d->fxState[0].fx = Ifx_Modify;
22117 d->fxState[0].offset = OFFB_RAX;
22118 d->fxState[0].size = 8;
22119 d->fxState[1].fx = Ifx_Write;
22120 d->fxState[1].offset = OFFB_RBX;
22121 d->fxState[1].size = 8;
22122 d->fxState[2].fx = Ifx_Write; /* was: Ifx_Modify; */
22123 d->fxState[2].offset = OFFB_RCX;
22124 d->fxState[2].size = 8;
22125 d->fxState[3].fx = Ifx_Write;
22126 d->fxState[3].offset = OFFB_RDX;
22127 d->fxState[3].size = 8;
22128 /* Execute the dirty call, side-effecting guest state. */
22129 stmt( IRStmt_Dirty(d) );
22130 /* CPUID is a serialising insn. So, just in case someone is
22131 using it as a memory fence ... */
22132 stmt( IRStmt_MBE(Imbe_Fence) );
22133 DIP("cpuid\n");
22134 return delta;
22137 case 0xA3: { /* BT Gv,Ev */
22138 /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22139 Bool ok = True;
22140 if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22141 delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
22142 if (!ok) goto decode_failure;
22143 return delta;
22146 case 0xA4: /* SHLDv imm8,Gv,Ev */
22147 modrm = getUChar(delta);
22148 d64 = delta + lengthAMode(pfx, delta);
22149 vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
22150 delta = dis_SHLRD_Gv_Ev (
22151 vbi, pfx, delta, modrm, sz,
22152 mkU8(getUChar(d64)), True, /* literal */
22153 dis_buf, True /* left */ );
22154 return delta;
22156 case 0xA5: /* SHLDv %cl,Gv,Ev */
22157 modrm = getUChar(delta);
22158 delta = dis_SHLRD_Gv_Ev (
22159 vbi, pfx, delta, modrm, sz,
22160 getIRegCL(), False, /* not literal */
22161 "%cl", True /* left */ );
22162 return delta;
22164 case 0xAB: { /* BTS Gv,Ev */
22165 /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22166 Bool ok = True;
22167 if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22168 delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
22169 if (!ok) goto decode_failure;
22170 return delta;
22173 case 0xAC: /* SHRDv imm8,Gv,Ev */
22174 modrm = getUChar(delta);
22175 d64 = delta + lengthAMode(pfx, delta);
22176 vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
22177 delta = dis_SHLRD_Gv_Ev (
22178 vbi, pfx, delta, modrm, sz,
22179 mkU8(getUChar(d64)), True, /* literal */
22180 dis_buf, False /* right */ );
22181 return delta;
22183 case 0xAD: /* SHRDv %cl,Gv,Ev */
22184 modrm = getUChar(delta);
22185 delta = dis_SHLRD_Gv_Ev (
22186 vbi, pfx, delta, modrm, sz,
22187 getIRegCL(), False, /* not literal */
22188 "%cl", False /* right */);
22189 return delta;
22191 case 0xAF: /* IMUL Ev, Gv */
22192 if (haveF2orF3(pfx)) goto decode_failure;
22193 delta = dis_mul_E_G ( vbi, pfx, sz, delta );
22194 return delta;
22196 case 0xB0: { /* CMPXCHG Gb,Eb */
22197 Bool ok = True;
22198 /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
22199 delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
22200 if (!ok) goto decode_failure;
22201 return delta;
22204 case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
22205 Bool ok = True;
22206 /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
22207 if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
22208 delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
22209 if (!ok) goto decode_failure;
22210 return delta;
22213 case 0xB3: { /* BTR Gv,Ev */
22214 /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22215 Bool ok = True;
22216 if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22217 delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
22218 if (!ok) goto decode_failure;
22219 return delta;
22222 case 0xB6: /* MOVZXb Eb,Gv */
22223 if (haveF2orF3(pfx)) goto decode_failure;
22224 if (sz != 2 && sz != 4 && sz != 8)
22225 goto decode_failure;
22226 delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
22227 return delta;
22229 case 0xB7: /* MOVZXw Ew,Gv */
22230 if (haveF2orF3(pfx)) goto decode_failure;
22231 if (sz != 4 && sz != 8)
22232 goto decode_failure;
22233 delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
22234 return delta;
22236 case 0xBA: { /* Grp8 Ib,Ev */
22237 /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
22238 Bool decode_OK = False;
22239 modrm = getUChar(delta);
22240 am_sz = lengthAMode(pfx,delta);
22241 d64 = getSDisp8(delta + am_sz);
22242 delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
22243 &decode_OK );
22244 if (!decode_OK)
22245 goto decode_failure;
22246 return delta;
22249 case 0xBB: { /* BTC Gv,Ev */
22250 /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22251 Bool ok = False;
22252 if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22253 delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
22254 if (!ok) goto decode_failure;
22255 return delta;
22258 case 0xBC: /* BSF Gv,Ev */
22259 if (!haveF2orF3(pfx)
22260 || (haveF3noF2(pfx)
22261 && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
22262 /* no-F2 no-F3 0F BC = BSF
22263 or F3 0F BC = REP; BSF on older CPUs. */
22264 delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
22265 return delta;
22267 /* Fall through, since F3 0F BC is TZCNT, and needs to
22268 be handled by dis_ESC_0F__SSE4. */
22269 break;
22271 case 0xBD: /* BSR Gv,Ev */
22272 if (!haveF2orF3(pfx)
22273 || (haveF3noF2(pfx)
22274 && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
22275 /* no-F2 no-F3 0F BD = BSR
22276 or F3 0F BD = REP; BSR on older CPUs. */
22277 delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
22278 return delta;
22280 /* Fall through, since F3 0F BD is LZCNT, and needs to
22281 be handled by dis_ESC_0F__SSE4. */
22282 break;
22284 case 0xBE: /* MOVSXb Eb,Gv */
22285 if (haveF2orF3(pfx)) goto decode_failure;
22286 if (sz != 2 && sz != 4 && sz != 8)
22287 goto decode_failure;
22288 delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
22289 return delta;
22291 case 0xBF: /* MOVSXw Ew,Gv */
22292 if (haveF2orF3(pfx)) goto decode_failure;
22293 if (sz != 4 && sz != 8)
22294 goto decode_failure;
22295 delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
22296 return delta;
22298 case 0xC0: { /* XADD Gb,Eb */
22299 Bool decode_OK = False;
22300 delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
22301 if (!decode_OK)
22302 goto decode_failure;
22303 return delta;
22306 case 0xC1: { /* XADD Gv,Ev */
22307 Bool decode_OK = False;
22308 delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
22309 if (!decode_OK)
22310 goto decode_failure;
22311 return delta;
22314 case 0xC7: {
22315 modrm = getUChar(delta);
22317 // Detecting valid CMPXCHG combinations is pretty complex.
22318 Bool isValidCMPXCHG = gregLO3ofRM(modrm) == 1;
22319 if (isValidCMPXCHG) {
22320 if (have66(pfx)) isValidCMPXCHG = False;
22321 if (sz != 4 && sz != 8) isValidCMPXCHG = False;
22322 if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
22323 isValidCMPXCHG = False;
22324 if (epartIsReg(modrm)) isValidCMPXCHG = False;
22325 if (haveF2orF3(pfx)) {
22326 /* Since the e-part is memory only, F2 or F3 (one or the
22327 other) is acceptable if LOCK is also present. But only
22328 for cmpxchg8b. */
22329 if (sz == 8) isValidCMPXCHG = False;
22330 if (haveF2andF3(pfx) || !haveLOCK(pfx)) isValidCMPXCHG = False;
22334 /* 0F C7 /1 (with qualifications) = CMPXCHG */
22335 if (isValidCMPXCHG) {
22336 // Note that we've already read the modrm byte by this point, but we
22337 // haven't moved delta past it.
22338 IRType elemTy = sz==4 ? Ity_I32 : Ity_I64;
22339 IRTemp expdHi = newTemp(elemTy);
22340 IRTemp expdLo = newTemp(elemTy);
22341 IRTemp dataHi = newTemp(elemTy);
22342 IRTemp dataLo = newTemp(elemTy);
22343 IRTemp oldHi = newTemp(elemTy);
22344 IRTemp oldLo = newTemp(elemTy);
22345 IRTemp flags_old = newTemp(Ity_I64);
22346 IRTemp flags_new = newTemp(Ity_I64);
22347 IRTemp success = newTemp(Ity_I1);
22348 IROp opOR = sz==4 ? Iop_Or32 : Iop_Or64;
22349 IROp opXOR = sz==4 ? Iop_Xor32 : Iop_Xor64;
22350 IROp opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
22351 IRExpr* zero = sz==4 ? mkU32(0) : mkU64(0);
22352 IRTemp expdHi64 = newTemp(Ity_I64);
22353 IRTemp expdLo64 = newTemp(Ity_I64);
22355 /* Translate this using a DCAS, even if there is no LOCK
22356 prefix. Life is too short to bother with generating two
22357 different translations for the with/without-LOCK-prefix
22358 cases. */
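/* Note that translating the unLOCKed form as an atomic CAS only
strengthens the guarantee the guest observes, so correct guest code
still behaves correctly. */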
22359 *expect_CAS = True;
22361 /* Generate address */
22362 vassert(!epartIsReg(modrm));
22363 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22364 delta += alen;
22366 /* cmpxchg16b requires an alignment check. */
22367 if (sz == 8)
22368 gen_SIGNAL_if_not_16_aligned( vbi, addr );
22370 /* Get the expected and new values. */
22371 assign( expdHi64, getIReg64(R_RDX) );
22372 assign( expdLo64, getIReg64(R_RAX) );
22374 /* These are the correctly-sized expected and new values.
22375 However, we also get expdHi64/expdLo64 above as 64-bits
22376 regardless, because we will need them later in the 32-bit
22377 case (paradoxically). */
22378 assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
22379 : mkexpr(expdHi64) );
22380 assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
22381 : mkexpr(expdLo64) );
22382 assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
22383 assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
22385 /* Do the DCAS */
22386 stmt( IRStmt_CAS(
22387 mkIRCAS( oldHi, oldLo,
22388 Iend_LE, mkexpr(addr),
22389 mkexpr(expdHi), mkexpr(expdLo),
22390 mkexpr(dataHi), mkexpr(dataLo)
22391 )));
22393 /* success when oldHi:oldLo == expdHi:expdLo */
22394 assign( success,
22395 binop(opCasCmpEQ,
22396 binop(opOR,
22397 binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
22398 binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))),
22400 zero ));
22403 /* If the DCAS is successful, that is to say oldHi:oldLo ==
22404 expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
22405 which is where they came from originally. Both the actual
22406 contents of these two regs, and any shadow values, are
22407 unchanged. If the DCAS fails then we're putting into
22408 RDX:RAX the value seen in memory. */
22409 /* Now of course there's a complication in the 32-bit case
22410 (bah!): if the DCAS succeeds, we need to leave RDX:RAX
22411 unchanged; but if we use the same scheme as in the 64-bit
22412 case, we get hit by the standard rule that a write to the
22413 bottom 32 bits of an integer register zeros the upper 32
22414 bits. And so the upper halves of RDX and RAX mysteriously
22415 become zero. So we have to stuff back in the original
22416 64-bit values which we previously stashed in
22417 expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
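/* In IR terms, the two writebacks below are
RDX = success ? expdHi64 : widen(oldHi)
RAX = success ? expdLo64 : widen(oldLo)
where 'widen' is 32-to-64-bit zero extension in the sz==4 case and
the identity in the sz==8 case. */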
22418 /* It's just _so_ much fun ... */
22419 putIRegRDX( 8,
22420 IRExpr_ITE( mkexpr(success),
22421 mkexpr(expdHi64),
22422 sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
22423 : mkexpr(oldHi) ));
22425 putIRegRAX( 8,
22426 IRExpr_ITE( mkexpr(success),
22427 mkexpr(expdLo64),
22428 sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
22429 : mkexpr(oldLo) ));
22432 /* Copy the success bit into the Z flag and leave the others
22433 unchanged */
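/* That is: flags_new = (flags_old & ~AMD64G_CC_MASK_Z)
| ((success ? 1 : 0) << AMD64G_CC_SHIFT_Z). */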
22434 assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
22435 assign(
22436 flags_new,
22437 binop(Iop_Or64,
22438 binop(Iop_And64, mkexpr(flags_old),
22439 mkU64(~AMD64G_CC_MASK_Z)),
22440 binop(Iop_Shl64,
22441 binop(Iop_And64,
22442 unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
22443 mkU8(AMD64G_CC_SHIFT_Z)) ));
22445 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
22446 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
22447 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
22448 /* Set NDEP even though it isn't used. This makes
22449 redundant-PUT elimination of previous stores to this field
22450 work better. */
22451 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
22453 /* Sheesh. Aren't you glad it was me and not you that had to
22454 write and validate all this grunge? */
22456 DIP("cmpxchg8b %s\n", dis_buf);
22457 return delta;
22458 } // if (isValidCMPXCHG)
22460 /* 0F C7 /6 no-F2-or-F3 = RDRAND, 0F C7 /7 = RDSEED */
22461 int insn = gregLO3ofRM(modrm);
22462 if (((insn == 6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDRAND))
22463 || (insn == 7 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDSEED)))
22464 && epartIsReg(modrm) && haveNoF2noF3(pfx)
22465 && (sz == 8 || sz == 4 || sz == 2)) {
22467 delta++; // move past modrm
22468 IRType ty = szToITy(sz);
22470 // Pull a first 32 bits of randomness, plus C flag, out of the host.
22471 IRTemp pairLO = newTemp(Ity_I64);
22472 IRDirty* dLO;
22473 if (insn == 6) /* RDRAND */
22474 dLO = unsafeIRDirty_1_N(pairLO, 0/*regparms*/,
22475 "amd64g_dirtyhelper_RDRAND",
22476 &amd64g_dirtyhelper_RDRAND, mkIRExprVec_0());
22477 else /* RDSEED */
22478 dLO = unsafeIRDirty_1_N(pairLO, 0/*regparms*/,
22479 "amd64g_dirtyhelper_RDSEED",
22480 &amd64g_dirtyhelper_RDSEED, mkIRExprVec_0());
22482 // There are no guest state or memory effects to declare for |dLO|.
22483 stmt( IRStmt_Dirty(dLO) );
22485 IRTemp randsLO = newTemp(Ity_I32);
22486 assign(randsLO, unop(Iop_64to32, mkexpr(pairLO)));
22487 IRTemp cLO = newTemp(Ity_I64);
22488 assign(cLO, binop(Iop_Shr64, mkexpr(pairLO), mkU8(32)));
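// The helper packs its result as (Cflag << 32) | rand32; randsLO and
// cLO above simply unpack those two halves.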
22490 // We'll assemble the final pairing in (cFinal, randsNearlyFinal).
22491 IRTemp randsNearlyFinal = newTemp(Ity_I64);
22492 IRTemp cFinal = newTemp(Ity_I64);
22494 if (ty == Ity_I64) {
22495 // Pull another 32 bits of randomness out of the host.
22496 IRTemp pairHI = newTemp(Ity_I64);
22497 IRDirty* dHI;
22498 if (insn == 6) /* RDRAND */
22499 dHI = unsafeIRDirty_1_N(pairHI, 0/*regparms*/,
22500 "amd64g_dirtyhelper_RDRAND",
22501 &amd64g_dirtyhelper_RDRAND, mkIRExprVec_0());
22502 else /* RDSEED */
22503 dHI = unsafeIRDirty_1_N(pairHI, 0/*regparms*/,
22504 "amd64g_dirtyhelper_RDSEED",
22505 &amd64g_dirtyhelper_RDSEED, mkIRExprVec_0());
22507 // There are no guest state or memory effects to declare for |dHI|.
22508 stmt( IRStmt_Dirty(dHI) );
22510 IRTemp randsHI = newTemp(Ity_I32);
22511 assign(randsHI, unop(Iop_64to32, mkexpr(pairHI)));
22512 IRTemp cHI = newTemp(Ity_I64);
22513 assign(cHI, binop(Iop_Shr64, mkexpr(pairHI), mkU8(32)));
22514 assign(randsNearlyFinal, binop(Iop_32HLto64,
22515 mkexpr(randsHI), mkexpr(randsLO)));
22516 assign(cFinal, binop(Iop_And64,
22517 binop(Iop_And64, mkexpr(cHI), mkexpr(cLO)),
22518 mkU64(1)));
22519 } else {
22520 assign(randsNearlyFinal, unop(Iop_32Uto64, mkexpr(randsLO)));
22521 assign(cFinal, binop(Iop_And64, mkexpr(cLO), mkU64(1)));
22524 /* Now cFinal[0] is the final success/failure flag (cFinal[0] == 1
22525 means success). But there's another twist. If we failed then the
22526 returned value must be forced to zero. Otherwise we could have the
22527 situation, when sz==8, where one of the host calls failed but the
22528 other didn't. This would give cFinal[0] == 0 (correctly) but
22529 randsNearlyFinal not being zero, because it contains the 32 bit
22530 result of the non-failing call. */
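/* The mask below replicates cFinal[0] across all 64 bits:
(cFinal << 63) >>signed 63 is all-ones on success and all-zeroes on
failure, so ANDing it in forces the returned value to zero whenever
the hardware generator reported failure. */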
22531 IRTemp randsFinal = newTemp(Ity_I64);
22532 assign(randsFinal,
22533 binop(Iop_And64,
22534 mkexpr(randsNearlyFinal),
22535 binop(Iop_Sar64,
22536 binop(Iop_Shl64, mkexpr(cFinal), mkU8(63)),
22537 mkU8(63)) ));
22540 // So, finally, update the guest state.
22541 putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(randsFinal)));
22543 // Set C=<success indication>, O,S,Z,A,P = 0. cFinal has already been
22544 // masked so only the lowest bit remains.
22545 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
22546 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(cFinal) ));
22547 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
22548 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
22550 if (insn == 6) {
22551 DIP("rdrand %s", nameIRegE(sz, pfx, modrm));
22552 } else {
22553 DIP("rdseed %s", nameIRegE(sz, pfx, modrm));
22556 return delta;
22559 goto decode_failure;
22562 case 0xC8: /* BSWAP %eax */
22563 case 0xC9:
22564 case 0xCA:
22565 case 0xCB:
22566 case 0xCC:
22567 case 0xCD:
22568 case 0xCE:
22569 case 0xCF: /* BSWAP %edi */
22570 if (haveF2orF3(pfx)) goto decode_failure;
22571 /* According to the AMD64 docs, this insn can have size 4 or
22572 8. */
22573 if (sz == 4) {
22574 t1 = newTemp(Ity_I32);
22575 assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
22576 t2 = math_BSWAP( t1, Ity_I32 );
22577 putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
22578 DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
22579 return delta;
22581 if (sz == 8) {
22582 t1 = newTemp(Ity_I64);
22583 t2 = newTemp(Ity_I64);
22584 assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
22585 t2 = math_BSWAP( t1, Ity_I64 );
22586 putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
22587 DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
22588 return delta;
22590 goto decode_failure;
22592 default:
22593 break;
22595 } /* first switch */
22598 /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
22599 /* In the second switch, pick off MMX insns. */
22601 if (!have66orF2orF3(pfx)) {
22602 /* So there's no SIMD prefix. */
22604 vassert(sz == 4 || sz == 8);
22606 switch (opc) { /* second switch */
22608 case 0x71:
22609 case 0x72:
22610 case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
22612 case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
22613 case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
22614 case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
22615 case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
22617 case 0xFC:
22618 case 0xFD:
22619 case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
22621 case 0xEC:
22622 case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
22624 case 0xDC:
22625 case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
22627 case 0xF8:
22628 case 0xF9:
22629 case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
22631 case 0xE8:
22632 case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
22634 case 0xD8:
22635 case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
22637 case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
22638 case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
22640 case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
22642 case 0x74:
22643 case 0x75:
22644 case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
22646 case 0x64:
22647 case 0x65:
22648 case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
22650 case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
22651 case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
22652 case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
22654 case 0x68:
22655 case 0x69:
22656 case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
22658 case 0x60:
22659 case 0x61:
22660 case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
22662 case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
22663 case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
22664 case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
22665 case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
22667 case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
22668 case 0xF2:
22669 case 0xF3:
22671 case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
22672 case 0xD2:
22673 case 0xD3:
22675 case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
22676 case 0xE2: {
22677 Bool decode_OK = False;
22678 delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
22679 if (decode_OK)
22680 return delta;
22681 goto decode_failure;
22684 default:
22685 break;
22686 } /* second switch */
22690 /* A couple of MMX corner cases */
22691 if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
22692 if (sz != 4)
22693 goto decode_failure;
22694 do_EMMS_preamble();
22695 DIP("{f}emms\n");
22696 return delta;
22699 /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
22700 /* Perhaps it's an SSE or SSE2 instruction. We can try this
22701 without checking the guest hwcaps because SSE2 is a baseline
22702 facility in 64 bit mode. */
22704 Bool decode_OK = False;
22705 delta = dis_ESC_0F__SSE2 ( &decode_OK,
22706 archinfo, vbi, pfx, sz, deltaIN, dres );
22707 if (decode_OK)
22708 return delta;
22711 /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
22712 /* Perhaps it's an SSE3 instruction. FIXME: check guest hwcaps
22713 first. */
22715 Bool decode_OK = False;
22716 delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22717 if (decode_OK)
22718 return delta;
22721 /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22722 /* Perhaps it's an SSE4 instruction. FIXME: check guest hwcaps
22723 first. */
22725 Bool decode_OK = False;
22726 delta = dis_ESC_0F__SSE4 ( &decode_OK,
22727 archinfo, vbi, pfx, sz, deltaIN );
22728 if (decode_OK)
22729 return delta;
22732 decode_failure:
22733 return deltaIN; /* fail */
22737 /*------------------------------------------------------------*/
22738 /*--- ---*/
22739 /*--- Top-level post-escape decoders: dis_ESC_0F38 ---*/
22740 /*--- ---*/
22741 /*------------------------------------------------------------*/
22743 __attribute__((noinline))
22744 static
22745 Long dis_ESC_0F38 (
22746 /*MB_OUT*/DisResult* dres,
22747 const VexArchInfo* archinfo,
22748 const VexAbiInfo* vbi,
22749 Prefix pfx, Int sz, Long deltaIN
22752 Long delta = deltaIN;
22753 UChar opc = getUChar(delta);
22754 delta++;
22755 switch (opc) {
22757 case 0xF0: /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
22758 case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
22759 if (!haveF2orF3(pfx) && !haveVEX(pfx)
22760 && (sz == 2 || sz == 4 || sz == 8)) {
22761 IRTemp addr = IRTemp_INVALID;
22762 UChar modrm = 0;
22763 Int alen = 0;
22764 HChar dis_buf[50];
22765 modrm = getUChar(delta);
22766 if (epartIsReg(modrm)) break;
22767 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22768 delta += alen;
22769 IRType ty = szToITy(sz);
22770 IRTemp src = newTemp(ty);
22771 if (opc == 0xF0) { /* LOAD */
22772 assign(src, loadLE(ty, mkexpr(addr)));
22773 IRTemp dst = math_BSWAP(src, ty);
22774 putIRegG(sz, pfx, modrm, mkexpr(dst));
22775 DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
22776 } else { /* STORE */
22777 assign(src, getIRegG(sz, pfx, modrm));
22778 IRTemp dst = math_BSWAP(src, ty);
22779 storeLE(mkexpr(addr), mkexpr(dst));
22780 DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
22782 return delta;
22784 /* else fall through; maybe one of the decoders below knows what
22785 it is. */
22786 break;
22789 default:
22790 break;
22793 /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22794 /* Perhaps it's an SSSE3 instruction. FIXME: consult guest hwcaps
22795 rather than proceeding indiscriminately. */
22797 Bool decode_OK = False;
22798 delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22799 if (decode_OK)
22800 return delta;
22803 /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22804 /* Perhaps it's an SSE4 instruction. FIXME: consult guest hwcaps
22805 rather than proceeding indiscriminately. */
22807 Bool decode_OK = False;
22808 delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22809 if (decode_OK)
22810 return delta;
22813 /* Ignore previous decode attempts and restart from the beginning of
22814 the instruction. */
22815 delta = deltaIN;
22816 opc = getUChar(delta);
22817 delta++;
22819 switch (opc) {
22821 case 0xF6: {
22822 /* 66 0F 38 F6 = ADCX r32/64(G), m32/64(E) */
22823 /* F3 0F 38 F6 = ADOX r32/64(G), m32/64(E) */
22824 /* These were introduced in Broadwell. Gate them on AVX so as to at
22825 least reject them on earlier guests. Has no host requirements. */
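/* ADCX is an unsigned add-with-carry that reads and writes only CF;
ADOX is the same operation using OF as the carry bit. Neither touches
the other arithmetic flags, which is what the WithFlagCarryX and
WithFlagOverX variants below express. */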
22826 if (have66noF2noF3(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22827 if (sz == 2) {
22828 sz = 4; /* 66 prefix but operand size is 4/8 */
22830 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarryX, True,
22831 sz, delta, "adcx" );
22832 return delta;
22834 if (haveF3no66noF2(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22835 delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagOverX, True,
22836 sz, delta, "adox" );
22837 return delta;
22839 /* else fall through */
22840 break;
22843 default:
22844 break;
22847 /*decode_failure:*/
22848 return deltaIN; /* fail */
22852 /*------------------------------------------------------------*/
22853 /*--- ---*/
22854 /*--- Top-level post-escape decoders: dis_ESC_0F3A ---*/
22855 /*--- ---*/
22856 /*------------------------------------------------------------*/
22858 __attribute__((noinline))
22859 static
22860 Long dis_ESC_0F3A (
22861 /*MB_OUT*/DisResult* dres,
22862 const VexArchInfo* archinfo,
22863 const VexAbiInfo* vbi,
22864 Prefix pfx, Int sz, Long deltaIN
22867 Long delta = deltaIN;
22868 UChar opc = getUChar(delta);
22869 delta++;
22870 switch (opc) {
22872 default:
22873 break;
22877 /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22878 /* Perhaps it's an SSSE3 instruction. FIXME: consult guest hwcaps
22879 rather than proceeding indiscriminately. */
22881 Bool decode_OK = False;
22882 delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22883 if (decode_OK)
22884 return delta;
22887 /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22888 /* Perhaps it's an SSE4 instruction. FIXME: consult guest hwcaps
22889 rather than proceeding indiscriminately. */
22891 Bool decode_OK = False;
22892 delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22893 if (decode_OK)
22894 return delta;
22897 return deltaIN; /* fail */
22901 /*------------------------------------------------------------*/
22902 /*--- ---*/
22903 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX ---*/
22904 /*--- ---*/
22905 /*------------------------------------------------------------*/
22907 /* FIXME: common up with the _256_ version below? */
22908 static
22909 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
22910 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22911 Prefix pfx, Long delta, const HChar* name,
22912 /* The actual operation. Use either 'op' or 'opFn',
22913 but not both. */
22914 IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
22915 Bool invertLeftArg,
22916 Bool swapArgs
22919 UChar modrm = getUChar(delta);
22920 UInt rD = gregOfRexRM(pfx, modrm);
22921 UInt rSL = getVexNvvvv(pfx);
22922 IRTemp tSL = newTemp(Ity_V128);
22923 IRTemp tSR = newTemp(Ity_V128);
22924 IRTemp addr = IRTemp_INVALID;
22925 HChar dis_buf[50];
22926 Int alen = 0;
22927 vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
22929 assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
22930 : getXMMReg(rSL));
22932 if (epartIsReg(modrm)) {
22933 UInt rSR = eregOfRexRM(pfx, modrm);
22934 delta += 1;
22935 assign(tSR, getXMMReg(rSR));
22936 DIP("%s %s,%s,%s\n",
22937 name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
22938 } else {
22939 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
22940 delta += alen;
22941 assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
22942 DIP("%s %s,%s,%s\n",
22943 name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
22946 IRTemp res = IRTemp_INVALID;
22947 if (op != Iop_INVALID) {
22948 vassert(opFn == NULL);
22949 res = newTemp(Ity_V128);
22950 if (requiresRMode(op)) {
22951 IRTemp rm = newTemp(Ity_I32);
22952 assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
22953 assign(res, swapArgs
22954 ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
22955 : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
22956 } else {
22957 assign(res, swapArgs
22958 ? binop(op, mkexpr(tSR), mkexpr(tSL))
22959 : binop(op, mkexpr(tSL), mkexpr(tSR)));
22961 } else {
22962 vassert(opFn != NULL);
22963 res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
22966 putYMMRegLoAndZU(rD, mkexpr(res));
22968 *uses_vvvv = True;
22969 return delta;
22973 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
22974 for the operation, no inversion of the left arg, and no swapping of
22975 args. */
22976 static
22977 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
22978 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22979 Prefix pfx, Long delta, const HChar* name,
22980 IROp op
22983 return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22984 uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
22988 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
22989 generator to compute the result, no inversion of the left
22990 arg, and no swapping of args. */
22991 static
22992 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
22993 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22994 Prefix pfx, Long delta, const HChar* name,
22995 IRTemp(*opFn)(IRTemp,IRTemp)
22998 return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22999 uses_vvvv, vbi, pfx, delta, name,
23000 Iop_INVALID, opFn, False, False );
23004 /* Vector by scalar shift of V by the amount specified at the bottom
23005 of E. */
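/* Out-of-range amounts: a shift count >= the lane width produces all
zeroes for the logical shifts, and replicated sign bits (equivalent to
shifting by lanewidth-1) for the arithmetic shifts. The IRExpr_ITEs
below implement exactly that. */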
23006 static ULong dis_AVX128_shiftV_byE ( const VexAbiInfo* vbi,
23007 Prefix pfx, Long delta,
23008 const HChar* opname, IROp op )
23010 HChar dis_buf[50];
23011 Int alen, size;
23012 IRTemp addr;
23013 Bool shl, shr, sar;
23014 UChar modrm = getUChar(delta);
23015 UInt rG = gregOfRexRM(pfx,modrm);
23016 UInt rV = getVexNvvvv(pfx);
23017 IRTemp g0 = newTemp(Ity_V128);
23018 IRTemp g1 = newTemp(Ity_V128);
23019 IRTemp amt = newTemp(Ity_I64);
23020 IRTemp amt8 = newTemp(Ity_I8);
23021 if (epartIsReg(modrm)) {
23022 UInt rE = eregOfRexRM(pfx,modrm);
23023 assign( amt, getXMMRegLane64(rE, 0) );
23024 DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
23025 nameXMMReg(rV), nameXMMReg(rG) );
23026 delta++;
23027 } else {
23028 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23029 assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
23030 DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
23031 delta += alen;
23033 assign( g0, getXMMReg(rV) );
23034 assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
23036 shl = shr = sar = False;
23037 size = 0;
23038 switch (op) {
23039 case Iop_ShlN16x8: shl = True; size = 32; break;
23040 case Iop_ShlN32x4: shl = True; size = 32; break;
23041 case Iop_ShlN64x2: shl = True; size = 64; break;
23042 case Iop_SarN16x8: sar = True; size = 16; break;
23043 case Iop_SarN32x4: sar = True; size = 32; break;
23044 case Iop_ShrN16x8: shr = True; size = 16; break;
23045 case Iop_ShrN32x4: shr = True; size = 32; break;
23046 case Iop_ShrN64x2: shr = True; size = 64; break;
23047 default: vassert(0);
23050 if (shl || shr) {
23051 assign( g1,
23053 IRExpr_ITE(
23054 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23055 binop(op, mkexpr(g0), mkexpr(amt8)),
23056 mkV128(0x0000) ));
23059 } else
23060 if (sar) {
23061 assign( g1,
23063 IRExpr_ITE(
23064 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23065 binop(op, mkexpr(g0), mkexpr(amt8)),
23066 binop(op, mkexpr(g0), mkU8(size-1)) ));
23069 } else {
23070 vassert(0);
23073 putYMMRegLoAndZU( rG, mkexpr(g1) );
23074 return delta;
23078 /* Vector by scalar shift of V by the amount specified at the bottom
23079 of E. */
23080 static ULong dis_AVX256_shiftV_byE ( const VexAbiInfo* vbi,
23081 Prefix pfx, Long delta,
23082 const HChar* opname, IROp op )
23084 HChar dis_buf[50];
23085 Int alen, size;
23086 IRTemp addr;
23087 Bool shl, shr, sar;
23088 UChar modrm = getUChar(delta);
23089 UInt rG = gregOfRexRM(pfx,modrm);
23090 UInt rV = getVexNvvvv(pfx);
23091 IRTemp g0 = newTemp(Ity_V256);
23092 IRTemp g1 = newTemp(Ity_V256);
23093 IRTemp amt = newTemp(Ity_I64);
23094 IRTemp amt8 = newTemp(Ity_I8);
23095 if (epartIsReg(modrm)) {
23096 UInt rE = eregOfRexRM(pfx,modrm);
23097 assign( amt, getXMMRegLane64(rE, 0) );
23098 DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
23099 nameYMMReg(rV), nameYMMReg(rG) );
23100 delta++;
23101 } else {
23102 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23103 assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
23104 DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
23105 delta += alen;
23107 assign( g0, getYMMReg(rV) );
23108 assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
23110 shl = shr = sar = False;
23111 size = 0;
23112 switch (op) {
23113 case Iop_ShlN16x16: shl = True; size = 32; break;
23114 case Iop_ShlN32x8: shl = True; size = 32; break;
23115 case Iop_ShlN64x4: shl = True; size = 64; break;
23116 case Iop_SarN16x16: sar = True; size = 16; break;
23117 case Iop_SarN32x8: sar = True; size = 32; break;
23118 case Iop_ShrN16x16: shr = True; size = 16; break;
23119 case Iop_ShrN32x8: shr = True; size = 32; break;
23120 case Iop_ShrN64x4: shr = True; size = 64; break;
23121 default: vassert(0);
23124 if (shl || shr) {
23125 assign( g1,
23127 IRExpr_ITE(
23128 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23129 binop(op, mkexpr(g0), mkexpr(amt8)),
23130 binop(Iop_V128HLtoV256, mkV128(0), mkV128(0)) ));
23133 } else
23134 if (sar) {
23135 assign( g1,
23137 IRExpr_ITE(
23138 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23139 binop(op, mkexpr(g0), mkexpr(amt8)),
23140 binop(op, mkexpr(g0), mkU8(size-1)) ));
23143 } else {
23144 vassert(0);
23147 putYMMReg( rG, mkexpr(g1) );
23148 return delta;
23152 /* Vector by vector shift of V by the amount specified at the bottom
23153 of E. Vector by vector shifts are defined for all shift amounts,
23154 so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
23155 anyway). */
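/* Strategy: split both the data vector and the per-lane shift-count
vector into individual 32- or 64-bit lanes, shift each lane with a
scalar IROp, and substitute 0 (or, for Iop_Sar32, the lane shifted by
lanewidth-1) whenever that lane's count is >= the lane width. */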
23156 static ULong dis_AVX_var_shiftV_byE ( const VexAbiInfo* vbi,
23157 Prefix pfx, Long delta,
23158 const HChar* opname, IROp op, Bool isYMM )
23160 HChar dis_buf[50];
23161 Int alen, size, i;
23162 IRTemp addr;
23163 UChar modrm = getUChar(delta);
23164 UInt rG = gregOfRexRM(pfx,modrm);
23165 UInt rV = getVexNvvvv(pfx);
23166 IRTemp sV = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
23167 IRTemp amt = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
23168 IRTemp amts[8], sVs[8], res[8];
23169 if (epartIsReg(modrm)) {
23170 UInt rE = eregOfRexRM(pfx,modrm);
23171 assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
23172 if (isYMM) {
23173 DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
23174 nameYMMReg(rV), nameYMMReg(rG) );
23175 } else {
23176 DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
23177 nameXMMReg(rV), nameXMMReg(rG) );
23179 delta++;
23180 } else {
23181 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23182 assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
23183 if (isYMM) {
23184 DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
23185 nameYMMReg(rG) );
23186 } else {
23187 DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
23188 nameXMMReg(rG) );
23190 delta += alen;
23192 assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );
23194 size = 0;
23195 switch (op) {
23196 case Iop_Shl32: size = 32; break;
23197 case Iop_Shl64: size = 64; break;
23198 case Iop_Sar32: size = 32; break;
23199 case Iop_Shr32: size = 32; break;
23200 case Iop_Shr64: size = 64; break;
23201 default: vassert(0);
23204 for (i = 0; i < 8; i++) {
23205 sVs[i] = IRTemp_INVALID;
23206 amts[i] = IRTemp_INVALID;
23208 switch (size) {
23209 case 32:
23210 if (isYMM) {
23211 breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
23212 &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23213 breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
23214 &amts[3], &amts[2], &amts[1], &amts[0] );
23215 } else {
23216 breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23217 breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
23219 break;
23220 case 64:
23221 if (isYMM) {
23222 breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23223 breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
23224 } else {
23225 breakupV128to64s( sV, &sVs[1], &sVs[0] );
23226 breakupV128to64s( amt, &amts[1], &amts[0] );
23228 break;
23229 default: vassert(0);
23231 for (i = 0; i < 8; i++)
23232 if (sVs[i] != IRTemp_INVALID) {
23233 res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
23234 assign( res[i],
23235 IRExpr_ITE(
23236 binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
23237 mkexpr(amts[i]),
23238 size == 32 ? mkU32(size) : mkU64(size)),
23239 binop(op, mkexpr(sVs[i]),
23240 unop(size == 32 ? Iop_32to8 : Iop_64to8,
23241 mkexpr(amts[i]))),
23242 op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
23243 : size == 32 ? mkU32(0) : mkU64(0) ));
23246 switch (size) {
23247 case 32:
23248 for (i = 0; i < 8; i++)
23249 putYMMRegLane32( rG, i, (i < 4 || isYMM)
23250 ? mkexpr(res[i]) : mkU32(0) );
23251 break;
23252 case 64:
23253 for (i = 0; i < 4; i++)
23254 putYMMRegLane64( rG, i, (i < 2 || isYMM)
23255 ? mkexpr(res[i]) : mkU64(0) );
23256 break;
23257 default: vassert(0);
23260 return delta;
23264 /* Vector by scalar shift of E into V, by an immediate byte. Modified
23265 version of dis_SSE_shiftE_imm. */
23266 static
23267 Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
23268 Long delta, const HChar* opname, IROp op )
23270 Bool shl, shr, sar;
23271 UChar rm = getUChar(delta);
23272 IRTemp e0 = newTemp(Ity_V128);
23273 IRTemp e1 = newTemp(Ity_V128);
23274 UInt rD = getVexNvvvv(pfx);
23275 UChar amt, size;
23276 vassert(epartIsReg(rm));
23277 vassert(gregLO3ofRM(rm) == 2
23278 || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
23279 amt = getUChar(delta+1);
23280 delta += 2;
23281 DIP("%s $%d,%s,%s\n", opname,
23282 (Int)amt,
23283 nameXMMReg(eregOfRexRM(pfx,rm)),
23284 nameXMMReg(rD));
23285 assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
23287 shl = shr = sar = False;
23288 size = 0;
23289 switch (op) {
23290 case Iop_ShlN16x8: shl = True; size = 16; break;
23291 case Iop_ShlN32x4: shl = True; size = 32; break;
23292 case Iop_ShlN64x2: shl = True; size = 64; break;
23293 case Iop_SarN16x8: sar = True; size = 16; break;
23294 case Iop_SarN32x4: sar = True; size = 32; break;
23295 case Iop_ShrN16x8: shr = True; size = 16; break;
23296 case Iop_ShrN32x4: shr = True; size = 32; break;
23297 case Iop_ShrN64x2: shr = True; size = 64; break;
23298 default: vassert(0);
23301 if (shl || shr) {
23302 assign( e1, amt >= size
23303 ? mkV128(0x0000)
23304 : binop(op, mkexpr(e0), mkU8(amt)) );
23306 } else
23307 if (sar) {
23308 assign( e1, amt >= size
23309 ? binop(op, mkexpr(e0), mkU8(size-1))
23310 : binop(op, mkexpr(e0), mkU8(amt)) );
23312 } else {
23313 vassert(0);
23316 putYMMRegLoAndZU( rD, mkexpr(e1) );
23317 return delta;
23321 /* Vector by scalar shift of E into V, by an immediate byte. Modified
23322 version of dis_AVX128_shiftE_to_V_imm. */
23323 static
23324 Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
23325 Long delta, const HChar* opname, IROp op )
23327 Bool shl, shr, sar;
23328 UChar rm = getUChar(delta);
23329 IRTemp e0 = newTemp(Ity_V256);
23330 IRTemp e1 = newTemp(Ity_V256);
23331 UInt rD = getVexNvvvv(pfx);
23332 UChar amt, size;
23333 vassert(epartIsReg(rm));
23334 vassert(gregLO3ofRM(rm) == 2
23335 || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
23336 amt = getUChar(delta+1);
23337 delta += 2;
23338 DIP("%s $%d,%s,%s\n", opname,
23339 (Int)amt,
23340 nameYMMReg(eregOfRexRM(pfx,rm)),
23341 nameYMMReg(rD));
23342 assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
23344 shl = shr = sar = False;
23345 size = 0;
23346 switch (op) {
23347 case Iop_ShlN16x16: shl = True; size = 16; break;
23348 case Iop_ShlN32x8: shl = True; size = 32; break;
23349 case Iop_ShlN64x4: shl = True; size = 64; break;
23350 case Iop_SarN16x16: sar = True; size = 16; break;
23351 case Iop_SarN32x8: sar = True; size = 32; break;
23352 case Iop_ShrN16x16: shr = True; size = 16; break;
23353 case Iop_ShrN32x8: shr = True; size = 32; break;
23354 case Iop_ShrN64x4: shr = True; size = 64; break;
23355 default: vassert(0);
23359 if (shl || shr) {
23360 assign( e1, amt >= size
23361 ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
23362 : binop(op, mkexpr(e0), mkU8(amt)) );
23364 } else
23365 if (sar) {
23366 assign( e1, amt >= size
23367 ? binop(op, mkexpr(e0), mkU8(size-1))
23368 : binop(op, mkexpr(e0), mkU8(amt)) );
23370 } else {
23371 vassert(0);
23374 putYMMReg( rD, mkexpr(e1) );
23375 return delta;
23379 /* Lower 64-bit lane only AVX128 binary operation:
23380 G[63:0] = V[63:0] `op` E[63:0]
23381 G[127:64] = V[127:64]
23382 G[255:128] = 0.
23383 The specified op must be of the 64F0x2 kind, so that it
23384 copies the upper half of the left operand to the result. */
23386 static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
23387 const VexAbiInfo* vbi,
23388 Prefix pfx, Long delta,
23389 const HChar* opname, IROp op )
23391 HChar dis_buf[50];
23392 Int alen;
23393 IRTemp addr;
23394 UChar rm = getUChar(delta);
23395 UInt rG = gregOfRexRM(pfx,rm);
23396 UInt rV = getVexNvvvv(pfx);
23397 IRExpr* vpart = getXMMReg(rV);
23398 if (epartIsReg(rm)) {
23399 UInt rE = eregOfRexRM(pfx,rm);
23400 putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
23401 DIP("%s %s,%s,%s\n", opname,
23402 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23403 delta = delta+1;
23404 } else {
23405 /* We can only do a 64-bit memory read, so the upper half of the
23406 E operand needs to be made simply of zeroes. */
23407 IRTemp epart = newTemp(Ity_V128);
23408 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23409 assign( epart, unop( Iop_64UtoV128,
23410 loadLE(Ity_I64, mkexpr(addr))) );
23411 putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
23412 DIP("%s %s,%s,%s\n", opname,
23413 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23414 delta = delta+alen;
23416 putYMMRegLane128( rG, 1, mkV128(0) );
23417 *uses_vvvv = True;
23418 return delta;
23422 /* Lower 64-bit lane only AVX128 unary operation:
23423 G[63:0] = op(E[63:0])
23424 G[127:64] = V[127:64]
23425 G[255:128] = 0
23426 The specified op must be of the 64F0x2 kind, so that it
23427 copies the upper half of the operand to the result. */
23429 static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
23430 const VexAbiInfo* vbi,
23431 Prefix pfx, Long delta,
23432 const HChar* opname, IROp op )
23434 HChar dis_buf[50];
23435 Int alen;
23436 IRTemp addr;
23437 UChar rm = getUChar(delta);
23438 UInt rG = gregOfRexRM(pfx,rm);
23439 UInt rV = getVexNvvvv(pfx);
23440 IRTemp e64 = newTemp(Ity_I64);
23442 /* Fetch E[63:0] */
23443 if (epartIsReg(rm)) {
23444 UInt rE = eregOfRexRM(pfx,rm);
23445 assign(e64, getXMMRegLane64(rE, 0));
23446 DIP("%s %s,%s,%s\n", opname,
23447 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23448 delta += 1;
23449 } else {
23450 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23451 assign(e64, loadLE(Ity_I64, mkexpr(addr)));
23452 DIP("%s %s,%s,%s\n", opname,
23453 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23454 delta += alen;
23457 /* Create a value 'arg' as V[127:64]++E[63:0] */
23458 IRTemp arg = newTemp(Ity_V128);
23459 assign(arg,
23460 binop(Iop_SetV128lo64,
23461 getXMMReg(rV), mkexpr(e64)));
23462 /* and apply op to it */
23463 putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
23464 *uses_vvvv = True;
23465 return delta;
23469 /* Lower 32-bit lane only AVX128 unary operation:
23470 G[31:0] = op(E[31:0])
23471 G[127:32] = V[127:32]
23472 G[255:128] = 0
23473 The specified op must be of the 32F0x4 kind, so that it
23474 copies the upper 3/4 of the operand to the result. */
23476 static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
23477 const VexAbiInfo* vbi,
23478 Prefix pfx, Long delta,
23479 const HChar* opname, IROp op )
23481 HChar dis_buf[50];
23482 Int alen;
23483 IRTemp addr;
23484 UChar rm = getUChar(delta);
23485 UInt rG = gregOfRexRM(pfx,rm);
23486 UInt rV = getVexNvvvv(pfx);
23487 IRTemp e32 = newTemp(Ity_I32);
23489 /* Fetch E[31:0] */
23490 if (epartIsReg(rm)) {
23491 UInt rE = eregOfRexRM(pfx,rm);
23492 assign(e32, getXMMRegLane32(rE, 0));
23493 DIP("%s %s,%s,%s\n", opname,
23494 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23495 delta += 1;
23496 } else {
23497 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23498 assign(e32, loadLE(Ity_I32, mkexpr(addr)));
23499 DIP("%s %s,%s,%s\n", opname,
23500 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23501 delta += alen;
23504 /* Create a value 'arg' as V[127:32]++E[31:0] */
23505 IRTemp arg = newTemp(Ity_V128);
23506 assign(arg,
23507 binop(Iop_SetV128lo32,
23508 getXMMReg(rV), mkexpr(e32)));
23509 /* and apply op to it */
23510 putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
23511 *uses_vvvv = True;
23512 return delta;
23516 /* Lower 32-bit lane only AVX128 binary operation:
23517 G[31:0] = V[31:0] `op` E[31:0]
23518 G[127:32] = V[127:32]
23519 G[255:128] = 0.
23520 The specified op must be of the 32F0x4 kind, so that it
23521 copies the upper 3/4 of the left operand to the result. */
23523 static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
23524 const VexAbiInfo* vbi,
23525 Prefix pfx, Long delta,
23526 const HChar* opname, IROp op )
23528 HChar dis_buf[50];
23529 Int alen;
23530 IRTemp addr;
23531 UChar rm = getUChar(delta);
23532 UInt rG = gregOfRexRM(pfx,rm);
23533 UInt rV = getVexNvvvv(pfx);
23534 IRExpr* vpart = getXMMReg(rV);
23535 if (epartIsReg(rm)) {
23536 UInt rE = eregOfRexRM(pfx,rm);
23537 putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
23538 DIP("%s %s,%s,%s\n", opname,
23539 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23540 delta = delta+1;
23541 } else {
23542 /* We can only do a 32-bit memory read, so the upper 3/4 of the
23543 E operand needs to be made simply of zeroes. */
23544 IRTemp epart = newTemp(Ity_V128);
23545 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23546 assign( epart, unop( Iop_32UtoV128,
23547 loadLE(Ity_I32, mkexpr(addr))) );
23548 putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
23549 DIP("%s %s,%s,%s\n", opname,
23550 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23551 delta = delta+alen;
23553 putYMMRegLane128( rG, 1, mkV128(0) );
23554 *uses_vvvv = True;
23555 return delta;
23559 /* All-lanes AVX128 binary operation:
23560 G[127:0] = V[127:0] `op` E[127:0]
23561 G[255:128] = 0. */
23563 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
23564 const VexAbiInfo* vbi,
23565 Prefix pfx, Long delta,
23566 const HChar* opname, IROp op )
23568 return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23569 uses_vvvv, vbi, pfx, delta, opname, op,
23570 NULL, False/*!invertLeftArg*/, False/*!swapArgs*/ );
23575 /* Handles AVX128 32F/64F comparisons. A derivative of
23576 dis_SSEcmp_E_to_G. It can fail, in which case it returns the
23577 original delta to indicate failure. */
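/* The imm8 byte selects the comparison predicate. findSSECmpOp maps
it onto a single IROp plus three fixups which are applied below:
preSwap (swap the arguments first), postNot (invert the result
afterwards) and preZero (compare zeroed arguments, used for the
degenerate always-true/always-false predicates). */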
23578 static
23579 Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
23580 const VexAbiInfo* vbi,
23581 Prefix pfx, Long delta,
23582 const HChar* opname, Bool all_lanes, Int sz )
23584 vassert(sz == 4 || sz == 8);
23585 Long deltaIN = delta;
23586 HChar dis_buf[50];
23587 Int alen;
23588 UInt imm8;
23589 IRTemp addr;
23590 Bool preZero = False;
23591 Bool preSwap = False;
23592 IROp op = Iop_INVALID;
23593 Bool postNot = False;
23594 IRTemp plain = newTemp(Ity_V128);
23595 UChar rm = getUChar(delta);
23596 UInt rG = gregOfRexRM(pfx, rm);
23597 UInt rV = getVexNvvvv(pfx);
23598 IRTemp argL = newTemp(Ity_V128);
23599 IRTemp argR = newTemp(Ity_V128);
23601 assign(argL, getXMMReg(rV));
23602 if (epartIsReg(rm)) {
23603 imm8 = getUChar(delta+1);
23604 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot,
23605 imm8, all_lanes, sz);
23606 if (!ok) return deltaIN; /* FAIL */
23607 UInt rE = eregOfRexRM(pfx,rm);
23608 assign(argR, getXMMReg(rE));
23609 delta += 1+1;
23610 DIP("%s $%u,%s,%s,%s\n",
23611 opname, imm8,
23612 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23613 } else {
23614 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23615 imm8 = getUChar(delta+alen);
23616 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot,
23617 imm8, all_lanes, sz);
23618 if (!ok) return deltaIN; /* FAIL */
23619 assign(argR,
23620 all_lanes ? loadLE(Ity_V128, mkexpr(addr))
23621 : sz == 8 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
23622 : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
23623 delta += alen+1;
23624 DIP("%s $%u,%s,%s,%s\n",
23625 opname, imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23628 IRTemp argMask = newTemp(Ity_V128);
23629 if (preZero) {
23630 // In this case, preSwap is irrelevant, but it's harmless to honour it
23631 // anyway.
23632 assign(argMask, mkV128(all_lanes ? 0x0000 : (sz==4 ? 0xFFF0 : 0xFF00)));
23633 } else {
23634 assign(argMask, mkV128(0xFFFF));
23637 assign(
23638 plain,
23639 preSwap ? binop(op, binop(Iop_AndV128, mkexpr(argR), mkexpr(argMask)),
23640 binop(Iop_AndV128, mkexpr(argL), mkexpr(argMask)))
23641 : binop(op, binop(Iop_AndV128, mkexpr(argL), mkexpr(argMask)),
23642 binop(Iop_AndV128, mkexpr(argR), mkexpr(argMask))) );
23645 if (all_lanes) {
23646 /* This is simple: just invert the result, if necessary, and
23647 have done. */
23648 if (postNot) {
23649 putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
23650 } else {
23651 putYMMRegLoAndZU( rG, mkexpr(plain) );
23654 else
23655 if (!preSwap) {
23656 /* More complex. It's one-lane-only, so we may need to
23657 invert just that one lane. But at least the other lanes are
23658 correctly "in" the result, having been copied from the left
23659 operand (argL). */
23660 if (postNot) {
23661 IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
23662 putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
23663 mask) );
23664 } else {
23665 putYMMRegLoAndZU( rG, mkexpr(plain) );
23668 else {
23669 /* This is the most complex case. One-lane-only, but the args
23670 were swapped. So we have to possibly invert the bottom lane,
23671 and (definitely) we have to copy the upper lane(s) from argL
23672 since, due to the swapping, what's currently there is from
23673 argR, which is not correct. */
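/* In effect: res = ((postNot ? ~plain : plain) & mask)
| (argL & notMask). */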
23674 IRTemp res = newTemp(Ity_V128);
23675 IRTemp mask = newTemp(Ity_V128);
23676 IRTemp notMask = newTemp(Ity_V128);
23677 assign(mask, mkV128(sz==4 ? 0x000F : 0x00FF));
23678 assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
23679 if (postNot) {
23680 assign(res,
23681 binop(Iop_OrV128,
23682 binop(Iop_AndV128,
23683 unop(Iop_NotV128, mkexpr(plain)),
23684 mkexpr(mask)),
23685 binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
23686 } else {
23687 assign(res,
23688 binop(Iop_OrV128,
23689 binop(Iop_AndV128,
23690 mkexpr(plain),
23691 mkexpr(mask)),
23692 binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
23694 putYMMRegLoAndZU( rG, mkexpr(res) );
23697 *uses_vvvv = True;
23698 return delta;
23702 /* Handles AVX256 32F/64F comparisons. A derivative of
23703 dis_SSEcmp_E_to_G. It can fail, in which case it returns the
23704 original delta to indicate failure. */
23705 static
23706 Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
23707 const VexAbiInfo* vbi,
23708 Prefix pfx, Long delta,
23709 const HChar* opname, Int sz )
23711 vassert(sz == 4 || sz == 8);
23712 Long deltaIN = delta;
23713 HChar dis_buf[50];
23714 Int alen;
23715 UInt imm8;
23716 IRTemp addr;
23717 Bool preZero = False;
23718 Bool preSwap = False;
23719 IROp op = Iop_INVALID;
23720 Bool postNot = False;
23721 IRTemp plain = newTemp(Ity_V256);
23722 UChar rm = getUChar(delta);
23723 UInt rG = gregOfRexRM(pfx, rm);
23724 UInt rV = getVexNvvvv(pfx);
23725 IRTemp argL = newTemp(Ity_V256);
23726 IRTemp argR = newTemp(Ity_V256);
23727 IRTemp argLhi = IRTemp_INVALID;
23728 IRTemp argLlo = IRTemp_INVALID;
23729 IRTemp argRhi = IRTemp_INVALID;
23730 IRTemp argRlo = IRTemp_INVALID;
23732 assign(argL, getYMMReg(rV));
23733 if (epartIsReg(rm)) {
23734 imm8 = getUChar(delta+1);
23735 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot, imm8,
23736 True/*all_lanes*/, sz);
23737 if (!ok) return deltaIN; /* FAIL */
23738 UInt rE = eregOfRexRM(pfx,rm);
23739 assign(argR, getYMMReg(rE));
23740 delta += 1+1;
23741 DIP("%s $%u,%s,%s,%s\n",
23742 opname, imm8,
23743 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
23744 } else {
23745 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23746 imm8 = getUChar(delta+alen);
23747 Bool ok = findSSECmpOp(&preZero, &preSwap, &op, &postNot, imm8,
23748 True/*all_lanes*/, sz);
23749 if (!ok) return deltaIN; /* FAIL */
23750 assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
23751 delta += alen+1;
23752 DIP("%s $%u,%s,%s,%s\n",
23753 opname, imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
23756 breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
23757 breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
23759 IRTemp argMask = newTemp(Ity_V128);
23760 if (preZero) {
23761 // In this case, preSwap is irrelevant, but it's harmless to honour it
23762 // anyway.
23763 assign(argMask, mkV128(0x0000));
23764 } else {
23765 assign(argMask, mkV128(0xFFFF));
23768 assign(
23769 plain,
23770 binop( Iop_V128HLtoV256,
23771 binop(op, binop(Iop_AndV128, mkexpr(argLhi), mkexpr(argMask)),
23772 binop(Iop_AndV128, mkexpr(argRhi), mkexpr(argMask))),
23773 binop(op, binop(Iop_AndV128, mkexpr(argLlo), mkexpr(argMask)),
23774 binop(Iop_AndV128, mkexpr(argRlo), mkexpr(argMask))))
23777 /* This is simple: just invert the result, if necessary, and
23778 have done. */
23779 if (postNot) {
23780 putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
23781 } else {
23782 putYMMReg( rG, mkexpr(plain) );
23785 *uses_vvvv = True;
23786 return delta;
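/* Illustrative sketch only (not an actual call site): because the helper
   above signals failure by returning the original delta, a hypothetical
   caller would look like

      Long delta0 = delta;
      delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                       "vcmppd", 8 );
      if (delta > delta0) goto decode_success;
      ... otherwise imm8 was not a recognised comparison; fall through ...
*/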
23790 /* Handles AVX128 unary E-to-G all-lanes operations, where the result is computed by the supplied IR-generating function. */
23791 static
23792 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
23793 const VexAbiInfo* vbi,
23794 Prefix pfx, Long delta,
23795 const HChar* opname,
23796 IRTemp (*opFn)(IRTemp) )
23798 HChar dis_buf[50];
23799 Int alen;
23800 IRTemp addr;
23801 IRTemp res = newTemp(Ity_V128);
23802 IRTemp arg = newTemp(Ity_V128);
23803 UChar rm = getUChar(delta);
23804 UInt rG = gregOfRexRM(pfx, rm);
23805 if (epartIsReg(rm)) {
23806 UInt rE = eregOfRexRM(pfx,rm);
23807 assign(arg, getXMMReg(rE));
23808 delta += 1;
23809 DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
23810 } else {
23811 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23812 assign(arg, loadLE(Ity_V128, mkexpr(addr)));
23813 delta += alen;
23814 DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
23816 res = opFn(arg);
23817 putYMMRegLoAndZU( rG, mkexpr(res) );
23818 *uses_vvvv = False;
23819 return delta;
23823 /* Handles AVX128 unary E-to-G all-lanes operations expressed as a single IROp. */
23824 static
23825 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
23826 const VexAbiInfo* vbi,
23827 Prefix pfx, Long delta,
23828 const HChar* opname, IROp op )
23830 HChar dis_buf[50];
23831 Int alen;
23832 IRTemp addr;
23833 IRTemp arg = newTemp(Ity_V128);
23834 UChar rm = getUChar(delta);
23835 UInt rG = gregOfRexRM(pfx, rm);
23836 if (epartIsReg(rm)) {
23837 UInt rE = eregOfRexRM(pfx,rm);
23838 assign(arg, getXMMReg(rE));
23839 delta += 1;
23840 DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
23841 } else {
23842 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23843 assign(arg, loadLE(Ity_V128, mkexpr(addr)));
23844 delta += alen;
23845 DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
23847 // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
23848 // up in the usual way.
23849 Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
23850 /* XXXROUNDINGFIXME */
23851 IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), mkexpr(arg))
23852 : unop(op, mkexpr(arg));
23853 putYMMRegLoAndZU( rG, res );
23854 *uses_vvvv = False;
23855 return delta;
23859 /* FIXME: common up with the _128_ version above? */
23860 static
23861 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
23862 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23863 Prefix pfx, Long delta, const HChar* name,
23864 /* The actual operation. Use either 'op' or 'opFn',
23865 but not both. */
23866 IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
23867 Bool invertLeftArg,
23868 Bool swapArgs
23871 UChar modrm = getUChar(delta);
23872 UInt rD = gregOfRexRM(pfx, modrm);
23873 UInt rSL = getVexNvvvv(pfx);
23874 IRTemp tSL = newTemp(Ity_V256);
23875 IRTemp tSR = newTemp(Ity_V256);
23876 IRTemp addr = IRTemp_INVALID;
23877 HChar dis_buf[50];
23878 Int alen = 0;
23879 vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
23881 assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
23882 : getYMMReg(rSL));
23884 if (epartIsReg(modrm)) {
23885 UInt rSR = eregOfRexRM(pfx, modrm);
23886 delta += 1;
23887 assign(tSR, getYMMReg(rSR));
23888 DIP("%s %s,%s,%s\n",
23889 name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
23890 } else {
23891 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23892 delta += alen;
23893 assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
23894 DIP("%s %s,%s,%s\n",
23895 name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
23898 IRTemp res = IRTemp_INVALID;
23899 if (op != Iop_INVALID) {
23900 vassert(opFn == NULL);
23901 res = newTemp(Ity_V256);
23902 if (requiresRMode(op)) {
23903 IRTemp rm = newTemp(Ity_I32);
23904 assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
23905 assign(res, swapArgs
23906 ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
23907 : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
23908 } else {
23909 assign(res, swapArgs
23910 ? binop(op, mkexpr(tSR), mkexpr(tSL))
23911 : binop(op, mkexpr(tSL), mkexpr(tSR)));
23913 } else {
23914 vassert(opFn != NULL);
23915 res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
23918 putYMMReg(rD, mkexpr(res));
23920 *uses_vvvv = True;
23921 return delta;
23925 /* All-lanes AVX256 binary operation:
23926 G[255:0] = V[255:0] `op` E[255:0]
23928 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
23929 const VexAbiInfo* vbi,
23930 Prefix pfx, Long delta,
23931 const HChar* opname, IROp op )
23933 return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23934 uses_vvvv, vbi, pfx, delta, opname, op,
23935 NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
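/* Sketch of how the wrapper above is used by the opcode cases later in
   this file, for example for 256-bit VADDPD:

      delta = dis_AVX256_E_V_to_G( uses_vvvv, vbi, pfx, delta,
                                   "vaddpd", Iop_Add64Fx4 );

   that is, a single IROp applied to the whole 256-bit vectors, with the
   left argument taken from vvvv, the right from E, and the result
   written to G. */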
23940 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
23941 for the operation, no inversion of the left arg, and no swapping of
23942 args. */
23943 static
23944 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
23945 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23946 Prefix pfx, Long delta, const HChar* name,
23947 IROp op
23950 return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23951 uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
23955 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
23956 generator to compute the result, no inversion of the left
23957 arg, and no swapping of args. */
23958 static
23959 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
23960 /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23961 Prefix pfx, Long delta, const HChar* name,
23962 IRTemp(*opFn)(IRTemp,IRTemp)
23965 return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23966 uses_vvvv, vbi, pfx, delta, name,
23967 Iop_INVALID, opFn, False, False );
23971 /* Handles AVX256 unary E-to-G all-lanes operations, where the result is computed by the supplied IR-generating function. */
23972 static
23973 Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
23974 const VexAbiInfo* vbi,
23975 Prefix pfx, Long delta,
23976 const HChar* opname,
23977 IRTemp (*opFn)(IRTemp) )
23979 HChar dis_buf[50];
23980 Int alen;
23981 IRTemp addr;
23982 IRTemp res = newTemp(Ity_V256);
23983 IRTemp arg = newTemp(Ity_V256);
23984 UChar rm = getUChar(delta);
23985 UInt rG = gregOfRexRM(pfx, rm);
23986 if (epartIsReg(rm)) {
23987 UInt rE = eregOfRexRM(pfx,rm);
23988 assign(arg, getYMMReg(rE));
23989 delta += 1;
23990 DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
23991 } else {
23992 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23993 assign(arg, loadLE(Ity_V256, mkexpr(addr)));
23994 delta += alen;
23995 DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
23997 res = opFn(arg);
23998 putYMMReg( rG, mkexpr(res) );
23999 *uses_vvvv = False;
24000 return delta;
24004 /* Handles AVX256 unary E-to-G all-lanes operations expressed as a single IROp. */
24005 static
24006 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
24007 const VexAbiInfo* vbi,
24008 Prefix pfx, Long delta,
24009 const HChar* opname, IROp op )
24011 HChar dis_buf[50];
24012 Int alen;
24013 IRTemp addr;
24014 IRTemp arg = newTemp(Ity_V256);
24015 UChar rm = getUChar(delta);
24016 UInt rG = gregOfRexRM(pfx, rm);
24017 if (epartIsReg(rm)) {
24018 UInt rE = eregOfRexRM(pfx,rm);
24019 assign(arg, getYMMReg(rE));
24020 delta += 1;
24021 DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
24022 } else {
24023 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24024 assign(arg, loadLE(Ity_V256, mkexpr(addr)));
24025 delta += alen;
24026 DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
24028 putYMMReg( rG, unop(op, mkexpr(arg)) );
24029 *uses_vvvv = False;
24030 return delta;
24034 /* The use of ReinterpF64asI64 is ugly. Surely could do better if we
24035 had a variant of Iop_64x4toV256 that took F64s as args instead. */
24036 static Long dis_CVTDQ2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
24037 Long delta )
24039 IRTemp addr = IRTemp_INVALID;
24040 Int alen = 0;
24041 HChar dis_buf[50];
24042 UChar modrm = getUChar(delta);
24043 IRTemp sV = newTemp(Ity_V128);
24044 UInt rG = gregOfRexRM(pfx,modrm);
24045 if (epartIsReg(modrm)) {
24046 UInt rE = eregOfRexRM(pfx,modrm);
24047 assign( sV, getXMMReg(rE) );
24048 delta += 1;
24049 DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
24050 } else {
24051 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24052 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
24053 delta += alen;
24054 DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
24056 IRTemp s3, s2, s1, s0;
24057 s3 = s2 = s1 = s0 = IRTemp_INVALID;
24058 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
24059 IRExpr* res
24060 = IRExpr_Qop(
24061 Iop_64x4toV256,
24062 unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
24063 unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
24064 unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
24065 unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
24067 putYMMReg(rG, res);
24068 return delta;
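/* A sketch of what the comment before this function wishes for: a
   hypothetical (non-existent) Iop_F64x4toV256 taking F64 arguments
   directly would avoid the reinterpret round-trip, roughly

      IRExpr_Qop( Iop_F64x4toV256,
                  unop(Iop_I32StoF64, mkexpr(s3)),
                  unop(Iop_I32StoF64, mkexpr(s2)),
                  unop(Iop_I32StoF64, mkexpr(s1)),
                  unop(Iop_I32StoF64, mkexpr(s0)) )

   Note that Iop_I32StoF64 needs no rounding mode since every 32-bit int
   is exactly representable as a double. */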
24072 static Long dis_CVTPD2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
24073 Long delta )
24075 IRTemp addr = IRTemp_INVALID;
24076 Int alen = 0;
24077 HChar dis_buf[50];
24078 UChar modrm = getUChar(delta);
24079 UInt rG = gregOfRexRM(pfx,modrm);
24080 IRTemp argV = newTemp(Ity_V256);
24081 IRTemp rmode = newTemp(Ity_I32);
24082 if (epartIsReg(modrm)) {
24083 UInt rE = eregOfRexRM(pfx,modrm);
24084 assign( argV, getYMMReg(rE) );
24085 delta += 1;
24086 DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
24087 } else {
24088 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24089 assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
24090 delta += alen;
24091 DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
24094 assign( rmode, get_sse_roundingmode() );
24095 IRTemp t3, t2, t1, t0;
24096 t3 = t2 = t1 = t0 = IRTemp_INVALID;
24097 breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
24098 # define CVT(_t) binop( Iop_F64toF32, mkexpr(rmode), \
24099 unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
24100 putXMMRegLane32F( rG, 3, CVT(t3) );
24101 putXMMRegLane32F( rG, 2, CVT(t2) );
24102 putXMMRegLane32F( rG, 1, CVT(t1) );
24103 putXMMRegLane32F( rG, 0, CVT(t0) );
24104 # undef CVT
24105 putYMMRegLane128( rG, 1, mkV128(0) );
24106 return delta;
24110 static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRTemp tR, IROp op )
24112 IRTemp tLhi, tLlo, tRhi, tRlo;
24113 tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
24114 IRTemp res = newTemp(Ity_V256);
24115 breakupV256toV128s( tL, &tLhi, &tLlo );
24116 breakupV256toV128s( tR, &tRhi, &tRlo );
24117 assign( res, binop( Iop_V128HLtoV256,
24118 binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
24119 binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
24120 return res;
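/* Sketch of the lane-wise behaviour implemented above: each 128-bit lane
   of the 256-bit sources is processed independently, so for example

      math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 )

   computes, lane by lane,
      res.lane128[1] = InterleaveLO8x16( tR.lane128[1], tL.lane128[1] )
      res.lane128[0] = InterleaveLO8x16( tR.lane128[0], tL.lane128[0] )
   which is how VPUNPCKLBW is defined on ymm registers. */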
24124 static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
24126 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
24130 static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
24132 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
24136 static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
24138 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
24142 static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
24144 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
24148 static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
24150 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
24154 static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
24156 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
24160 static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
24162 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
24166 static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
24168 return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
24172 static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
24174 return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
24178 static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
24180 return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
24184 static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
24186 return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
24190 static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
24192 return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
24196 __attribute__((noinline))
24197 static
24198 Long dis_ESC_0F__VEX (
24199 /*MB_OUT*/DisResult* dres,
24200 /*OUT*/ Bool* uses_vvvv,
24201 const VexArchInfo* archinfo,
24202 const VexAbiInfo* vbi,
24203 Prefix pfx, Int sz, Long deltaIN
24206 IRTemp addr = IRTemp_INVALID;
24207 Int alen = 0;
24208 HChar dis_buf[50];
24209 Long delta = deltaIN;
24210 UChar opc = getUChar(delta);
24211 delta++;
24212 *uses_vvvv = False;
24214 switch (opc) {
24216 case 0x10:
24217 /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
24218 /* Move 64 bits from E (mem only) to G (lo half xmm).
24219 Bits 255-64 of the dest are zeroed out. */
24220 if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
24221 UChar modrm = getUChar(delta);
24222 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24223 UInt rG = gregOfRexRM(pfx,modrm);
24224 IRTemp z128 = newTemp(Ity_V128);
24225 assign(z128, mkV128(0));
24226 putXMMReg( rG, mkexpr(z128) );
24227 /* FIXME: ALIGNMENT CHECK? */
24228 putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
24229 putYMMRegLane128( rG, 1, mkexpr(z128) );
24230 DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
24231 delta += alen;
24232 goto decode_success;
24234 /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
24235 /* Reg form. */
24236 if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
24237 UChar modrm = getUChar(delta);
24238 UInt rG = gregOfRexRM(pfx, modrm);
24239 UInt rE = eregOfRexRM(pfx, modrm);
24240 UInt rV = getVexNvvvv(pfx);
24241 delta++;
24242 DIP("vmovsd %s,%s,%s\n",
24243 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24244 IRTemp res = newTemp(Ity_V128);
24245 assign(res, binop(Iop_64HLtoV128,
24246 getXMMRegLane64(rV, 1),
24247 getXMMRegLane64(rE, 0)));
24248 putYMMRegLoAndZU(rG, mkexpr(res));
24249 *uses_vvvv = True;
24250 goto decode_success;
24252 /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
24253 /* Move 32 bits from E (mem only) to G (lo half xmm).
24254 Bits 255-32 of the dest are zeroed out. */
24255 if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
24256 UChar modrm = getUChar(delta);
24257 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24258 UInt rG = gregOfRexRM(pfx,modrm);
24259 IRTemp z128 = newTemp(Ity_V128);
24260 assign(z128, mkV128(0));
24261 putXMMReg( rG, mkexpr(z128) );
24262 /* FIXME: ALIGNMENT CHECK? */
24263 putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
24264 putYMMRegLane128( rG, 1, mkexpr(z128) );
24265 DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
24266 delta += alen;
24267 goto decode_success;
24269 /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
24270 /* Reg form. */
24271 if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
24272 UChar modrm = getUChar(delta);
24273 UInt rG = gregOfRexRM(pfx, modrm);
24274 UInt rE = eregOfRexRM(pfx, modrm);
24275 UInt rV = getVexNvvvv(pfx);
24276 delta++;
24277 DIP("vmovss %s,%s,%s\n",
24278 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24279 IRTemp res = newTemp(Ity_V128);
24280 assign( res, binop( Iop_64HLtoV128,
24281 getXMMRegLane64(rV, 1),
24282 binop(Iop_32HLto64,
24283 getXMMRegLane32(rV, 1),
24284 getXMMRegLane32(rE, 0)) ) );
24285 putYMMRegLoAndZU(rG, mkexpr(res));
24286 *uses_vvvv = True;
24287 goto decode_success;
24289 /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
24290 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24291 UChar modrm = getUChar(delta);
24292 UInt rG = gregOfRexRM(pfx, modrm);
24293 if (epartIsReg(modrm)) {
24294 UInt rE = eregOfRexRM(pfx,modrm);
24295 putYMMRegLoAndZU( rG, getXMMReg( rE ));
24296 DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24297 delta += 1;
24298 } else {
24299 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24300 putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24301 DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
24302 delta += alen;
24304 goto decode_success;
24306 /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
24307 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24308 UChar modrm = getUChar(delta);
24309 UInt rG = gregOfRexRM(pfx, modrm);
24310 if (epartIsReg(modrm)) {
24311 UInt rE = eregOfRexRM(pfx,modrm);
24312 putYMMReg( rG, getYMMReg( rE ));
24313 DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24314 delta += 1;
24315 } else {
24316 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24317 putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24318 DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
24319 delta += alen;
24321 goto decode_success;
24323 /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
24324 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24325 UChar modrm = getUChar(delta);
24326 UInt rG = gregOfRexRM(pfx, modrm);
24327 if (epartIsReg(modrm)) {
24328 UInt rE = eregOfRexRM(pfx,modrm);
24329 putYMMRegLoAndZU( rG, getXMMReg( rE ));
24330 DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24331 delta += 1;
24332 } else {
24333 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24334 putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24335 DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
24336 delta += alen;
24338 goto decode_success;
24340 /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
24341 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24342 UChar modrm = getUChar(delta);
24343 UInt rG = gregOfRexRM(pfx, modrm);
24344 if (epartIsReg(modrm)) {
24345 UInt rE = eregOfRexRM(pfx,modrm);
24346 putYMMReg( rG, getYMMReg( rE ));
24347 DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24348 delta += 1;
24349 } else {
24350 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24351 putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24352 DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
24353 delta += alen;
24355 goto decode_success;
24357 break;
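      /* In the notation used elsewhere in this file, the reg-form VMOVSD
         above computes
            G[63:0]    = E[63:0]
            G[127:64]  = V[127:64]
            G[255:128] = 0
         i.e. the low half comes from the E register, the rest of the xmm
         from V, and the upper YMM lane is zeroed as VEX encodings
         require; the VMOVSS reg form is analogous at 32-bit granularity. */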
24359 case 0x11:
24360 /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
24361 /* Move 64 bits from G (low half xmm) to mem only. */
24362 if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
24363 UChar modrm = getUChar(delta);
24364 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24365 UInt rG = gregOfRexRM(pfx,modrm);
24366 /* FIXME: ALIGNMENT CHECK? */
24367 storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
24368 DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
24369 delta += alen;
24370 goto decode_success;
24372 /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
24373 /* Reg form. */
24374 if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
24375 UChar modrm = getUChar(delta);
24376 UInt rG = gregOfRexRM(pfx, modrm);
24377 UInt rE = eregOfRexRM(pfx, modrm);
24378 UInt rV = getVexNvvvv(pfx);
24379 delta++;
24380 DIP("vmovsd %s,%s,%s\n",
24381 nameXMMReg(rG), nameXMMReg(rV), nameXMMReg(rE));
24382 IRTemp res = newTemp(Ity_V128);
24383 assign(res, binop(Iop_64HLtoV128,
24384 getXMMRegLane64(rV, 1),
24385 getXMMRegLane64(rG, 0)));
24386 putYMMRegLoAndZU(rE, mkexpr(res));
24387 *uses_vvvv = True;
24388 goto decode_success;
24390       /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
24391 /* Move 32 bits from G (low 1/4 xmm) to mem only. */
24392 if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
24393 UChar modrm = getUChar(delta);
24394 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24395 UInt rG = gregOfRexRM(pfx,modrm);
24396 /* FIXME: ALIGNMENT CHECK? */
24397 storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
24398 DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
24399 delta += alen;
24400 goto decode_success;
24402 /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
24403 /* Reg form. */
24404 if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
24405 UChar modrm = getUChar(delta);
24406 UInt rG = gregOfRexRM(pfx, modrm);
24407 UInt rE = eregOfRexRM(pfx, modrm);
24408 UInt rV = getVexNvvvv(pfx);
24409 delta++;
24410 DIP("vmovss %s,%s,%s\n",
24411 nameXMMReg(rG), nameXMMReg(rV), nameXMMReg(rE));
24412 IRTemp res = newTemp(Ity_V128);
24413 assign( res, binop( Iop_64HLtoV128,
24414 getXMMRegLane64(rV, 1),
24415 binop(Iop_32HLto64,
24416 getXMMRegLane32(rV, 1),
24417 getXMMRegLane32(rG, 0)) ) );
24418 putYMMRegLoAndZU(rE, mkexpr(res));
24419 *uses_vvvv = True;
24420 goto decode_success;
24422 /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
24423 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24424 UChar modrm = getUChar(delta);
24425 UInt rG = gregOfRexRM(pfx,modrm);
24426 if (epartIsReg(modrm)) {
24427 UInt rE = eregOfRexRM(pfx,modrm);
24428 putYMMRegLoAndZU( rE, getXMMReg(rG) );
24429 DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24430 delta += 1;
24431 } else {
24432 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24433 storeLE( mkexpr(addr), getXMMReg(rG) );
24434 DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
24435 delta += alen;
24437 goto decode_success;
24439 /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
24440 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24441 UChar modrm = getUChar(delta);
24442 UInt rG = gregOfRexRM(pfx,modrm);
24443 if (epartIsReg(modrm)) {
24444 UInt rE = eregOfRexRM(pfx,modrm);
24445 putYMMReg( rE, getYMMReg(rG) );
24446 DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24447 delta += 1;
24448 } else {
24449 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24450 storeLE( mkexpr(addr), getYMMReg(rG) );
24451 DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
24452 delta += alen;
24454 goto decode_success;
24456 /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
24457 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24458 UChar modrm = getUChar(delta);
24459 UInt rG = gregOfRexRM(pfx,modrm);
24460 if (epartIsReg(modrm)) {
24461 UInt rE = eregOfRexRM(pfx,modrm);
24462 putYMMRegLoAndZU( rE, getXMMReg(rG) );
24463 DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24464 delta += 1;
24465 } else {
24466 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24467 storeLE( mkexpr(addr), getXMMReg(rG) );
24468 DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
24469 delta += alen;
24471 goto decode_success;
24473 /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
24474 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24475 UChar modrm = getUChar(delta);
24476 UInt rG = gregOfRexRM(pfx,modrm);
24477 if (epartIsReg(modrm)) {
24478 UInt rE = eregOfRexRM(pfx,modrm);
24479 putYMMReg( rE, getYMMReg(rG) );
24480 DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24481 delta += 1;
24482 } else {
24483 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24484 storeLE( mkexpr(addr), getYMMReg(rG) );
24485 DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
24486 delta += alen;
24488 goto decode_success;
24490 break;
24492 case 0x12:
24493       /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG 12 /r */
24494 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24495 delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
24496 goto decode_success;
24498       /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 12 /r */
24499 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24500 delta = dis_MOVDDUP_256( vbi, pfx, delta );
24501 goto decode_success;
24503 /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
24504 /* Insn only exists in reg form */
24505 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
24506 && epartIsReg(getUChar(delta))) {
24507 UChar modrm = getUChar(delta);
24508 UInt rG = gregOfRexRM(pfx, modrm);
24509 UInt rE = eregOfRexRM(pfx, modrm);
24510 UInt rV = getVexNvvvv(pfx);
24511 delta++;
24512 DIP("vmovhlps %s,%s,%s\n",
24513 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24514 IRTemp res = newTemp(Ity_V128);
24515 assign(res, binop(Iop_64HLtoV128,
24516 getXMMRegLane64(rV, 1),
24517 getXMMRegLane64(rE, 1)));
24518 putYMMRegLoAndZU(rG, mkexpr(res));
24519 *uses_vvvv = True;
24520 goto decode_success;
24522 /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
24523 /* Insn exists only in mem form, it appears. */
24524 /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
24525 /* Insn exists only in mem form, it appears. */
24526 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24527 && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24528 UChar modrm = getUChar(delta);
24529 UInt rG = gregOfRexRM(pfx, modrm);
24530 UInt rV = getVexNvvvv(pfx);
24531 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24532 delta += alen;
24533 DIP("vmovlpd %s,%s,%s\n",
24534 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24535 IRTemp res = newTemp(Ity_V128);
24536 assign(res, binop(Iop_64HLtoV128,
24537 getXMMRegLane64(rV, 1),
24538 loadLE(Ity_I64, mkexpr(addr))));
24539 putYMMRegLoAndZU(rG, mkexpr(res));
24540 *uses_vvvv = True;
24541 goto decode_success;
24543       /* VMOVSLDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 12 /r */
24544 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
24545 delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
24546 True/*isL*/ );
24547 goto decode_success;
24549       /* VMOVSLDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 12 /r */
24550 if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
24551 delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
24552 goto decode_success;
24554 break;
24556 case 0x13:
24557 /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
24558 /* Insn exists only in mem form, it appears. */
24559 /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
24560 /* Insn exists only in mem form, it appears. */
24561 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24562 && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24563 UChar modrm = getUChar(delta);
24564 UInt rG = gregOfRexRM(pfx, modrm);
24565 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24566 delta += alen;
24567 storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
24568 DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
24569 goto decode_success;
24571 break;
24573 case 0x14:
24574 case 0x15:
24575 /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
24576 /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
24577 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24578 Bool hi = opc == 0x15;
24579 UChar modrm = getUChar(delta);
24580 UInt rG = gregOfRexRM(pfx,modrm);
24581 UInt rV = getVexNvvvv(pfx);
24582 IRTemp eV = newTemp(Ity_V128);
24583 IRTemp vV = newTemp(Ity_V128);
24584 assign( vV, getXMMReg(rV) );
24585 if (epartIsReg(modrm)) {
24586 UInt rE = eregOfRexRM(pfx,modrm);
24587 assign( eV, getXMMReg(rE) );
24588 delta += 1;
24589 DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
24590 nameXMMReg(rE), nameXMMReg(rG));
24591 } else {
24592 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24593 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
24594 delta += alen;
24595 DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
24596 dis_buf, nameXMMReg(rG));
24598 IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
24599 putYMMRegLoAndZU( rG, mkexpr(res) );
24600 *uses_vvvv = True;
24601 goto decode_success;
24603 /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
24604 /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
24605 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24606 Bool hi = opc == 0x15;
24607 UChar modrm = getUChar(delta);
24608 UInt rG = gregOfRexRM(pfx,modrm);
24609 UInt rV = getVexNvvvv(pfx);
24610 IRTemp eV = newTemp(Ity_V256);
24611 IRTemp vV = newTemp(Ity_V256);
24612 assign( vV, getYMMReg(rV) );
24613 if (epartIsReg(modrm)) {
24614 UInt rE = eregOfRexRM(pfx,modrm);
24615 assign( eV, getYMMReg(rE) );
24616 delta += 1;
24617 DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
24618 nameYMMReg(rE), nameYMMReg(rG));
24619 } else {
24620 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24621 assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
24622 delta += alen;
24623 DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
24624 dis_buf, nameYMMReg(rG));
24626 IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
24627 putYMMReg( rG, mkexpr(res) );
24628 *uses_vvvv = True;
24629 goto decode_success;
24631 /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
24632 /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
24633 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24634 Bool hi = opc == 0x15;
24635 UChar modrm = getUChar(delta);
24636 UInt rG = gregOfRexRM(pfx,modrm);
24637 UInt rV = getVexNvvvv(pfx);
24638 IRTemp eV = newTemp(Ity_V128);
24639 IRTemp vV = newTemp(Ity_V128);
24640 assign( vV, getXMMReg(rV) );
24641 if (epartIsReg(modrm)) {
24642 UInt rE = eregOfRexRM(pfx,modrm);
24643 assign( eV, getXMMReg(rE) );
24644 delta += 1;
24645 DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24646 nameXMMReg(rE), nameXMMReg(rG));
24647 } else {
24648 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24649 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
24650 delta += alen;
24651 DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24652 dis_buf, nameXMMReg(rG));
24654 IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
24655 putYMMRegLoAndZU( rG, mkexpr(res) );
24656 *uses_vvvv = True;
24657 goto decode_success;
24659 /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
24660 /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
24661 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24662 Bool hi = opc == 0x15;
24663 UChar modrm = getUChar(delta);
24664 UInt rG = gregOfRexRM(pfx,modrm);
24665 UInt rV = getVexNvvvv(pfx);
24666 IRTemp eV = newTemp(Ity_V256);
24667 IRTemp vV = newTemp(Ity_V256);
24668 assign( vV, getYMMReg(rV) );
24669 if (epartIsReg(modrm)) {
24670 UInt rE = eregOfRexRM(pfx,modrm);
24671 assign( eV, getYMMReg(rE) );
24672 delta += 1;
24673 DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24674 nameYMMReg(rE), nameYMMReg(rG));
24675 } else {
24676 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24677 assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
24678 delta += alen;
24679 DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24680 dis_buf, nameYMMReg(rG));
24682 IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
24683 putYMMReg( rG, mkexpr(res) );
24684 *uses_vvvv = True;
24685 goto decode_success;
24687 break;
24689 case 0x16:
24690 /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
24691 /* Insn only exists in reg form */
24692 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
24693 && epartIsReg(getUChar(delta))) {
24694 UChar modrm = getUChar(delta);
24695 UInt rG = gregOfRexRM(pfx, modrm);
24696 UInt rE = eregOfRexRM(pfx, modrm);
24697 UInt rV = getVexNvvvv(pfx);
24698 delta++;
24699 DIP("vmovlhps %s,%s,%s\n",
24700 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24701 IRTemp res = newTemp(Ity_V128);
24702 assign(res, binop(Iop_64HLtoV128,
24703 getXMMRegLane64(rE, 0),
24704 getXMMRegLane64(rV, 0)));
24705 putYMMRegLoAndZU(rG, mkexpr(res));
24706 *uses_vvvv = True;
24707 goto decode_success;
24709 /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
24710 /* Insn exists only in mem form, it appears. */
24711 /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
24712 /* Insn exists only in mem form, it appears. */
24713 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24714 && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24715 UChar modrm = getUChar(delta);
24716 UInt rG = gregOfRexRM(pfx, modrm);
24717 UInt rV = getVexNvvvv(pfx);
24718 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24719 delta += alen;
24720 DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
24721 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24722 IRTemp res = newTemp(Ity_V128);
24723 assign(res, binop(Iop_64HLtoV128,
24724 loadLE(Ity_I64, mkexpr(addr)),
24725 getXMMRegLane64(rV, 0)));
24726 putYMMRegLoAndZU(rG, mkexpr(res));
24727 *uses_vvvv = True;
24728 goto decode_success;
24730       /* VMOVSHDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 16 /r */
24731 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
24732 delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
24733 False/*!isL*/ );
24734 goto decode_success;
24736       /* VMOVSHDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 16 /r */
24737 if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
24738 delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
24739 goto decode_success;
24741 break;
24743 case 0x17:
24744 /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
24745 /* Insn exists only in mem form, it appears. */
24746 /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
24747 /* Insn exists only in mem form, it appears. */
24748 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24749 && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24750 UChar modrm = getUChar(delta);
24751 UInt rG = gregOfRexRM(pfx, modrm);
24752 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24753 delta += alen;
24754 storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
24755 DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24756 nameXMMReg(rG), dis_buf);
24757 goto decode_success;
24759 break;
24761 case 0x28:
24762 /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
24763 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24764 UChar modrm = getUChar(delta);
24765 UInt rG = gregOfRexRM(pfx, modrm);
24766 if (epartIsReg(modrm)) {
24767 UInt rE = eregOfRexRM(pfx,modrm);
24768 putYMMRegLoAndZU( rG, getXMMReg( rE ));
24769 DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24770 delta += 1;
24771 } else {
24772 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24773 gen_SIGNAL_if_not_16_aligned( vbi, addr );
24774 putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24775 DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
24776 delta += alen;
24778 goto decode_success;
24780 /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
24781 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24782 UChar modrm = getUChar(delta);
24783 UInt rG = gregOfRexRM(pfx, modrm);
24784 if (epartIsReg(modrm)) {
24785 UInt rE = eregOfRexRM(pfx,modrm);
24786 putYMMReg( rG, getYMMReg( rE ));
24787 DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24788 delta += 1;
24789 } else {
24790 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24791 gen_SIGNAL_if_not_32_aligned( vbi, addr );
24792 putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24793 DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
24794 delta += alen;
24796 goto decode_success;
24798 /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
24799 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24800 UChar modrm = getUChar(delta);
24801 UInt rG = gregOfRexRM(pfx, modrm);
24802 if (epartIsReg(modrm)) {
24803 UInt rE = eregOfRexRM(pfx,modrm);
24804 putYMMRegLoAndZU( rG, getXMMReg( rE ));
24805 DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24806 delta += 1;
24807 } else {
24808 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24809 gen_SIGNAL_if_not_16_aligned( vbi, addr );
24810 putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24811 DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
24812 delta += alen;
24814 goto decode_success;
24816 /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
24817 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24818 UChar modrm = getUChar(delta);
24819 UInt rG = gregOfRexRM(pfx, modrm);
24820 if (epartIsReg(modrm)) {
24821 UInt rE = eregOfRexRM(pfx,modrm);
24822 putYMMReg( rG, getYMMReg( rE ));
24823 DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24824 delta += 1;
24825 } else {
24826 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24827 gen_SIGNAL_if_not_32_aligned( vbi, addr );
24828 putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24829 DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
24830 delta += alen;
24832 goto decode_success;
24834 break;
24836 case 0x29:
24837 /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
24838 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24839 UChar modrm = getUChar(delta);
24840 UInt rG = gregOfRexRM(pfx,modrm);
24841 if (epartIsReg(modrm)) {
24842 UInt rE = eregOfRexRM(pfx,modrm);
24843 putYMMRegLoAndZU( rE, getXMMReg(rG) );
24844 DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24845 delta += 1;
24846 } else {
24847 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24848 gen_SIGNAL_if_not_16_aligned( vbi, addr );
24849 storeLE( mkexpr(addr), getXMMReg(rG) );
24850 DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
24851 delta += alen;
24853 goto decode_success;
24855 /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
24856 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24857 UChar modrm = getUChar(delta);
24858 UInt rG = gregOfRexRM(pfx,modrm);
24859 if (epartIsReg(modrm)) {
24860 UInt rE = eregOfRexRM(pfx,modrm);
24861 putYMMReg( rE, getYMMReg(rG) );
24862 DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24863 delta += 1;
24864 } else {
24865 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24866 gen_SIGNAL_if_not_32_aligned( vbi, addr );
24867 storeLE( mkexpr(addr), getYMMReg(rG) );
24868 DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
24869 delta += alen;
24871 goto decode_success;
24873 /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
24874 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24875 UChar modrm = getUChar(delta);
24876 UInt rG = gregOfRexRM(pfx,modrm);
24877 if (epartIsReg(modrm)) {
24878 UInt rE = eregOfRexRM(pfx,modrm);
24879 putYMMRegLoAndZU( rE, getXMMReg(rG) );
24880 DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24881 delta += 1;
24882 goto decode_success;
24883 } else {
24884 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24885 gen_SIGNAL_if_not_16_aligned( vbi, addr );
24886 storeLE( mkexpr(addr), getXMMReg(rG) );
24887 DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
24888 delta += alen;
24889 goto decode_success;
24892 /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
24893 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24894 UChar modrm = getUChar(delta);
24895 UInt rG = gregOfRexRM(pfx,modrm);
24896 if (epartIsReg(modrm)) {
24897 UInt rE = eregOfRexRM(pfx,modrm);
24898 putYMMReg( rE, getYMMReg(rG) );
24899 DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24900 delta += 1;
24901 goto decode_success;
24902 } else {
24903 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24904 gen_SIGNAL_if_not_32_aligned( vbi, addr );
24905 storeLE( mkexpr(addr), getYMMReg(rG) );
24906 DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
24907 delta += alen;
24908 goto decode_success;
24911 break;
24913 case 0x2A: {
24914 IRTemp rmode = newTemp(Ity_I32);
24915 assign( rmode, get_sse_roundingmode() );
24916 /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
24917 if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24918 UChar modrm = getUChar(delta);
24919 UInt rV = getVexNvvvv(pfx);
24920 UInt rD = gregOfRexRM(pfx, modrm);
24921 IRTemp arg32 = newTemp(Ity_I32);
24922 if (epartIsReg(modrm)) {
24923 UInt rS = eregOfRexRM(pfx,modrm);
24924 assign( arg32, getIReg32(rS) );
24925 delta += 1;
24926 DIP("vcvtsi2sdl %s,%s,%s\n",
24927 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
24928 } else {
24929 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24930 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
24931 delta += alen;
24932 DIP("vcvtsi2sdl %s,%s,%s\n",
24933 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24935 putXMMRegLane64F( rD, 0,
24936 unop(Iop_I32StoF64, mkexpr(arg32)));
24937 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24938 putYMMRegLane128( rD, 1, mkV128(0) );
24939 *uses_vvvv = True;
24940 goto decode_success;
24942 /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
24943 if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24944 UChar modrm = getUChar(delta);
24945 UInt rV = getVexNvvvv(pfx);
24946 UInt rD = gregOfRexRM(pfx, modrm);
24947 IRTemp arg64 = newTemp(Ity_I64);
24948 if (epartIsReg(modrm)) {
24949 UInt rS = eregOfRexRM(pfx,modrm);
24950 assign( arg64, getIReg64(rS) );
24951 delta += 1;
24952 DIP("vcvtsi2sdq %s,%s,%s\n",
24953 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24954 } else {
24955 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24956 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24957 delta += alen;
24958 DIP("vcvtsi2sdq %s,%s,%s\n",
24959 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24961 putXMMRegLane64F( rD, 0,
24962 binop( Iop_I64StoF64,
24963 get_sse_roundingmode(),
24964 mkexpr(arg64)) );
24965 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24966 putYMMRegLane128( rD, 1, mkV128(0) );
24967 *uses_vvvv = True;
24968 goto decode_success;
24970 /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
24971 if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24972 UChar modrm = getUChar(delta);
24973 UInt rV = getVexNvvvv(pfx);
24974 UInt rD = gregOfRexRM(pfx, modrm);
24975 IRTemp arg64 = newTemp(Ity_I64);
24976 if (epartIsReg(modrm)) {
24977 UInt rS = eregOfRexRM(pfx,modrm);
24978 assign( arg64, getIReg64(rS) );
24979 delta += 1;
24980 DIP("vcvtsi2ssq %s,%s,%s\n",
24981 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24982 } else {
24983 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24984 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24985 delta += alen;
24986 DIP("vcvtsi2ssq %s,%s,%s\n",
24987 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24989 putXMMRegLane32F( rD, 0,
24990 binop(Iop_F64toF32,
24991 mkexpr(rmode),
24992 binop(Iop_I64StoF64, mkexpr(rmode),
24993 mkexpr(arg64)) ) );
24994 putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
24995 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24996 putYMMRegLane128( rD, 1, mkV128(0) );
24997 *uses_vvvv = True;
24998 goto decode_success;
25000 /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
25001 if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
25002 UChar modrm = getUChar(delta);
25003 UInt rV = getVexNvvvv(pfx);
25004 UInt rD = gregOfRexRM(pfx, modrm);
25005 IRTemp arg32 = newTemp(Ity_I32);
25006 if (epartIsReg(modrm)) {
25007 UInt rS = eregOfRexRM(pfx,modrm);
25008 assign( arg32, getIReg32(rS) );
25009 delta += 1;
25010 DIP("vcvtsi2ssl %s,%s,%s\n",
25011 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
25012 } else {
25013 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25014 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
25015 delta += alen;
25016 DIP("vcvtsi2ssl %s,%s,%s\n",
25017 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
25019 putXMMRegLane32F( rD, 0,
25020 binop(Iop_F64toF32,
25021 mkexpr(rmode),
25022 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
25023 putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
25024 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
25025 putYMMRegLane128( rD, 1, mkV128(0) );
25026 *uses_vvvv = True;
25027 goto decode_success;
25029 break;
25032 case 0x2B:
25033 /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
25034 /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
25035 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
25036 && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
25037 UChar modrm = getUChar(delta);
25038 UInt rS = gregOfRexRM(pfx, modrm);
25039 IRTemp tS = newTemp(Ity_V128);
25040 assign(tS, getXMMReg(rS));
25041 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25042 delta += alen;
25043 gen_SIGNAL_if_not_16_aligned(vbi, addr);
25044 storeLE(mkexpr(addr), mkexpr(tS));
25045 DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
25046 nameXMMReg(rS), dis_buf);
25047 goto decode_success;
25049 /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
25050 /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
25051 if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
25052 && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
25053 UChar modrm = getUChar(delta);
25054 UInt rS = gregOfRexRM(pfx, modrm);
25055 IRTemp tS = newTemp(Ity_V256);
25056 assign(tS, getYMMReg(rS));
25057 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25058 delta += alen;
25059 gen_SIGNAL_if_not_32_aligned(vbi, addr);
25060 storeLE(mkexpr(addr), mkexpr(tS));
25061 DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
25062 nameYMMReg(rS), dis_buf);
25063 goto decode_success;
25065 break;
25067 case 0x2C:
25068       /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
25069 if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
25070 delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
25071 goto decode_success;
25073 /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
25074 if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
25075 delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
25076 goto decode_success;
25078 /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
25079 if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
25080 delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
25081 goto decode_success;
25083       /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
25084 if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
25085 delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
25086 goto decode_success;
25088 break;
25090 case 0x2D:
25091       /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
25092 if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
25093 delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
25094 goto decode_success;
25096 /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
25097 if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
25098 delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
25099 goto decode_success;
25101 /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
25102 if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
25103 delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
25104 goto decode_success;
25106       /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
25107 if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
25108 delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
25109 goto decode_success;
25111 break;
25113 case 0x2E:
25114 case 0x2F:
25115 /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
25116 /* VCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
25117 if (have66noF2noF3(pfx)) {
25118 delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
25119 goto decode_success;
25121 /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
25122 /* VCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2F /r */
25123 if (haveNo66noF2noF3(pfx)) {
25124 delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
25125 goto decode_success;
25127 break;
25129 case 0x50:
25130 /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
25131 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25132 delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
25133 goto decode_success;
25135 /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
25136 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25137 delta = dis_MOVMSKPD_256( vbi, pfx, delta );
25138 goto decode_success;
25140 /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
25141 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25142 delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
25143 goto decode_success;
25145 /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
25146 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25147 delta = dis_MOVMSKPS_256( vbi, pfx, delta );
25148 goto decode_success;
25150 break;
25152 case 0x51:
25153       /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
25154 if (haveF3no66noF2(pfx)) {
25155 delta = dis_AVX128_E_V_to_G_lo32_unary(
25156 uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
25157 goto decode_success;
25159 /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
25160 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25161 delta = dis_AVX128_E_to_G_unary_all(
25162 uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
25163 goto decode_success;
25165 /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
25166 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25167 delta = dis_AVX256_E_to_G_unary_all(
25168 uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
25169 goto decode_success;
25171 /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
25172 if (haveF2no66noF3(pfx)) {
25173 delta = dis_AVX128_E_V_to_G_lo64_unary(
25174 uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
25175 goto decode_success;
25177 /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
25178 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25179 delta = dis_AVX128_E_to_G_unary_all(
25180 uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
25181 goto decode_success;
25183 /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
25184 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25185 delta = dis_AVX256_E_to_G_unary_all(
25186 uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
25187 goto decode_success;
25189 break;
25191 case 0x52:
25192       /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
25193 if (haveF3no66noF2(pfx)) {
25194 delta = dis_AVX128_E_V_to_G_lo32_unary(
25195 uses_vvvv, vbi, pfx, delta, "vrsqrtss",
25196 Iop_RSqrtEst32F0x4 );
25197 goto decode_success;
25199 /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
25200 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25201 delta = dis_AVX128_E_to_G_unary_all(
25202 uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx4 );
25203 goto decode_success;
25205 /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
25206 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25207 delta = dis_AVX256_E_to_G_unary_all(
25208 uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx8 );
25209 goto decode_success;
25211 break;
25213 case 0x53:
25214       /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
25215 if (haveF3no66noF2(pfx)) {
25216 delta = dis_AVX128_E_V_to_G_lo32_unary(
25217 uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_RecipEst32F0x4 );
25218 goto decode_success;
25220 /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
25221 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25222 delta = dis_AVX128_E_to_G_unary_all(
25223 uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx4 );
25224 goto decode_success;
25226 /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
25227 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25228 delta = dis_AVX256_E_to_G_unary_all(
25229 uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx8 );
25230 goto decode_success;
25232 break;
25234 case 0x54:
25235 /* VANDPD r/m, rV, r ::: r = rV & r/m */
25236 /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
25237 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25238 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25239 uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
25240 goto decode_success;
25242 /* VANDPD r/m, rV, r ::: r = rV & r/m */
25243 /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
25244 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25245 delta = dis_AVX256_E_V_to_G(
25246 uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
25247 goto decode_success;
25249 /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
25250 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25251 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25252 uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
25253 goto decode_success;
25255 /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
25256 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25257 delta = dis_AVX256_E_V_to_G(
25258 uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
25259 goto decode_success;
25261 break;
25263 case 0x55:
25264 /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
25265 /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
25266 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25267 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25268                    uses_vvvv, vbi, pfx, delta, "vandnpd", Iop_AndV128,
25269 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25270 goto decode_success;
25272 /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
25273 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25274 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
25275                    uses_vvvv, vbi, pfx, delta, "vandnpd", Iop_AndV256,
25276 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25277 goto decode_success;
25279 /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
25280 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25281 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25282                    uses_vvvv, vbi, pfx, delta, "vandnps", Iop_AndV128,
25283 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25284 goto decode_success;
25286 /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
25287 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25288 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
25289                    uses_vvvv, vbi, pfx, delta, "vandnps", Iop_AndV256,
25290 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25291 goto decode_success;
25293 break;
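      /* The VANDN cases above reuse the generic 3-operand helper with
         invertLeftArg=True, so the IR produced is in effect (256-bit
         case shown as a sketch)

            binop(Iop_AndV256, unop(Iop_NotV256, getYMMReg(rV)), E)

         which matches the architectural definition dst = ~src1 & src2. */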
25295 case 0x56:
25296 /* VORPD r/m, rV, r ::: r = rV | r/m */
25297 /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
25298 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25299 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25300 uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
25301 goto decode_success;
25303 /* VORPD r/m, rV, r ::: r = rV | r/m */
25304 /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
25305 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25306 delta = dis_AVX256_E_V_to_G(
25307 uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
25308 goto decode_success;
25310 /* VORPS r/m, rV, r ::: r = rV | r/m */
25311 /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
25312 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25313 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25314 uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
25315 goto decode_success;
25317 /* VORPS r/m, rV, r ::: r = rV | r/m */
25318 /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
25319 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25320 delta = dis_AVX256_E_V_to_G(
25321 uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
25322 goto decode_success;
25324 break;
25326 case 0x57:
25327 /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
25328 /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
25329 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25330 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25331 uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
25332 goto decode_success;
25334 /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
25335 /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
25336 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25337 delta = dis_AVX256_E_V_to_G(
25338 uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
25339 goto decode_success;
25341 /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
25342 /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
25343 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25344 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25345 uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
25346 goto decode_success;
25348 /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
25349 /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
25350 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25351 delta = dis_AVX256_E_V_to_G(
25352 uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
25353 goto decode_success;
25355 break;
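      /* Usage note: compilers commonly zero a vector register with
         "vxorps %xmm0,%xmm0,%xmm0" (all three operands the same
         register); that form is handled by the 128-bit path above and,
         per the VEX rules, also clears bits 255:128 of the destination. */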
25357 case 0x58:
25358 /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
25359 if (haveF2no66noF3(pfx)) {
25360 delta = dis_AVX128_E_V_to_G_lo64(
25361 uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
25362 goto decode_success;
25364 /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
25365 if (haveF3no66noF2(pfx)) {
25366 delta = dis_AVX128_E_V_to_G_lo32(
25367 uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
25368 goto decode_success;
25370 /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
25371 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25372 delta = dis_AVX128_E_V_to_G(
25373 uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
25374 goto decode_success;
25376 /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
25377 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25378 delta = dis_AVX256_E_V_to_G(
25379 uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
25380 goto decode_success;
25382 /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
25383 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25384 delta = dis_AVX128_E_V_to_G(
25385 uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
25386 goto decode_success;
25388 /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
25389 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25390 delta = dis_AVX256_E_V_to_G(
25391 uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
25392 goto decode_success;
25394 break;
25396 case 0x59:
25397 /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
25398 if (haveF2no66noF3(pfx)) {
25399 delta = dis_AVX128_E_V_to_G_lo64(
25400 uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
25401 goto decode_success;
25403 /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
25404 if (haveF3no66noF2(pfx)) {
25405 delta = dis_AVX128_E_V_to_G_lo32(
25406 uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
25407 goto decode_success;
25409 /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
25410 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25411 delta = dis_AVX128_E_V_to_G(
25412 uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
25413 goto decode_success;
25415 /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
25416 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25417 delta = dis_AVX256_E_V_to_G(
25418 uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
25419 goto decode_success;
25421 /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
25422 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25423 delta = dis_AVX128_E_V_to_G(
25424 uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
25425 goto decode_success;
25427 /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
25428 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25429 delta = dis_AVX256_E_V_to_G(
25430 uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
25431 goto decode_success;
25433 break;
25435 case 0x5A:
25436 /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
25437 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25438 delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
25439 goto decode_success;
25441 /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
25442 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25443 delta = dis_CVTPS2PD_256( vbi, pfx, delta );
25444 goto decode_success;
25446 /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
25447 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25448 delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
25449 goto decode_success;
25451 /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
25452 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25453 delta = dis_CVTPD2PS_256( vbi, pfx, delta );
25454 goto decode_success;
25456 /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
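/* The conversion below is a merge, not a full-width op: lane 32[0]
   of the dest gets F64->F32 of the low 64 bits of the E operand,
   rounded per the current MXCSR rounding mode; lanes 32[1..3] are
   copied from the vvvv register, and bits 255:128 are zeroed. */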
25457 if (haveF2no66noF3(pfx)) {
25458 UChar modrm = getUChar(delta);
25459 UInt rV = getVexNvvvv(pfx);
25460 UInt rD = gregOfRexRM(pfx, modrm);
25461 IRTemp f64lo = newTemp(Ity_F64);
25462 IRTemp rmode = newTemp(Ity_I32);
25463 assign( rmode, get_sse_roundingmode() );
25464 if (epartIsReg(modrm)) {
25465 UInt rS = eregOfRexRM(pfx,modrm);
25466 assign(f64lo, getXMMRegLane64F(rS, 0));
25467 delta += 1;
25468 DIP("vcvtsd2ss %s,%s,%s\n",
25469 nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
25470 } else {
25471 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25472 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
25473 delta += alen;
25474 DIP("vcvtsd2ss %s,%s,%s\n",
25475 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
25477 putXMMRegLane32F( rD, 0,
25478 binop( Iop_F64toF32, mkexpr(rmode),
25479 mkexpr(f64lo)) );
25480 putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
25481 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
25482 putYMMRegLane128( rD, 1, mkV128(0) );
25483 *uses_vvvv = True;
25484 goto decode_success;
25486 /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
25487 if (haveF3no66noF2(pfx)) {
25488 UChar modrm = getUChar(delta);
25489 UInt rV = getVexNvvvv(pfx);
25490 UInt rD = gregOfRexRM(pfx, modrm);
25491 IRTemp f32lo = newTemp(Ity_F32);
25492 if (epartIsReg(modrm)) {
25493 UInt rS = eregOfRexRM(pfx,modrm);
25494 assign(f32lo, getXMMRegLane32F(rS, 0));
25495 delta += 1;
25496 DIP("vcvtss2sd %s,%s,%s\n",
25497 nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
25498 } else {
25499 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25500 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
25501 delta += alen;
25502 DIP("vcvtss2sd %s,%s,%s\n",
25503 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
25505 putXMMRegLane64F( rD, 0,
25506 unop( Iop_F32toF64, mkexpr(f32lo)) );
25507 putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
25508 putYMMRegLane128( rD, 1, mkV128(0) );
25509 *uses_vvvv = True;
25510 goto decode_success;
25512 break;
25514 case 0x5B:
25515 /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
25516 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25517 delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
25518 True/*isAvx*/, False/*!r2zero*/ );
25519 goto decode_success;
25521 /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
25522 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25523 delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
25524 False/*!r2zero*/ );
25525 goto decode_success;
25527 /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
25528 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
25529 delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
25530 True/*isAvx*/, True/*r2zero*/ );
25531 goto decode_success;
25533 /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
25534 if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
25535 delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
25536 True/*r2zero*/ );
25537 goto decode_success;
25539 /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
25540 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25541 delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
25542 goto decode_success;
25544 /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
25545 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25546 delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
25547 goto decode_success;
25549 break;
25551 case 0x5C:
25552 /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
25553 if (haveF2no66noF3(pfx)) {
25554 delta = dis_AVX128_E_V_to_G_lo64(
25555 uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
25556 goto decode_success;
25558 /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
25559 if (haveF3no66noF2(pfx)) {
25560 delta = dis_AVX128_E_V_to_G_lo32(
25561 uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
25562 goto decode_success;
25564 /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
25565 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25566 delta = dis_AVX128_E_V_to_G(
25567 uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
25568 goto decode_success;
25570 /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
25571 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25572 delta = dis_AVX256_E_V_to_G(
25573 uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
25574 goto decode_success;
25576 /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
25577 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25578 delta = dis_AVX128_E_V_to_G(
25579 uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
25580 goto decode_success;
25582 /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
25583 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25584 delta = dis_AVX256_E_V_to_G(
25585 uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
25586 goto decode_success;
25588 break;
25590 case 0x5D:
25591 /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
25592 if (haveF2no66noF3(pfx)) {
25593 delta = dis_AVX128_E_V_to_G_lo64(
25594 uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
25595 goto decode_success;
25597 /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
25598 if (haveF3no66noF2(pfx)) {
25599 delta = dis_AVX128_E_V_to_G_lo32(
25600 uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
25601 goto decode_success;
25603 /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
25604 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25605 delta = dis_AVX128_E_V_to_G(
25606 uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
25607 goto decode_success;
25609 /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
25610 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25611 delta = dis_AVX256_E_V_to_G(
25612 uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
25613 goto decode_success;
25615 /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
25616 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25617 delta = dis_AVX128_E_V_to_G(
25618 uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
25619 goto decode_success;
25621 /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
25622 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25623 delta = dis_AVX256_E_V_to_G(
25624 uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
25625 goto decode_success;
25627 break;
25629 case 0x5E:
25630 /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
25631 if (haveF2no66noF3(pfx)) {
25632 delta = dis_AVX128_E_V_to_G_lo64(
25633 uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
25634 goto decode_success;
25636 /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
25637 if (haveF3no66noF2(pfx)) {
25638 delta = dis_AVX128_E_V_to_G_lo32(
25639 uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
25640 goto decode_success;
25642 /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
25643 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25644 delta = dis_AVX128_E_V_to_G(
25645 uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
25646 goto decode_success;
25648 /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
25649 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25650 delta = dis_AVX256_E_V_to_G(
25651 uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
25652 goto decode_success;
25654 /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
25655 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25656 delta = dis_AVX128_E_V_to_G(
25657 uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
25658 goto decode_success;
25660 /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
25661 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25662 delta = dis_AVX256_E_V_to_G(
25663 uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
25664 goto decode_success;
25666 break;
25668 case 0x5F:
25669 /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
25670 if (haveF2no66noF3(pfx)) {
25671 delta = dis_AVX128_E_V_to_G_lo64(
25672 uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
25673 goto decode_success;
25675 /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
25676 if (haveF3no66noF2(pfx)) {
25677 delta = dis_AVX128_E_V_to_G_lo32(
25678 uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
25679 goto decode_success;
25681 /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
25682 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25683 delta = dis_AVX128_E_V_to_G(
25684 uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
25685 goto decode_success;
25687 /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
25688 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25689 delta = dis_AVX256_E_V_to_G(
25690 uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
25691 goto decode_success;
25693 /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
25694 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25695 delta = dis_AVX128_E_V_to_G(
25696 uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
25697 goto decode_success;
25699 /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
25700 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25701 delta = dis_AVX256_E_V_to_G(
25702 uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
25703 goto decode_success;
25705 break;
25707 case 0x60:
25708 /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
25709 /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
25710 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25711 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25712 uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
25713 Iop_InterleaveLO8x16, NULL,
25714 False/*!invertLeftArg*/, True/*swapArgs*/ );
25715 goto decode_success;
25717 /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
25718 /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
25719 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25720 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25721 uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
25722 math_VPUNPCKLBW_YMM );
25723 goto decode_success;
25725 break;
25727 case 0x61:
25728 /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
25729 /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
25730 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25731 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25732 uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
25733 Iop_InterleaveLO16x8, NULL,
25734 False/*!invertLeftArg*/, True/*swapArgs*/ );
25735 goto decode_success;
25737 /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
25738 /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
25739 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25740 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25741 uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
25742 math_VPUNPCKLWD_YMM );
25743 goto decode_success;
25745 break;
25747 case 0x62:
25748 /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
25749 /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
25750 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25751 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25752 uses_vvvv, vbi, pfx, delta, "vpunpckldq",
25753 Iop_InterleaveLO32x4, NULL,
25754 False/*!invertLeftArg*/, True/*swapArgs*/ );
25755 goto decode_success;
25757 /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
25758 /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
25759 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25760 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25761 uses_vvvv, vbi, pfx, delta, "vpunpckldq",
25762 math_VPUNPCKLDQ_YMM );
25763 goto decode_success;
25765 break;
25767 case 0x63:
25768 /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
25769 /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
25770 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25771 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25772 uses_vvvv, vbi, pfx, delta, "vpacksswb",
25773 Iop_QNarrowBin16Sto8Sx16, NULL,
25774 False/*!invertLeftArg*/, True/*swapArgs*/ );
25775 goto decode_success;
25777 /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
25778 /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
25779 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25780 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25781 uses_vvvv, vbi, pfx, delta, "vpacksswb",
25782 math_VPACKSSWB_YMM );
25783 goto decode_success;
25785 break;
25787 case 0x64:
25788 /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
25789 /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
25790 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25791 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25792 uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
25793 goto decode_success;
25795 /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
25796 /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
25797 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25798 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25799 uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
25800 goto decode_success;
25802 break;
25804 case 0x65:
25805 /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
25806 /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
25807 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25808 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25809 uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
25810 goto decode_success;
25812 /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
25813 /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
25814 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25815 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25816 uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
25817 goto decode_success;
25819 break;
25821 case 0x66:
25822 /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
25823 /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
25824 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25825 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25826 uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
25827 goto decode_success;
25829 /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
25830 /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
25831 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25832 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25833 uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
25834 goto decode_success;
25836 break;
25838 case 0x67:
25839 /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
25840 /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
25841 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25842 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25843 uses_vvvv, vbi, pfx, delta, "vpackuswb",
25844 Iop_QNarrowBin16Sto8Ux16, NULL,
25845 False/*!invertLeftArg*/, True/*swapArgs*/ );
25846 goto decode_success;
25848 /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
25849 /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
25850 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25851 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25852 uses_vvvv, vbi, pfx, delta, "vpackuswb",
25853 math_VPACKUSWB_YMM );
25854 goto decode_success;
25856 break;
25858 case 0x68:
25859 /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
25860 /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
25861 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25862 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25863 uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
25864 Iop_InterleaveHI8x16, NULL,
25865 False/*!invertLeftArg*/, True/*swapArgs*/ );
25866 goto decode_success;
25868 /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
25869 /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
25870 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25871 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25872 uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
25873 math_VPUNPCKHBW_YMM );
25874 goto decode_success;
25876 break;
25878 case 0x69:
25879 /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
25880 /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
25881 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25882 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25883 uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
25884 Iop_InterleaveHI16x8, NULL,
25885 False/*!invertLeftArg*/, True/*swapArgs*/ );
25886 goto decode_success;
25888 /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
25889 /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
25890 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25891 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25892 uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
25893 math_VPUNPCKHWD_YMM );
25894 goto decode_success;
25896 break;
25898 case 0x6A:
25899 /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
25900 /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
25901 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25902 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25903 uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
25904 Iop_InterleaveHI32x4, NULL,
25905 False/*!invertLeftArg*/, True/*swapArgs*/ );
25906 goto decode_success;
25908 /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
25909 /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
25910 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25911 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25912 uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
25913 math_VPUNPCKHDQ_YMM );
25914 goto decode_success;
25916 break;
25918 case 0x6B:
25919 /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
25920 /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
25921 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25922 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25923 uses_vvvv, vbi, pfx, delta, "vpackssdw",
25924 Iop_QNarrowBin32Sto16Sx8, NULL,
25925 False/*!invertLeftArg*/, True/*swapArgs*/ );
25926 goto decode_success;
25928 /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
25929 /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
25930 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25931 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25932 uses_vvvv, vbi, pfx, delta, "vpackssdw",
25933 math_VPACKSSDW_YMM );
25934 goto decode_success;
25936 break;
25938 case 0x6C:
25939 /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
25940 /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
25941 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25942 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25943 uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
25944 Iop_InterleaveLO64x2, NULL,
25945 False/*!invertLeftArg*/, True/*swapArgs*/ );
25946 goto decode_success;
25948 /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
25949 /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
25950 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25951 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25952 uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
25953 math_VPUNPCKLQDQ_YMM );
25954 goto decode_success;
25956 break;
25958 case 0x6D:
25959 /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
25960 /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
25961 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25962 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25963 uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
25964 Iop_InterleaveHI64x2, NULL,
25965 False/*!invertLeftArg*/, True/*swapArgs*/ );
25966 goto decode_success;
25968 /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
25969 /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
25970 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25971 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25972 uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
25973 math_VPUNPCKHQDQ_YMM );
25974 goto decode_success;
25976 break;
25978 case 0x6E:
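/* For both 6E forms below, the mandatory 66 prefix makes the
   effective operand size (sz) come out as 2, even though the actual
   transfer is 4 bytes (W0) or 8 bytes (W1) -- hence the odd-looking
   vasserts. */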
25979 /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
25980 if (have66noF2noF3(pfx)
25981 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25982 vassert(sz == 2); /* even tho we are transferring 4, not 2. */
25983 UChar modrm = getUChar(delta);
25984 if (epartIsReg(modrm)) {
25985 delta += 1;
25986 putYMMRegLoAndZU(
25987 gregOfRexRM(pfx,modrm),
25988 unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
25990 DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
25991 nameXMMReg(gregOfRexRM(pfx,modrm)));
25992 } else {
25993 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25994 delta += alen;
25995 putYMMRegLoAndZU(
25996 gregOfRexRM(pfx,modrm),
25997 unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
25999 DIP("vmovd %s, %s\n", dis_buf,
26000 nameXMMReg(gregOfRexRM(pfx,modrm)));
26002 goto decode_success;
26004 /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
26005 if (have66noF2noF3(pfx)
26006 && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
26007 vassert(sz == 2); /* even tho we are transferring 8, not 2. */
26008 UChar modrm = getUChar(delta);
26009 if (epartIsReg(modrm)) {
26010 delta += 1;
26011 putYMMRegLoAndZU(
26012 gregOfRexRM(pfx,modrm),
26013 unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
26015 DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
26016 nameXMMReg(gregOfRexRM(pfx,modrm)));
26017 } else {
26018 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26019 delta += alen;
26020 putYMMRegLoAndZU(
26021 gregOfRexRM(pfx,modrm),
26022 unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
26024 DIP("vmovq %s, %s\n", dis_buf,
26025 nameXMMReg(gregOfRexRM(pfx,modrm)));
26027 goto decode_success;
26029 break;
26031 case 0x6F:
26032 /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
26033 /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
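/* Both 6F forms are moves into the destination register; the only
   difference is that the 'a' (aligned) variant plants an alignment
   check on the memory form -- gen_SIGNAL_if_not_32_aligned here,
   gen_SIGNAL_if_not_16_aligned for the 128-bit case below --
   whereas the 'u' variant accepts any address. */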
26034 if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26035 && 1==getVexL(pfx)/*256*/) {
26036 UChar modrm = getUChar(delta);
26037 UInt rD = gregOfRexRM(pfx, modrm);
26038 IRTemp tD = newTemp(Ity_V256);
26039 Bool isA = have66noF2noF3(pfx);
26040 HChar ch = isA ? 'a' : 'u';
26041 if (epartIsReg(modrm)) {
26042 UInt rS = eregOfRexRM(pfx, modrm);
26043 delta += 1;
26044 assign(tD, getYMMReg(rS));
26045 DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
26046 } else {
26047 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26048 delta += alen;
26049 if (isA)
26050 gen_SIGNAL_if_not_32_aligned(vbi, addr);
26051 assign(tD, loadLE(Ity_V256, mkexpr(addr)));
26052 DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
26054 putYMMReg(rD, mkexpr(tD));
26055 goto decode_success;
26057 /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
26058 /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
26059 if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26060 && 0==getVexL(pfx)/*128*/) {
26061 UChar modrm = getUChar(delta);
26062 UInt rD = gregOfRexRM(pfx, modrm);
26063 IRTemp tD = newTemp(Ity_V128);
26064 Bool isA = have66noF2noF3(pfx);
26065 HChar ch = isA ? 'a' : 'u';
26066 if (epartIsReg(modrm)) {
26067 UInt rS = eregOfRexRM(pfx, modrm);
26068 delta += 1;
26069 assign(tD, getXMMReg(rS));
26070 DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
26071 } else {
26072 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26073 delta += alen;
26074 if (isA)
26075 gen_SIGNAL_if_not_16_aligned(vbi, addr);
26076 assign(tD, loadLE(Ity_V128, mkexpr(addr)));
26077 DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
26079 putYMMRegLoAndZU(rD, mkexpr(tD));
26080 goto decode_success;
26082 break;
26084 case 0x70:
26085 /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
26086 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26087 delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
26088 goto decode_success;
26090 /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
26091 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26092 delta = dis_PSHUFD_32x8( vbi, pfx, delta);
26093 goto decode_success;
26095 /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
26096 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26097 delta = dis_PSHUFxW_128( vbi, pfx, delta,
26098 True/*isAvx*/, False/*!xIsH*/ );
26099 goto decode_success;
26101 /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
26102 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26103 delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
26104 goto decode_success;
26106 /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
26107 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
26108 delta = dis_PSHUFxW_128( vbi, pfx, delta,
26109 True/*isAvx*/, True/*xIsH*/ );
26110 goto decode_success;
26112 /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
26113 if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
26114 delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
26115 goto decode_success;
26117 break;
26119 case 0x71:
26120 /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
26121 /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
26122 /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
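/* For these immediate-form shifts the reg field of the modrm byte
   acts as an opcode extension: /2 selects shift-right-logical, /4
   shift-right-arithmetic and /6 shift-left.  The E operand must be
   a register; anything else falls through as undecoded. */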
26123 if (have66noF2noF3(pfx)
26124 && 0==getVexL(pfx)/*128*/
26125 && epartIsReg(getUChar(delta))) {
26126 if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26127 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26128 "vpsrlw", Iop_ShrN16x8 );
26129 *uses_vvvv = True;
26130 goto decode_success;
26132 if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26133 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26134 "vpsraw", Iop_SarN16x8 );
26135 *uses_vvvv = True;
26136 goto decode_success;
26138 if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26139 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26140 "vpsllw", Iop_ShlN16x8 );
26141 *uses_vvvv = True;
26142 goto decode_success;
26144 /* else fall through */
26146 /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
26147 /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
26148 /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
26149 if (have66noF2noF3(pfx)
26150 && 1==getVexL(pfx)/*256*/
26151 && epartIsReg(getUChar(delta))) {
26152 if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26153 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26154 "vpsrlw", Iop_ShrN16x16 );
26155 *uses_vvvv = True;
26156 goto decode_success;
26158 if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26159 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26160 "vpsraw", Iop_SarN16x16 );
26161 *uses_vvvv = True;
26162 goto decode_success;
26164 if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26165 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26166 "vpsllw", Iop_ShlN16x16 );
26167 *uses_vvvv = True;
26168 goto decode_success;
26170 /* else fall through */
26172 break;
26174 case 0x72:
26175 /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
26176 /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
26177 /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
26178 if (have66noF2noF3(pfx)
26179 && 0==getVexL(pfx)/*128*/
26180 && epartIsReg(getUChar(delta))) {
26181 if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26182 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26183 "vpsrld", Iop_ShrN32x4 );
26184 *uses_vvvv = True;
26185 goto decode_success;
26187 if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26188 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26189 "vpsrad", Iop_SarN32x4 );
26190 *uses_vvvv = True;
26191 goto decode_success;
26193 if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26194 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26195 "vpslld", Iop_ShlN32x4 );
26196 *uses_vvvv = True;
26197 goto decode_success;
26199 /* else fall through */
26201 /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
26202 /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
26203 /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
26204 if (have66noF2noF3(pfx)
26205 && 1==getVexL(pfx)/*256*/
26206 && epartIsReg(getUChar(delta))) {
26207 if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26208 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26209 "vpsrld", Iop_ShrN32x8 );
26210 *uses_vvvv = True;
26211 goto decode_success;
26213 if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26214 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26215 "vpsrad", Iop_SarN32x8 );
26216 *uses_vvvv = True;
26217 goto decode_success;
26219 if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26220 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26221 "vpslld", Iop_ShlN32x8 );
26222 *uses_vvvv = True;
26223 goto decode_success;
26225 /* else fall through */
26227 break;
26229 case 0x73:
26230 /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
26231 /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
26232 /* VPSRLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
26233 /* VPSLLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
26234 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
26235 && epartIsReg(getUChar(delta))) {
26236 Int rS = eregOfRexRM(pfx,getUChar(delta));
26237 Int rD = getVexNvvvv(pfx);
26238 IRTemp vecS = newTemp(Ity_V128);
26239 if (gregLO3ofRM(getUChar(delta)) == 3) {
26240 Int imm = (Int)getUChar(delta+1);
26241 DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
26242 delta += 2;
26243 assign( vecS, getXMMReg(rS) );
26244 putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
26245 *uses_vvvv = True;
26246 goto decode_success;
26248 if (gregLO3ofRM(getUChar(delta)) == 7) {
26249 Int imm = (Int)getUChar(delta+1);
26250 DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
26251 delta += 2;
26252 assign( vecS, getXMMReg(rS) );
26253 putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
26254 *uses_vvvv = True;
26255 goto decode_success;
26257 if (gregLO3ofRM(getUChar(delta)) == 2) {
26258 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26259 "vpsrlq", Iop_ShrN64x2 );
26260 *uses_vvvv = True;
26261 goto decode_success;
26263 if (gregLO3ofRM(getUChar(delta)) == 6) {
26264 delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26265 "vpsllq", Iop_ShlN64x2 );
26266 *uses_vvvv = True;
26267 goto decode_success;
26269 /* else fall through */
26271 /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
26272 /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
26273 /* VPSRLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
26274 /* VPSLLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
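/* The 256-bit byte shifts (VPSRLDQ/VPSLLDQ) operate on each 128-bit
   lane independently, so the code below splits the source into two
   V128 halves and applies math_PSRLDQ / math_PSLLDQ to each half
   separately. */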
26275 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
26276 && epartIsReg(getUChar(delta))) {
26277 Int rS = eregOfRexRM(pfx,getUChar(delta));
26278 Int rD = getVexNvvvv(pfx);
26279 if (gregLO3ofRM(getUChar(delta)) == 3) {
26280 IRTemp vecS0 = newTemp(Ity_V128);
26281 IRTemp vecS1 = newTemp(Ity_V128);
26282 Int imm = (Int)getUChar(delta+1);
26283 DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
26284 delta += 2;
26285 assign( vecS0, getYMMRegLane128(rS, 0));
26286 assign( vecS1, getYMMRegLane128(rS, 1));
26287 putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
26288 putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
26289 *uses_vvvv = True;
26290 goto decode_success;
26292 if (gregLO3ofRM(getUChar(delta)) == 7) {
26293 IRTemp vecS0 = newTemp(Ity_V128);
26294 IRTemp vecS1 = newTemp(Ity_V128);
26295 Int imm = (Int)getUChar(delta+1);
26296 DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
26297 delta += 2;
26298 assign( vecS0, getYMMRegLane128(rS, 0));
26299 assign( vecS1, getYMMRegLane128(rS, 1));
26300 putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
26301 putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
26302 *uses_vvvv = True;
26303 goto decode_success;
26305 if (gregLO3ofRM(getUChar(delta)) == 2) {
26306 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26307 "vpsrlq", Iop_ShrN64x4 );
26308 *uses_vvvv = True;
26309 goto decode_success;
26311 if (gregLO3ofRM(getUChar(delta)) == 6) {
26312 delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26313 "vpsllq", Iop_ShlN64x4 );
26314 *uses_vvvv = True;
26315 goto decode_success;
26317 /* else fall through */
26319 break;
26321 case 0x74:
26322 /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
26323 /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
26324 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26325 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26326 uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
26327 goto decode_success;
26329 /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
26330 /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
26331 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26332 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26333 uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
26334 goto decode_success;
26336 break;
26338 case 0x75:
26339 /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
26340 /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
26341 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26342 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26343 uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
26344 goto decode_success;
26346 /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
26347 /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
26348 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26349 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26350 uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
26351 goto decode_success;
26353 break;
26355 case 0x76:
26356 /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
26357 /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
26358 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26359 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26360 uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
26361 goto decode_success;
26363 /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
26364 /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
26365 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26366 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26367 uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
26368 goto decode_success;
26370 break;
26372 case 0x77:
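/* Both encodings below loop over all 16 YMM registers: VZEROUPPER
   (the 128-bit form) clears just bits 255:128 of each register,
   while VZEROALL (the 256-bit form) clears the registers
   entirely. */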
26373 /* VZEROUPPER = VEX.128.0F.WIG 77 */
26374 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26375 Int i;
26376 IRTemp zero128 = newTemp(Ity_V128);
26377 assign(zero128, mkV128(0));
26378 for (i = 0; i < 16; i++) {
26379 putYMMRegLane128(i, 1, mkexpr(zero128));
26381 DIP("vzeroupper\n");
26382 goto decode_success;
26384 /* VZEROALL = VEX.256.0F.WIG 77 */
26385 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26386 Int i;
26387 IRTemp zero128 = newTemp(Ity_V128);
26388 assign(zero128, mkV128(0));
26389 for (i = 0; i < 16; i++) {
26390 putYMMRegLoAndZU(i, mkexpr(zero128));
26392 DIP("vzeroall\n");
26393 goto decode_success;
26395 break;
26397 case 0x7C:
26398 case 0x7D:
26399 /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
26400 /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
26401 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26402 IRTemp sV = newTemp(Ity_V128);
26403 IRTemp dV = newTemp(Ity_V128);
26404 Bool isAdd = opc == 0x7C;
26405 const HChar* str = isAdd ? "add" : "sub";
26406 UChar modrm = getUChar(delta);
26407 UInt rG = gregOfRexRM(pfx,modrm);
26408 UInt rV = getVexNvvvv(pfx);
26409 if (epartIsReg(modrm)) {
26410 UInt rE = eregOfRexRM(pfx,modrm);
26411 assign( sV, getXMMReg(rE) );
26412 DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
26413 nameXMMReg(rV), nameXMMReg(rG));
26414 delta += 1;
26415 } else {
26416 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26417 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
26418 DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26419 nameXMMReg(rV), nameXMMReg(rG));
26420 delta += alen;
26422 assign( dV, getXMMReg(rV) );
26423 putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
26424 *uses_vvvv = True;
26425 goto decode_success;
26427 /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
26428 /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
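/* The 256-bit horizontal ops don't cross the 128-bit lane boundary,
   so they are handled by splitting both operands into V128 halves
   and reusing the 128-bit helper on each half. */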
26429 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26430 IRTemp sV = newTemp(Ity_V256);
26431 IRTemp dV = newTemp(Ity_V256);
26432 IRTemp s1, s0, d1, d0;
26433 Bool isAdd = opc == 0x7C;
26434 const HChar* str = isAdd ? "add" : "sub";
26435 UChar modrm = getUChar(delta);
26436 UInt rG = gregOfRexRM(pfx,modrm);
26437 UInt rV = getVexNvvvv(pfx);
26438 s1 = s0 = d1 = d0 = IRTemp_INVALID;
26439 if (epartIsReg(modrm)) {
26440 UInt rE = eregOfRexRM(pfx,modrm);
26441 assign( sV, getYMMReg(rE) );
26442 DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
26443 nameYMMReg(rV), nameYMMReg(rG));
26444 delta += 1;
26445 } else {
26446 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26447 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
26448 DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26449 nameYMMReg(rV), nameYMMReg(rG));
26450 delta += alen;
26452 assign( dV, getYMMReg(rV) );
26453 breakupV256toV128s( dV, &d1, &d0 );
26454 breakupV256toV128s( sV, &s1, &s0 );
26455 putYMMReg( rG, binop(Iop_V128HLtoV256,
26456 mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
26457 mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
26458 *uses_vvvv = True;
26459 goto decode_success;
26461 /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
26462 /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
26463 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26464 IRTemp sV = newTemp(Ity_V128);
26465 IRTemp dV = newTemp(Ity_V128);
26466 Bool isAdd = opc == 0x7C;
26467 const HChar* str = isAdd ? "add" : "sub";
26468 UChar modrm = getUChar(delta);
26469 UInt rG = gregOfRexRM(pfx,modrm);
26470 UInt rV = getVexNvvvv(pfx);
26471 if (epartIsReg(modrm)) {
26472 UInt rE = eregOfRexRM(pfx,modrm);
26473 assign( sV, getXMMReg(rE) );
26474 DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
26475 nameXMMReg(rV), nameXMMReg(rG));
26476 delta += 1;
26477 } else {
26478 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26479 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
26480 DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26481 nameXMMReg(rV), nameXMMReg(rG));
26482 delta += alen;
26484 assign( dV, getXMMReg(rV) );
26485 putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
26486 *uses_vvvv = True;
26487 goto decode_success;
26489 /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
26490 /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
26491 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26492 IRTemp sV = newTemp(Ity_V256);
26493 IRTemp dV = newTemp(Ity_V256);
26494 IRTemp s1, s0, d1, d0;
26495 Bool isAdd = opc == 0x7C;
26496 const HChar* str = isAdd ? "add" : "sub";
26497 UChar modrm = getUChar(delta);
26498 UInt rG = gregOfRexRM(pfx,modrm);
26499 UInt rV = getVexNvvvv(pfx);
26500 s1 = s0 = d1 = d0 = IRTemp_INVALID;
26501 if (epartIsReg(modrm)) {
26502 UInt rE = eregOfRexRM(pfx,modrm);
26503 assign( sV, getYMMReg(rE) );
26504 DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
26505 nameYMMReg(rV), nameYMMReg(rG));
26506 delta += 1;
26507 } else {
26508 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26509 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
26510 DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26511 nameYMMReg(rV), nameYMMReg(rG));
26512 delta += alen;
26514 assign( dV, getYMMReg(rV) );
26515 breakupV256toV128s( dV, &d1, &d0 );
26516 breakupV256toV128s( sV, &s1, &s0 );
26517 putYMMReg( rG, binop(Iop_V128HLtoV256,
26518 mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
26519 mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
26520 *uses_vvvv = True;
26521 goto decode_success;
26523 break;
26525 case 0x7E:
26526 /* Note the Intel docs don't make sense for this. I think they
26527 are wrong. They seem to imply it is a store when in fact I
26528 think it is a load. Also it's unclear whether this is W0, W1
26529 or WIG. */
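/* Treated as a load, then: vmovq xmm1, xmm2/m64 copies 64 bits into
   the low lane of xmm1 and zeroes bits 255:64, which is what the
   code below implements. */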
26530 /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
26531 if (haveF3no66noF2(pfx)
26532 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26533 vassert(sz == 4); /* even tho we are transferring 8, not 4. */
26534 UChar modrm = getUChar(delta);
26535 UInt rG = gregOfRexRM(pfx,modrm);
26536 if (epartIsReg(modrm)) {
26537 UInt rE = eregOfRexRM(pfx,modrm);
26538 putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
26539 DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
26540 delta += 1;
26541 } else {
26542 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26543 putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
26544 DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
26545 delta += alen;
26547 /* zero bits 255:64 */
26548 putXMMRegLane64( rG, 1, mkU64(0) );
26549 putYMMRegLane128( rG, 1, mkV128(0) );
26550 goto decode_success;
26552 /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
26553 /* Moves from G to E, so is a store-form insn */
26554 /* Intel docs list this in the VMOVD entry for some reason. */
26555 if (have66noF2noF3(pfx)
26556 && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
26557 UChar modrm = getUChar(delta);
26558 UInt rG = gregOfRexRM(pfx,modrm);
26559 if (epartIsReg(modrm)) {
26560 UInt rE = eregOfRexRM(pfx,modrm);
26561 DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
26562 putIReg64(rE, getXMMRegLane64(rG, 0));
26563 delta += 1;
26564 } else {
26565 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26566 storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
26567 DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
26568 delta += alen;
26570 goto decode_success;
26572 /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
26573 /* Moves from G to E, so is a store-form insn */
26574 if (have66noF2noF3(pfx)
26575 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26576 UChar modrm = getUChar(delta);
26577 UInt rG = gregOfRexRM(pfx,modrm);
26578 if (epartIsReg(modrm)) {
26579 UInt rE = eregOfRexRM(pfx,modrm);
26580 DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
26581 putIReg32(rE, getXMMRegLane32(rG, 0));
26582 delta += 1;
26583 } else {
26584 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26585 storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
26586 DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
26587 delta += alen;
26589 goto decode_success;
26591 break;
26593 case 0x7F:
26594 /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
26595 /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
26596 if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26597 && 1==getVexL(pfx)/*256*/) {
26598 UChar modrm = getUChar(delta);
26599 UInt rS = gregOfRexRM(pfx, modrm);
26600 IRTemp tS = newTemp(Ity_V256);
26601 Bool isA = have66noF2noF3(pfx);
26602 HChar ch = isA ? 'a' : 'u';
26603 assign(tS, getYMMReg(rS));
26604 if (epartIsReg(modrm)) {
26605 UInt rD = eregOfRexRM(pfx, modrm);
26606 delta += 1;
26607 putYMMReg(rD, mkexpr(tS));
26608 DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
26609 } else {
26610 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26611 delta += alen;
26612 if (isA)
26613 gen_SIGNAL_if_not_32_aligned(vbi, addr);
26614 storeLE(mkexpr(addr), mkexpr(tS));
26615 DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
26617 goto decode_success;
26619 /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
26620 /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
26621 if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26622 && 0==getVexL(pfx)/*128*/) {
26623 UChar modrm = getUChar(delta);
26624 UInt rS = gregOfRexRM(pfx, modrm);
26625 IRTemp tS = newTemp(Ity_V128);
26626 Bool isA = have66noF2noF3(pfx);
26627 HChar ch = isA ? 'a' : 'u';
26628 assign(tS, getXMMReg(rS));
26629 if (epartIsReg(modrm)) {
26630 UInt rD = eregOfRexRM(pfx, modrm);
26631 delta += 1;
26632 putYMMRegLoAndZU(rD, mkexpr(tS));
26633 DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
26634 } else {
26635 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26636 delta += alen;
26637 if (isA)
26638 gen_SIGNAL_if_not_16_aligned(vbi, addr);
26639 storeLE(mkexpr(addr), mkexpr(tS));
26640 DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
26642 goto decode_success;
26644 break;
26646 case 0xAE:
26647 /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
26648 if (haveNo66noF2noF3(pfx)
26649 && 0==getVexL(pfx)/*LZ*/
26650 && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
26651 && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
26652 && sz == 4) {
26653 delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
26654 goto decode_success;
26656 /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
26657 if (haveNo66noF2noF3(pfx)
26658 && 0==getVexL(pfx)/*LZ*/
26659 && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
26660 && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
26661 && sz == 4) {
26662 delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
26663 goto decode_success;
26665 break;
26667 case 0xC2:
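/* All the VCMP arms below share the same pattern: the cmp helper
   returns the original delta when it can't decode the insn (e.g. an
   unhandled immediate predicate), so success is detected by checking
   that delta actually advanced. */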
26668 /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
26669 /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
26670 if (haveF2no66noF3(pfx)) {
26671 Long delta0 = delta;
26672 delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26673 "vcmpsd", False/*!all_lanes*/,
26674 8/*sz*/);
26675 if (delta > delta0) goto decode_success;
26676 /* else fall through -- decoding has failed */
26678 /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
26679 /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
26680 if (haveF3no66noF2(pfx)) {
26681 Long delta0 = delta;
26682 delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26683 "vcmpss", False/*!all_lanes*/,
26684 4/*sz*/);
26685 if (delta > delta0) goto decode_success;
26686 /* else fall through -- decoding has failed */
26688 /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
26689 /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
26690 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26691 Long delta0 = delta;
26692 delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26693 "vcmppd", True/*all_lanes*/,
26694 8/*sz*/);
26695 if (delta > delta0) goto decode_success;
26696 /* else fall through -- decoding has failed */
26698 /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
26699 /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
26700 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26701 Long delta0 = delta;
26702 delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26703 "vcmppd", 8/*sz*/);
26704 if (delta > delta0) goto decode_success;
26705 /* else fall through -- decoding has failed */
26707 /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
26708 /* = VEX.NDS.128.0F.WIG C2 /r ib */
26709 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26710 Long delta0 = delta;
26711 delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26712 "vcmpps", True/*all_lanes*/,
26713 4/*sz*/);
26714 if (delta > delta0) goto decode_success;
26715 /* else fall through -- decoding has failed */
26717 /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
26718 /* = VEX.NDS.256.0F.WIG C2 /r ib */
26719 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26720 Long delta0 = delta;
26721 delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26722 "vcmpps", 4/*sz*/);
26723 if (delta > delta0) goto decode_success;
26724 /* else fall through -- decoding has failed */
26726 break;
26728 case 0xC4:
26729 /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
26730 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26731 UChar modrm = getUChar(delta);
26732 UInt rG = gregOfRexRM(pfx, modrm);
26733 UInt rV = getVexNvvvv(pfx);
26734 Int imm8;
26735 IRTemp new16 = newTemp(Ity_I16);
26737 if ( epartIsReg( modrm ) ) {
26738 imm8 = (Int)(getUChar(delta+1) & 7);
26739 assign( new16, unop(Iop_32to16,
26740 getIReg32(eregOfRexRM(pfx,modrm))) );
26741 delta += 1+1;
26742 DIP( "vpinsrw $%d,%s,%s\n", imm8,
26743 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
26744 } else {
26745 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26746 imm8 = (Int)(getUChar(delta+alen) & 7);
26747 assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
26748 delta += alen+1;
26749 DIP( "vpinsrw $%d,%s,%s\n",
26750 imm8, dis_buf, nameXMMReg(rG) );
26753 IRTemp src_vec = newTemp(Ity_V128);
26754 assign(src_vec, getXMMReg( rV ));
26755 IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
26756 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26757 *uses_vvvv = True;
26758 goto decode_success;
26760 break;
26762 case 0xC5:
26763 /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
26764 if (have66noF2noF3(pfx)
26765 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26766 Long delta0 = delta;
26767 delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
26768 True/*isAvx*/ );
26769 if (delta > delta0) goto decode_success;
26770 /* else fall through -- decoding has failed */
26772 break;
26774 case 0xC6:
26775 /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
26776 /* = VEX.NDS.128.0F.WIG C6 /r ib */
26777 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26778 Int imm8 = 0;
26779 IRTemp eV = newTemp(Ity_V128);
26780 IRTemp vV = newTemp(Ity_V128);
26781 UInt modrm = getUChar(delta);
26782 UInt rG = gregOfRexRM(pfx,modrm);
26783 UInt rV = getVexNvvvv(pfx);
26784 assign( vV, getXMMReg(rV) );
26785 if (epartIsReg(modrm)) {
26786 UInt rE = eregOfRexRM(pfx,modrm);
26787 assign( eV, getXMMReg(rE) );
26788 imm8 = (Int)getUChar(delta+1);
26789 delta += 1+1;
26790 DIP("vshufps $%d,%s,%s,%s\n",
26791 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
26792 } else {
26793 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26794 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
26795 imm8 = (Int)getUChar(delta+alen);
26796 delta += 1+alen;
26797 DIP("vshufps $%d,%s,%s,%s\n",
26798 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
26800 IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
26801 putYMMRegLoAndZU( rG, mkexpr(res) );
26802 *uses_vvvv = True;
26803 goto decode_success;
26805 /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
26806 /* = VEX.NDS.256.0F.WIG C6 /r ib */
26807 if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26808 Int imm8 = 0;
26809 IRTemp eV = newTemp(Ity_V256);
26810 IRTemp vV = newTemp(Ity_V256);
26811 UInt modrm = getUChar(delta);
26812 UInt rG = gregOfRexRM(pfx,modrm);
26813 UInt rV = getVexNvvvv(pfx);
26814 assign( vV, getYMMReg(rV) );
26815 if (epartIsReg(modrm)) {
26816 UInt rE = eregOfRexRM(pfx,modrm);
26817 assign( eV, getYMMReg(rE) );
26818 imm8 = (Int)getUChar(delta+1);
26819 delta += 1+1;
26820 DIP("vshufps $%d,%s,%s,%s\n",
26821 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
26822 } else {
26823 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26824 assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
26825 imm8 = (Int)getUChar(delta+alen);
26826 delta += 1+alen;
26827 DIP("vshufps $%d,%s,%s,%s\n",
26828 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
26830 IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
26831 putYMMReg( rG, mkexpr(res) );
26832 *uses_vvvv = True;
26833 goto decode_success;
26835 /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
26836 /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
26837 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26838 Int imm8 = 0;
26839 IRTemp eV = newTemp(Ity_V128);
26840 IRTemp vV = newTemp(Ity_V128);
26841 UInt modrm = getUChar(delta);
26842 UInt rG = gregOfRexRM(pfx,modrm);
26843 UInt rV = getVexNvvvv(pfx);
26844 assign( vV, getXMMReg(rV) );
26845 if (epartIsReg(modrm)) {
26846 UInt rE = eregOfRexRM(pfx,modrm);
26847 assign( eV, getXMMReg(rE) );
26848 imm8 = (Int)getUChar(delta+1);
26849 delta += 1+1;
26850 DIP("vshufpd $%d,%s,%s,%s\n",
26851 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
26852 } else {
26853 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26854 assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
26855 imm8 = (Int)getUChar(delta+alen);
26856 delta += 1+alen;
26857 DIP("vshufpd $%d,%s,%s,%s\n",
26858 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
26860 IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
26861 putYMMRegLoAndZU( rG, mkexpr(res) );
26862 *uses_vvvv = True;
26863 goto decode_success;
26865 /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
26866 /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
26867 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26868 Int imm8 = 0;
26869 IRTemp eV = newTemp(Ity_V256);
26870 IRTemp vV = newTemp(Ity_V256);
26871 UInt modrm = getUChar(delta);
26872 UInt rG = gregOfRexRM(pfx,modrm);
26873 UInt rV = getVexNvvvv(pfx);
26874 assign( vV, getYMMReg(rV) );
26875 if (epartIsReg(modrm)) {
26876 UInt rE = eregOfRexRM(pfx,modrm);
26877 assign( eV, getYMMReg(rE) );
26878 imm8 = (Int)getUChar(delta+1);
26879 delta += 1+1;
26880 DIP("vshufpd $%d,%s,%s,%s\n",
26881 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
26882 } else {
26883 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26884 assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
26885 imm8 = (Int)getUChar(delta+alen);
26886 delta += 1+alen;
26887 DIP("vshufpd $%d,%s,%s,%s\n",
26888 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
26890 IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
26891 putYMMReg( rG, mkexpr(res) );
26892 *uses_vvvv = True;
26893 goto decode_success;
26895 break;
26897 case 0xD0:
26898 /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
26899 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26900 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
26901 uses_vvvv, vbi, pfx, delta,
26902 "vaddsubpd", math_ADDSUBPD_128 );
26903 goto decode_success;
26905 /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
26906 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26907 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
26908 uses_vvvv, vbi, pfx, delta,
26909 "vaddsubpd", math_ADDSUBPD_256 );
26910 goto decode_success;
26912 /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
26913 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26914 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
26915 uses_vvvv, vbi, pfx, delta,
26916 "vaddsubps", math_ADDSUBPS_128 );
26917 goto decode_success;
26919 /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
26920 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26921 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
26922 uses_vvvv, vbi, pfx, delta,
26923 "vaddsubps", math_ADDSUBPS_256 );
26924 goto decode_success;
26926 break;
26928 case 0xD1:
26929 /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
26930 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26931 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26932 "vpsrlw", Iop_ShrN16x8 );
26933 *uses_vvvv = True;
26934 goto decode_success;
26937 /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
26938 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26939 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26940 "vpsrlw", Iop_ShrN16x16 );
26941 *uses_vvvv = True;
26942 goto decode_success;
26945 break;
26947 case 0xD2:
26948 /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
26949 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26950 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26951 "vpsrld", Iop_ShrN32x4 );
26952 *uses_vvvv = True;
26953 goto decode_success;
26955 /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
26956 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26957 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26958 "vpsrld", Iop_ShrN32x8 );
26959 *uses_vvvv = True;
26960 goto decode_success;
26962 break;
26964 case 0xD3:
26965 /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
26966 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26967 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26968 "vpsrlq", Iop_ShrN64x2 );
26969 *uses_vvvv = True;
26970 goto decode_success;
26972 /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
26973 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26974 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26975 "vpsrlq", Iop_ShrN64x4 );
26976 *uses_vvvv = True;
26977 goto decode_success;
26979 break;
26981 case 0xD4:
26982 /* VPADDQ r/m, rV, r ::: r = rV + r/m */
26983 /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
26984 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26985 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26986 uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
26987 goto decode_success;
26989 /* VPADDQ r/m, rV, r ::: r = rV + r/m */
26990 /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
26991 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26992 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26993 uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
26994 goto decode_success;
26996 break;
26998 case 0xD5:
26999 /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
27000 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27001 delta = dis_AVX128_E_V_to_G(
27002 uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
27003 goto decode_success;
27005 /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
27006 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27007 delta = dis_AVX256_E_V_to_G(
27008 uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
27009 goto decode_success;
27011 break;
27013 case 0xD6:
27014 /* I can't even find any Intel docs for this one. */
27015 /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
27016 xmm) to E (mem or lo half xmm). Looks like L==0(128), W==0
27017 (WIG, maybe?) */
27018 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
27019 && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
27020 UChar modrm = getUChar(delta);
27021 UInt rG = gregOfRexRM(pfx,modrm);
27022 if (epartIsReg(modrm)) {
27023 /* fall through, awaiting test case */
27024 /* dst: lo half copied, hi half zeroed */
27025 } else {
27026 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27027 storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
27028 DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
27029 delta += alen;
27030 goto decode_success;
27033 break;
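         /* For reference: the Intel SDM does document this encoding, as
            MOVQ xmm2/m64, xmm1 (66 0F D6 /r) and VMOVQ xmm1/m64, xmm2
            (VEX.128.66.0F.WIG D6 /r) -- a store of the low 64 bits of the
            G register.  The register-to-register form (left unhandled
            above, awaiting a test case) copies the low qword and zeroes
            the destination's upper bits. */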
27035 case 0xD7:
27036 /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
27037 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27038 delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
27039 goto decode_success;
27041    /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
27042 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27043 delta = dis_PMOVMSKB_256( vbi, pfx, delta );
27044 goto decode_success;
27046 break;
27048 case 0xD8:
27049 /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
27050 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27051 delta = dis_AVX128_E_V_to_G(
27052 uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
27053 goto decode_success;
27055 /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
27056 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27057 delta = dis_AVX256_E_V_to_G(
27058 uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
27059 goto decode_success;
27061 break;
27063 case 0xD9:
27064 /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
27065 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27066 delta = dis_AVX128_E_V_to_G(
27067 uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
27068 goto decode_success;
27070 /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
27071 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27072 delta = dis_AVX256_E_V_to_G(
27073 uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
27074 goto decode_success;
27076 break;
27078 case 0xDA:
27079 /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
27080 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27081 delta = dis_AVX128_E_V_to_G(
27082 uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
27083 goto decode_success;
27085 /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
27086 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27087 delta = dis_AVX256_E_V_to_G(
27088 uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
27089 goto decode_success;
27091 break;
27093 case 0xDB:
27094 /* VPAND r/m, rV, r ::: r = rV & r/m */
27095 /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
27096 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27097 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27098 uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
27099 goto decode_success;
27101 /* VPAND r/m, rV, r ::: r = rV & r/m */
27102 /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
27103 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27104 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27105 uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
27106 goto decode_success;
27108 break;
27110 case 0xDC:
27111 /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
27112 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27113 delta = dis_AVX128_E_V_to_G(
27114 uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
27115 goto decode_success;
27117 /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
27118 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27119 delta = dis_AVX256_E_V_to_G(
27120 uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
27121 goto decode_success;
27123 break;
27125 case 0xDD:
27126 /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
27127 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27128 delta = dis_AVX128_E_V_to_G(
27129 uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
27130 goto decode_success;
27132 /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
27133 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27134 delta = dis_AVX256_E_V_to_G(
27135 uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
27136 goto decode_success;
27138 break;
27140 case 0xDE:
27141 /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
27142 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27143 delta = dis_AVX128_E_V_to_G(
27144 uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
27145 goto decode_success;
27147 /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
27148 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27149 delta = dis_AVX256_E_V_to_G(
27150 uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
27151 goto decode_success;
27153 break;
27155 case 0xDF:
27156    /* VPANDN r/m, rV, r ::: r = ~rV & r/m (the vvvv operand is the one inverted) */
27157 /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
27158 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27159 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
27160 uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
27161 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
27162 goto decode_success;
27164    /* VPANDN r/m, rV, r ::: r = ~rV & r/m (the vvvv operand is the one inverted) */
27165 /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
27166 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27167 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
27168 uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
27169 NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
27170 goto decode_success;
27172 break;
27174 case 0xE0:
27175 /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
27176 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27177 delta = dis_AVX128_E_V_to_G(
27178 uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
27179 goto decode_success;
27181 /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
27182 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27183 delta = dis_AVX256_E_V_to_G(
27184 uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
27185 goto decode_success;
27187 break;
27189 case 0xE1:
27190 /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
27191 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27192 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27193 "vpsraw", Iop_SarN16x8 );
27194 *uses_vvvv = True;
27195 goto decode_success;
27197 /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
27198 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27199 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27200 "vpsraw", Iop_SarN16x16 );
27201 *uses_vvvv = True;
27202 goto decode_success;
27204 break;
27206 case 0xE2:
27207 /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
27208 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27209 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27210 "vpsrad", Iop_SarN32x4 );
27211 *uses_vvvv = True;
27212 goto decode_success;
27214 /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
27215 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27216 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27217 "vpsrad", Iop_SarN32x8 );
27218 *uses_vvvv = True;
27219 goto decode_success;
27221 break;
27223 case 0xE3:
27224 /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
27225 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27226 delta = dis_AVX128_E_V_to_G(
27227 uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
27228 goto decode_success;
27230 /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
27231 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27232 delta = dis_AVX256_E_V_to_G(
27233 uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
27234 goto decode_success;
27236 break;
27238 case 0xE4:
27239 /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
27240 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27241 delta = dis_AVX128_E_V_to_G(
27242 uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
27243 goto decode_success;
27245 /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
27246 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27247 delta = dis_AVX256_E_V_to_G(
27248 uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
27249 goto decode_success;
27251 break;
27253 case 0xE5:
27254 /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
27255 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27256 delta = dis_AVX128_E_V_to_G(
27257 uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
27258 goto decode_success;
27260 /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
27261 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27262 delta = dis_AVX256_E_V_to_G(
27263 uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
27264 goto decode_success;
27266 break;
27268 case 0xE6:
27269 /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
27270 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
27271 delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
27272 goto decode_success;
27274 /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
27275 if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
27276 delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
27277 goto decode_success;
27279 /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
27280 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27281 delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
27282 True/*r2zero*/);
27283 goto decode_success;
27285 /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
27286 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27287 delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
27288 goto decode_success;
27290 /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
27291 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27292 delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
27293 False/*!r2zero*/);
27294 goto decode_success;
27296 /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
27297 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27298 delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
27299 goto decode_success;
27301 break;
27303 case 0xE7:
27304 /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
27305 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27306 UChar modrm = getUChar(delta);
27307 UInt rG = gregOfRexRM(pfx,modrm);
27308 if (!epartIsReg(modrm)) {
27309 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27310 gen_SIGNAL_if_not_16_aligned( vbi, addr );
27311 storeLE( mkexpr(addr), getXMMReg(rG) );
27312 DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
27313 delta += alen;
27314 goto decode_success;
27316 /* else fall through */
27318 /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
27319 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27320 UChar modrm = getUChar(delta);
27321 UInt rG = gregOfRexRM(pfx,modrm);
27322 if (!epartIsReg(modrm)) {
27323 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27324 gen_SIGNAL_if_not_32_aligned( vbi, addr );
27325 storeLE( mkexpr(addr), getYMMReg(rG) );
27326 DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
27327 delta += alen;
27328 goto decode_success;
27330 /* else fall through */
27332 break;
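         /* MOVNTDQ/VMOVNTDQ are non-temporal stores and architecturally
            require a 16- (resp. 32-) byte aligned memory operand, hence
            the gen_SIGNAL_if_not_{16,32}_aligned calls above; the
            non-temporal (cache-bypassing) hint itself has no IR-level
            counterpart and is simply dropped. */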
27334 case 0xE8:
27335 /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
27336 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27337 delta = dis_AVX128_E_V_to_G(
27338 uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
27339 goto decode_success;
27341 /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
27342 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27343 delta = dis_AVX256_E_V_to_G(
27344 uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
27345 goto decode_success;
27347 break;
27349 case 0xE9:
27350 /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
27351 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27352 delta = dis_AVX128_E_V_to_G(
27353 uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
27354 goto decode_success;
27356 /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
27357 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27358 delta = dis_AVX256_E_V_to_G(
27359 uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
27360 goto decode_success;
27362 break;
27364 case 0xEA:
27365 /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
27366 /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
27367 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27368 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27369 uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
27370 goto decode_success;
27372 /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
27373 /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
27374 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27375 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27376 uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
27377 goto decode_success;
27379 break;
27381 case 0xEB:
27382 /* VPOR r/m, rV, r ::: r = rV | r/m */
27383 /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
27384 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27385 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27386 uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
27387 goto decode_success;
27389 /* VPOR r/m, rV, r ::: r = rV | r/m */
27390 /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
27391 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27392 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27393 uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
27394 goto decode_success;
27396 break;
27398 case 0xEC:
27399 /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
27400 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27401 delta = dis_AVX128_E_V_to_G(
27402 uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
27403 goto decode_success;
27405 /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
27406 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27407 delta = dis_AVX256_E_V_to_G(
27408 uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
27409 goto decode_success;
27411 break;
27413 case 0xED:
27414 /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
27415 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27416 delta = dis_AVX128_E_V_to_G(
27417 uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
27418 goto decode_success;
27420 /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
27421 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27422 delta = dis_AVX256_E_V_to_G(
27423 uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
27424 goto decode_success;
27426 break;
27428 case 0xEE:
27429 /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
27430 /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
27431 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27432 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27433 uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
27434 goto decode_success;
27436 /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
27437 /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
27438 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27439 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27440 uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
27441 goto decode_success;
27443 break;
27445 case 0xEF:
27446 /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
27447 /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
27448 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27449 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27450 uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
27451 goto decode_success;
27453 /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
27454 /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
27455 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27456 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27457 uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
27458 goto decode_success;
27460 break;
27462 case 0xF0:
27463 /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
27464 if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27465 UChar modrm = getUChar(delta);
27466 UInt rD = gregOfRexRM(pfx, modrm);
27467 IRTemp tD = newTemp(Ity_V256);
27468 if (epartIsReg(modrm)) break;
27469 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27470 delta += alen;
27471 assign(tD, loadLE(Ity_V256, mkexpr(addr)));
27472 DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
27473 putYMMReg(rD, mkexpr(tD));
27474 goto decode_success;
27476 /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
27477 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27478 UChar modrm = getUChar(delta);
27479 UInt rD = gregOfRexRM(pfx, modrm);
27480 IRTemp tD = newTemp(Ity_V128);
27481 if (epartIsReg(modrm)) break;
27482 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27483 delta += alen;
27484 assign(tD, loadLE(Ity_V128, mkexpr(addr)));
27485 DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
27486 putYMMRegLoAndZU(rD, mkexpr(tD));
27487 goto decode_success;
27489 break;
27491 case 0xF1:
27492 /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
27493 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27494 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27495 "vpsllw", Iop_ShlN16x8 );
27496 *uses_vvvv = True;
27497 goto decode_success;
27500 /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
27501 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27502 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27503 "vpsllw", Iop_ShlN16x16 );
27504 *uses_vvvv = True;
27505 goto decode_success;
27508 break;
27510 case 0xF2:
27511 /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
27512 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27513 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27514 "vpslld", Iop_ShlN32x4 );
27515 *uses_vvvv = True;
27516 goto decode_success;
27518 /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
27519 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27520 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27521 "vpslld", Iop_ShlN32x8 );
27522 *uses_vvvv = True;
27523 goto decode_success;
27525 break;
27527 case 0xF3:
27528 /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
27529 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27530 delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27531 "vpsllq", Iop_ShlN64x2 );
27532 *uses_vvvv = True;
27533 goto decode_success;
27535 /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
27536 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27537 delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27538 "vpsllq", Iop_ShlN64x4 );
27539 *uses_vvvv = True;
27540 goto decode_success;
27542 break;
27544 case 0xF4:
27545 /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
27546 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27547 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27548 uses_vvvv, vbi, pfx, delta,
27549 "vpmuludq", math_PMULUDQ_128 );
27550 goto decode_success;
27552 /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
27553 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27554 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27555 uses_vvvv, vbi, pfx, delta,
27556 "vpmuludq", math_PMULUDQ_256 );
27557 goto decode_success;
27559 break;
27561 case 0xF5:
27562 /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
27563 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27564 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27565 uses_vvvv, vbi, pfx, delta,
27566 "vpmaddwd", math_PMADDWD_128 );
27567 goto decode_success;
27569 /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
27570 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27571 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27572 uses_vvvv, vbi, pfx, delta,
27573 "vpmaddwd", math_PMADDWD_256 );
27574 goto decode_success;
27576 break;
27578 case 0xF6:
27579 /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
27580 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27581 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27582 uses_vvvv, vbi, pfx, delta,
27583 "vpsadbw", math_PSADBW_128 );
27584 goto decode_success;
27586 /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
27587 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27588 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27589 uses_vvvv, vbi, pfx, delta,
27590 "vpsadbw", math_PSADBW_256 );
27591 goto decode_success;
27593 break;
27595 case 0xF7:
27596 /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
27597 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
27598 && epartIsReg(getUChar(delta))) {
27599 delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
27600 goto decode_success;
27602 break;
27604 case 0xF8:
27605 /* VPSUBB r/m, rV, r ::: r = rV - r/m */
27606 /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
27607 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27608 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27609 uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
27610 goto decode_success;
27612 /* VPSUBB r/m, rV, r ::: r = rV - r/m */
27613 /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
27614 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27615 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27616 uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
27617 goto decode_success;
27619 break;
27621 case 0xF9:
27622 /* VPSUBW r/m, rV, r ::: r = rV - r/m */
27623 /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
27624 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27625 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27626 uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
27627 goto decode_success;
27629 /* VPSUBW r/m, rV, r ::: r = rV - r/m */
27630 /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
27631 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27632 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27633 uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
27634 goto decode_success;
27636 break;
27638 case 0xFA:
27639 /* VPSUBD r/m, rV, r ::: r = rV - r/m */
27640 /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
27641 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27642 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27643 uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
27644 goto decode_success;
27646 /* VPSUBD r/m, rV, r ::: r = rV - r/m */
27647 /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
27648 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27649 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27650 uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
27651 goto decode_success;
27653 break;
27655 case 0xFB:
27656 /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
27657 /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
27658 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27659 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27660 uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
27661 goto decode_success;
27663 /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
27664 /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
27665 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27666 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27667 uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
27668 goto decode_success;
27670 break;
27672 case 0xFC:
27673 /* VPADDB r/m, rV, r ::: r = rV + r/m */
27674 /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
27675 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27676 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27677 uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
27678 goto decode_success;
27680 /* VPADDB r/m, rV, r ::: r = rV + r/m */
27681 /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
27682 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27683 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27684 uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
27685 goto decode_success;
27687 break;
27689 case 0xFD:
27690 /* VPADDW r/m, rV, r ::: r = rV + r/m */
27691 /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
27692 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27693 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27694 uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
27695 goto decode_success;
27697 /* VPADDW r/m, rV, r ::: r = rV + r/m */
27698 /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
27699 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27700 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27701 uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
27702 goto decode_success;
27704 break;
27706 case 0xFE:
27707 /* VPADDD r/m, rV, r ::: r = rV + r/m */
27708 /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
27709 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27710 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27711 uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
27712 goto decode_success;
27714 /* VPADDD r/m, rV, r ::: r = rV + r/m */
27715 /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
27716 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27717 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27718 uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
27719 goto decode_success;
27721 break;
27723 default:
27724 break;
27728 //decode_failure:
27729 return deltaIN;
27731 decode_success:
27732 return delta;
27736 /*------------------------------------------------------------*/
27737 /*--- ---*/
27738 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX ---*/
27739 /*--- ---*/
27740 /*------------------------------------------------------------*/
27742 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
27744 /* In the control vector, zero out all but the bottom two bits of
27745 each 32-bit lane. */
27746 IRExpr* cv1 = binop(Iop_ShrN32x4,
27747 binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
27748 mkU8(30));
27749 /* And use the resulting cleaned-up control vector as steering
27750 in a Perm operation. */
27751 IRTemp res = newTemp(Ity_V128);
27752 assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
27753 return res;
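/* In math_PERMILPS_VAR_128 above, each 32-bit result lane i is
   dataV[ ctrlV.lane[i] & 3 ]: the Shl/Shr-by-30 pair implements the
   "& 3" masking, and Iop_Perm32x4 is assumed to perform the per-lane
   selection.  This is the variable-control form of VPERMILPS; for
   example, a control vector of {1,0,3,2} swaps the lanes pairwise. */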
27756 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
27758 IRTemp dHi, dLo, cHi, cLo;
27759 dHi = dLo = cHi = cLo = IRTemp_INVALID;
27760 breakupV256toV128s( dataV, &dHi, &dLo );
27761 breakupV256toV128s( ctrlV, &cHi, &cLo );
27762 IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
27763 IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
27764 IRTemp res = newTemp(Ity_V256);
27765 assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
27766 return res;
27769 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
27771 /* No cleverness here .. */
27772 IRTemp dHi, dLo, cHi, cLo;
27773 dHi = dLo = cHi = cLo = IRTemp_INVALID;
27774 breakupV128to64s( dataV, &dHi, &dLo );
27775 breakupV128to64s( ctrlV, &cHi, &cLo );
27776 IRExpr* rHi
27777 = IRExpr_ITE( unop(Iop_64to1,
27778 binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
27779 mkexpr(dHi), mkexpr(dLo) );
27780 IRExpr* rLo
27781 = IRExpr_ITE( unop(Iop_64to1,
27782 binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
27783 mkexpr(dHi), mkexpr(dLo) );
27784 IRTemp res = newTemp(Ity_V128);
27785 assign(res, binop(Iop_64HLtoV128, rHi, rLo));
27786 return res;
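/* In math_PERMILPD_VAR_128 above, bit 1 of each 64-bit control lane
   selects which qword of the data vector appears in that result lane
   (set -> high qword, clear -> low qword), which is the variable-control
   VPERMILPD selection rule from the SDM; the Shr64-by-1 / 64to1 sequence
   extracts exactly that bit. */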
27789 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
27791 IRTemp dHi, dLo, cHi, cLo;
27792 dHi = dLo = cHi = cLo = IRTemp_INVALID;
27793 breakupV256toV128s( dataV, &dHi, &dLo );
27794 breakupV256toV128s( ctrlV, &cHi, &cLo );
27795 IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
27796 IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
27797 IRTemp res = newTemp(Ity_V256);
27798 assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
27799 return res;
27802 static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
27804 /* In the control vector, zero out all but the bottom three bits of
27805 each 32-bit lane. */
27806 IRExpr* cv1 = binop(Iop_ShrN32x8,
27807 binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
27808 mkU8(29));
27809 /* And use the resulting cleaned-up control vector as steering
27810 in a Perm operation. */
27811 IRTemp res = newTemp(Ity_V256);
27812 assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
27813 return res;
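/* math_VPERMD: each 32-bit result lane i becomes
   dataV[ ctrlV.lane[i] & 7 ]; the Shl/Shr-by-29 pair performs the "& 7"
   masking and Iop_Perm32x8 is assumed to do the cross-lane selection.
   The same helper also serves VPERMPS (see the 0F38 16 decoder below),
   since the bit movement is identical for float data. */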
27816 static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
27817 const VexAbiInfo* vbi, Prefix pfx, Long delta,
27818 const HChar* opname, IROp op8 )
27820 HChar dis_buf[50];
27821 Int alen;
27822 Int size = getRexW(pfx) ? 8 : 4;
27823 IRType ty = szToITy(size);
27824 IRTemp src = newTemp(ty);
27825 IRTemp amt = newTemp(ty);
27826 UChar rm = getUChar(delta);
27828 assign( amt, getIRegV(size,pfx) );
27829 if (epartIsReg(rm)) {
27830 assign( src, getIRegE(size,pfx,rm) );
27831 DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
27832 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
27833 delta++;
27834 } else {
27835 IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27836 assign( src, loadLE(ty, mkexpr(addr)) );
27837 DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
27838 nameIRegG(size,pfx,rm));
27839 delta += alen;
27842 putIRegG( size, pfx, rm,
27843 binop(mkSizedOp(ty,op8), mkexpr(src),
27844 narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
27845 mkU(ty,8*size-1)))) );
27846 /* Flags aren't modified. */
27847 *uses_vvvv = True;
27848 return delta;
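/* dis_SHIFTX handles SARX/SHLX/SHRX: the vvvv register supplies the
   shift amount, which is masked to the operand width (amt & 31, or
   amt & 63 for 64-bit operands) before shifting, and -- unlike the
   classic shifts -- rflags are left untouched.  For example (AT&T
   syntax, as printed by the DIP above):
      shlx %rcx,%rbx,%rax   =>   rax = rbx << (rcx & 63)
*/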
27852 static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
27854 UChar modrm = getUChar(delta);
27855 UInt rG = gregOfRexRM(pfx, modrm);
27856 UInt rV = getVexNvvvv(pfx);
27857 Bool scalar = (opc & 0xF) > 7 && (opc & 1);
27858 IRType ty = getRexW(pfx) ? Ity_F64 : Ity_F32;
27859 IRType vty = scalar ? ty : (getVexL(pfx) ? Ity_V256 : Ity_V128);
27860 IRTemp addr = IRTemp_INVALID;
27861 HChar dis_buf[50];
27862 Int alen = 0;
27863 const HChar *name;
27864 const HChar *suffix;
27865 const HChar *order;
27866 Bool negateRes = False;
27867 Bool negateZeven = False;
27868 Bool negateZodd = False;
27869 UInt count = 0;
27871 switch (opc & 0xF) {
27872 case 0x6: name = "addsub"; negateZeven = True; break;
27873 case 0x7: name = "subadd"; negateZodd = True; break;
27874 case 0x8:
27875 case 0x9: name = "add"; break;
27876 case 0xA:
27877 case 0xB: name = "sub"; negateZeven = True; negateZodd = True;
27878 break;
27879 case 0xC:
27880 case 0xD: name = "add"; negateRes = True; negateZeven = True;
27881 negateZodd = True; break;
27882 case 0xE:
27883 case 0xF: name = "sub"; negateRes = True; break;
27884 default: vpanic("dis_FMA(amd64)"); break;
27886 switch (opc & 0xF0) {
27887 case 0x90: order = "132"; break;
27888 case 0xA0: order = "213"; break;
27889 case 0xB0: order = "231"; break;
27890 default: vpanic("dis_FMA(amd64)"); break;
27892 if (scalar) {
27893 suffix = ty == Ity_F64 ? "sd" : "ss";
27894 } else {
27895 suffix = ty == Ity_F64 ? "pd" : "ps";
27898 // Figure out |count| (the number of elements) by considering |vty| and |ty|.
27899 count = sizeofIRType(vty) / sizeofIRType(ty);
27900 vassert(count == 1 || count == 2 || count == 4 || count == 8);
27902 // Fetch operands into the first |count| elements of |sX|, |sY| and |sZ|.
27903 UInt i;
27904 IRExpr *sX[8], *sY[8], *sZ[8], *res[8];
27905 for (i = 0; i < 8; i++) sX[i] = sY[i] = sZ[i] = res[i] = NULL;
27907 IRExpr* (*getYMMRegLane)(UInt,Int)
27908 = ty == Ity_F32 ? getYMMRegLane32F : getYMMRegLane64F;
27909 void (*putYMMRegLane)(UInt,Int,IRExpr*)
27910 = ty == Ity_F32 ? putYMMRegLane32F : putYMMRegLane64F;
27912 for (i = 0; i < count; i++) {
27913 sX[i] = getYMMRegLane(rG, i);
27914 sZ[i] = getYMMRegLane(rV, i);
27917 if (epartIsReg(modrm)) {
27918 UInt rE = eregOfRexRM(pfx, modrm);
27919 delta += 1;
27920 for (i = 0; i < count; i++) {
27921 sY[i] = getYMMRegLane(rE, i);
27923 if (vty == Ity_V256) {
27924 DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27925 name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
27926 nameYMMReg(rG));
27927 } else {
27928 DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27929 name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
27930 nameXMMReg(rG));
27932 } else {
27933 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27934 delta += alen;
27935 for (i = 0; i < count; i++) {
27936 sY[i] = loadLE(ty, binop(Iop_Add64, mkexpr(addr),
27937 mkU64(i * sizeofIRType(ty))));
27939 if (vty == Ity_V256) {
27940 DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27941 name, order, suffix, dis_buf, nameYMMReg(rV),
27942 nameYMMReg(rG));
27943 } else {
27944 DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27945 name, order, suffix, dis_buf, nameXMMReg(rV),
27946 nameXMMReg(rG));
27950    /* sX/sY/sZ are now in 132 order.  If the instruction requires a different
27951 order, swap them around. */
27953 # define COPY_ARR(_dst, _src) \
27954 do { for (int j = 0; j < 8; j++) { _dst[j] = _src[j]; } } while (0)
27956 if ((opc & 0xF0) != 0x90) {
27957 IRExpr* temp[8];
27958 COPY_ARR(temp, sX);
27959 if ((opc & 0xF0) == 0xA0) {
27960 COPY_ARR(sX, sZ);
27961 COPY_ARR(sZ, sY);
27962 COPY_ARR(sY, temp);
27963 } else {
27964 COPY_ARR(sX, sZ);
27965 COPY_ARR(sZ, temp);
27969 # undef COPY_ARR
27971 for (i = 0; i < count; i++) {
27972 IROp opNEG = ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32;
27973 if ((i & 1) ? negateZodd : negateZeven) {
27974 sZ[i] = unop(opNEG, sZ[i]);
27976 res[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
27977 get_FAKE_roundingmode(), sX[i], sY[i], sZ[i]);
27978 if (negateRes) {
27979 res[i] = unop(opNEG, res[i]);
27983 for (i = 0; i < count; i++) {
27984 putYMMRegLane(rG, i, res[i]);
27987 switch (vty) {
27988 case Ity_F32: putYMMRegLane32(rG, 1, mkU32(0)); /*fallthru*/
27989 case Ity_F64: putYMMRegLane64(rG, 1, mkU64(0)); /*fallthru*/
27990 case Ity_V128: putYMMRegLane128(rG, 1, mkV128(0)); /*fallthru*/
27991 case Ity_V256: break;
27992 default: vassert(0);
27995 return delta;
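/* A note on dis_FMA's naming scheme: the "132"/"213"/"231" suffix gives
   the operand order fed to the multiply-add, in Intel operand numbering
   (1 = destination/G, 2 = vvvv/V, 3 = r/m/E).  The code fetches the
   operands in 132 order (sX = op1, sY = op3, sZ = op2, computing
   sX*sY + sZ) and rotates them for the 213 and 231 forms.  For example,
   vfmadd231pd %xmm3,%xmm2,%xmm1 computes, per lane,
   xmm1 = xmm2*xmm3 + xmm1.  The negateRes/negateZeven/negateZodd flags
   cover the fnm*, fmsub and fmaddsub/fmsubadd variants by negating the
   addend and/or the final result. */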
27999 /* Masked load or masked store. */
28000 static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
28001 Prefix pfx, Long delta,
28002 const HChar* opname, Bool isYMM, IRType ty,
28003 Bool isLoad )
28005 HChar dis_buf[50];
28006 Int alen, i;
28007 IRTemp addr;
28008 UChar modrm = getUChar(delta);
28009 UInt rG = gregOfRexRM(pfx,modrm);
28010 UInt rV = getVexNvvvv(pfx);
28012 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28013 delta += alen;
28015 /**/ if (isLoad && isYMM) {
28016 DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
28018 else if (isLoad && !isYMM) {
28019 DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
28022 else if (!isLoad && isYMM) {
28023 DIP("%s %s,%s,%s\n", opname, nameYMMReg(rG), nameYMMReg(rV), dis_buf );
28025 else {
28026 vassert(!isLoad && !isYMM);
28027 DIP("%s %s,%s,%s\n", opname, nameXMMReg(rG), nameXMMReg(rV), dis_buf );
28030 vassert(ty == Ity_I32 || ty == Ity_I64);
28031 Bool laneIs32 = ty == Ity_I32;
28033 Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
28035 for (i = 0; i < nLanes; i++) {
28036 IRExpr* shAmt = laneIs32 ? mkU8(31) : mkU8(63);
28037 IRExpr* one = laneIs32 ? mkU32(1) : mkU64(1);
28038 IROp opSHR = laneIs32 ? Iop_Shr32 : Iop_Shr64;
28039 IROp opEQ = laneIs32 ? Iop_CmpEQ32 : Iop_CmpEQ64;
28040 IRExpr* lane = (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i );
28042 IRTemp cond = newTemp(Ity_I1);
28043 assign(cond, binop(opEQ, binop(opSHR, lane, shAmt), one));
28045 IRTemp data = newTemp(ty);
28046 IRExpr* ea = binop(Iop_Add64, mkexpr(addr),
28047 mkU64(i * (laneIs32 ? 4 : 8)));
28048 if (isLoad) {
28049 stmt(
28050 IRStmt_LoadG(
28051 Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
28052 data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
28054 (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
28055 } else {
28056 assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
28057 stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
28061 if (isLoad && !isYMM)
28062 putYMMRegLane128( rG, 1, mkV128(0) );
28064 *uses_vvvv = True;
28065 return delta;
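/* In dis_VMASKMOV, the most significant bit of each mask lane (held in
   the vvvv register) decides whether that lane participates: guarded
   loads (IRStmt_LoadG) write the memory value into selected lanes and 0
   into the rest, and guarded stores (IRStmt_StoreG) simply skip
   unselected lanes, so masked-off elements never touch memory -- the
   architectural behaviour of the VMASKMOV family. */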
28069 /* Gather. */
28070 static ULong dis_VGATHER ( Bool *uses_vvvv, const VexAbiInfo* vbi,
28071 Prefix pfx, Long delta,
28072 const HChar* opname, Bool isYMM,
28073 Bool isVM64x, IRType ty )
28075 HChar dis_buf[50];
28076 Int alen, i, vscale, count1, count2;
28077 IRTemp addr;
28078 UChar modrm = getUChar(delta);
28079 UInt rG = gregOfRexRM(pfx,modrm);
28080 UInt rV = getVexNvvvv(pfx);
28081 UInt rI;
28082 IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
28083 IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
28084 IRTemp cond;
28085 addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
28086 idxTy, &vscale );
28087 if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
28088 return delta;
28089 if (dstTy == Ity_V256) {
28090 DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
28091 } else {
28092 DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
28094 delta += alen;
28096 if (ty == Ity_I32) {
28097 count1 = isYMM ? 8 : 4;
28098 count2 = isVM64x ? count1 / 2 : count1;
28099 } else {
28100 count1 = count2 = isYMM ? 4 : 2;
28103 /* First update the mask register to copies of the sign bit. */
28104 if (ty == Ity_I32) {
28105 if (isYMM)
28106 putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
28107 else
28108 putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
28109 } else {
28110 for (i = 0; i < count1; i++) {
28111 putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
28112 mkU8(63)) );
28116 /* Next gather the individual elements. If any fault occurs, the
28117 corresponding mask element will be set and the loop stops. */
28118 for (i = 0; i < count2; i++) {
28119 IRExpr *expr, *addr_expr;
28120 cond = newTemp(Ity_I1);
28121 assign( cond,
28122 binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
28123 ty == Ity_I32 ? getYMMRegLane32( rV, i )
28124 : getYMMRegLane64( rV, i ),
28125 mkU(ty, 0)) );
28126 expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
28127 : getYMMRegLane64( rG, i );
28128 addr_expr = isVM64x ? getYMMRegLane64( rI, i )
28129 : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
28130 switch (vscale) {
28131 case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
28132 case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
28133 case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
28134 default: break;
28136 addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
28137 addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
28138 addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
28139 expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
28140 if (ty == Ity_I32) {
28141 putYMMRegLane32( rG, i, expr );
28142 putYMMRegLane32( rV, i, mkU32(0) );
28143 } else {
28144 putYMMRegLane64( rG, i, expr);
28145 putYMMRegLane64( rV, i, mkU64(0) );
28149 if (!isYMM || (ty == Ity_I32 && isVM64x)) {
28150 if (ty == Ity_I64 || isYMM)
28151 putYMMRegLane128( rV, 1, mkV128(0) );
28152 else if (ty == Ity_I32 && count2 == 2) {
28153 putYMMRegLane64( rV, 1, mkU64(0) );
28154 putYMMRegLane64( rG, 1, mkU64(0) );
28156 putYMMRegLane128( rG, 1, mkV128(0) );
28159 *uses_vvvv = True;
28160 return delta;
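/* dis_VGATHER models the architectural gather protocol: the mask
   register is first widened to copies of each lane's sign bit, then each
   selected element is loaded and its mask lane cleared, so a fault
   part-way through leaves the mask describing exactly the elements still
   to be gathered.  For lanes that are not selected, the address
   expression is redirected to RSP -- assumed here to be a mapped
   address -- so the IR load inside the ITE cannot fault; its result is
   then discarded. */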
28164 __attribute__((noinline))
28165 static
28166 Long dis_ESC_0F38__VEX (
28167 /*MB_OUT*/DisResult* dres,
28168 /*OUT*/ Bool* uses_vvvv,
28169 const VexArchInfo* archinfo,
28170 const VexAbiInfo* vbi,
28171 Prefix pfx, Int sz, Long deltaIN
28174 IRTemp addr = IRTemp_INVALID;
28175 Int alen = 0;
28176 HChar dis_buf[50];
28177 Long delta = deltaIN;
28178 UChar opc = getUChar(delta);
28179 delta++;
28180 *uses_vvvv = False;
28182 switch (opc) {
28184 case 0x00:
28185 /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
28186 /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
28187 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28188 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28189 uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
28190 goto decode_success;
28192 /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
28193 /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
28194 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28195 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28196 uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
28197 goto decode_success;
28199 break;
28201 case 0x01:
28202 case 0x02:
28203 case 0x03:
28204 /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
28205 /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
28206 /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
28207 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28208 delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
28209 *uses_vvvv = True;
28210 goto decode_success;
28212 /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
28213 /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
28214 /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
28215 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28216 delta = dis_PHADD_256( vbi, pfx, delta, opc );
28217 *uses_vvvv = True;
28218 goto decode_success;
28220 break;
28222 case 0x04:
28223 /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
28224 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28225 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28226 uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
28227 math_PMADDUBSW_128 );
28228 goto decode_success;
28230 /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
28231 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28232 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28233 uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
28234 math_PMADDUBSW_256 );
28235 goto decode_success;
28237 break;
28239 case 0x05:
28240 case 0x06:
28241 case 0x07:
28242 /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
28243 /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
28244 /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
28245 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28246 delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
28247 *uses_vvvv = True;
28248 goto decode_success;
28250 /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
28251 /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
28252 /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
28253 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28254 delta = dis_PHADD_256( vbi, pfx, delta, opc );
28255 *uses_vvvv = True;
28256 goto decode_success;
28258 break;
28260 case 0x08:
28261 case 0x09:
28262 case 0x0A:
28263 /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
28264 /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
28265 /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
28266 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28267 IRTemp sV = newTemp(Ity_V128);
28268 IRTemp dV = newTemp(Ity_V128);
28269 IRTemp sHi, sLo, dHi, dLo;
28270 sHi = sLo = dHi = dLo = IRTemp_INVALID;
28271 HChar ch = '?';
28272 Int laneszB = 0;
28273 UChar modrm = getUChar(delta);
28274 UInt rG = gregOfRexRM(pfx,modrm);
28275 UInt rV = getVexNvvvv(pfx);
28277 switch (opc) {
28278 case 0x08: laneszB = 1; ch = 'b'; break;
28279 case 0x09: laneszB = 2; ch = 'w'; break;
28280 case 0x0A: laneszB = 4; ch = 'd'; break;
28281 default: vassert(0);
28284 assign( dV, getXMMReg(rV) );
28286 if (epartIsReg(modrm)) {
28287 UInt rE = eregOfRexRM(pfx,modrm);
28288 assign( sV, getXMMReg(rE) );
28289 delta += 1;
28290 DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
28291 nameXMMReg(rV), nameXMMReg(rG));
28292 } else {
28293 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28294 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
28295 delta += alen;
28296 DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
28297 nameXMMReg(rV), nameXMMReg(rG));
28300 breakupV128to64s( dV, &dHi, &dLo );
28301 breakupV128to64s( sV, &sHi, &sLo );
28303 putYMMRegLoAndZU(
28305 binop(Iop_64HLtoV128,
28306 dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
28307 dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
28310 *uses_vvvv = True;
28311 goto decode_success;
28313 /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
28314 /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
28315 /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
28316 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28317 IRTemp sV = newTemp(Ity_V256);
28318 IRTemp dV = newTemp(Ity_V256);
28319 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
28320 s3 = s2 = s1 = s0 = IRTemp_INVALID;
28321 d3 = d2 = d1 = d0 = IRTemp_INVALID;
28322          HChar  ch            = '?';
28323 Int laneszB = 0;
28324 UChar modrm = getUChar(delta);
28325 UInt rG = gregOfRexRM(pfx,modrm);
28326 UInt rV = getVexNvvvv(pfx);
28328 switch (opc) {
28329 case 0x08: laneszB = 1; ch = 'b'; break;
28330 case 0x09: laneszB = 2; ch = 'w'; break;
28331 case 0x0A: laneszB = 4; ch = 'd'; break;
28332 default: vassert(0);
28335 assign( dV, getYMMReg(rV) );
28337 if (epartIsReg(modrm)) {
28338 UInt rE = eregOfRexRM(pfx,modrm);
28339 assign( sV, getYMMReg(rE) );
28340 delta += 1;
28341 DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
28342 nameYMMReg(rV), nameYMMReg(rG));
28343 } else {
28344 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28345 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
28346 delta += alen;
28347 DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
28348 nameYMMReg(rV), nameYMMReg(rG));
28351 breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
28352 breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
28354 putYMMReg(
28356 binop( Iop_V128HLtoV256,
28357 binop(Iop_64HLtoV128,
28358 dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
28359 dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
28361 binop(Iop_64HLtoV128,
28362 dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
28363 dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
28367 *uses_vvvv = True;
28368 goto decode_success;
28370 break;
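         /* dis_PSIGN_helper (defined elsewhere in this file) is assumed
            to implement the PSIGNB/W/D rule per lane: if the source
            element is negative the destination element is negated, if it
            is zero the result is zero, and otherwise the destination
            element passes through unchanged. */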
28372 case 0x0B:
28373 /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
28374 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28375 IRTemp sV = newTemp(Ity_V128);
28376 IRTemp dV = newTemp(Ity_V128);
28377 IRTemp sHi, sLo, dHi, dLo;
28378 sHi = sLo = dHi = dLo = IRTemp_INVALID;
28379 UChar modrm = getUChar(delta);
28380 UInt rG = gregOfRexRM(pfx,modrm);
28381 UInt rV = getVexNvvvv(pfx);
28383 assign( dV, getXMMReg(rV) );
28385 if (epartIsReg(modrm)) {
28386 UInt rE = eregOfRexRM(pfx,modrm);
28387 assign( sV, getXMMReg(rE) );
28388 delta += 1;
28389 DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
28390 nameXMMReg(rV), nameXMMReg(rG));
28391 } else {
28392 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28393 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
28394 delta += alen;
28395 DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
28396 nameXMMReg(rV), nameXMMReg(rG));
28399 breakupV128to64s( dV, &dHi, &dLo );
28400 breakupV128to64s( sV, &sHi, &sLo );
28402 putYMMRegLoAndZU(
28404 binop(Iop_64HLtoV128,
28405 dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
28406 dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
28409 *uses_vvvv = True;
28410 goto decode_success;
28412 /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
28413 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28414 IRTemp sV = newTemp(Ity_V256);
28415 IRTemp dV = newTemp(Ity_V256);
28416 IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
28417 s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
28418 UChar modrm = getUChar(delta);
28419 UInt rG = gregOfRexRM(pfx,modrm);
28420 UInt rV = getVexNvvvv(pfx);
28422 assign( dV, getYMMReg(rV) );
28424 if (epartIsReg(modrm)) {
28425 UInt rE = eregOfRexRM(pfx,modrm);
28426 assign( sV, getYMMReg(rE) );
28427 delta += 1;
28428 DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
28429 nameYMMReg(rV), nameYMMReg(rG));
28430 } else {
28431 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28432 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
28433 delta += alen;
28434 DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
28435 nameYMMReg(rV), nameYMMReg(rG));
28438 breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
28439 breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
28441 putYMMReg(
28443 binop(Iop_V128HLtoV256,
28444 binop(Iop_64HLtoV128,
28445 dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
28446 dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
28447 binop(Iop_64HLtoV128,
28448 dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
28449 dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
28452 *uses_vvvv = True;
28454 goto decode_success;
28456 break;
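         /* dis_PMULHRSW_helper (defined elsewhere in this file) is
            assumed to compute, per 16-bit lane, the PMULHRSW rounding
            rule: roughly (a*b + 0x4000) >> 15 on the full 32-bit
            product, i.e. the high half of the product with
            round-to-nearest scaling. */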
28458 case 0x0C:
28459 /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
28460 if (have66noF2noF3(pfx)
28461 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
28462 UChar modrm = getUChar(delta);
28463 UInt rG = gregOfRexRM(pfx, modrm);
28464 UInt rV = getVexNvvvv(pfx);
28465 IRTemp ctrlV = newTemp(Ity_V128);
28466 if (epartIsReg(modrm)) {
28467 UInt rE = eregOfRexRM(pfx, modrm);
28468 delta += 1;
28469 DIP("vpermilps %s,%s,%s\n",
28470 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
28471 assign(ctrlV, getXMMReg(rE));
28472 } else {
28473 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28474 delta += alen;
28475 DIP("vpermilps %s,%s,%s\n",
28476 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
28477 assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
28479 IRTemp dataV = newTemp(Ity_V128);
28480 assign(dataV, getXMMReg(rV));
28481 IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
28482 putYMMRegLoAndZU(rG, mkexpr(resV));
28483 *uses_vvvv = True;
28484 goto decode_success;
28486 /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
28487 if (have66noF2noF3(pfx)
28488 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28489 UChar modrm = getUChar(delta);
28490 UInt rG = gregOfRexRM(pfx, modrm);
28491 UInt rV = getVexNvvvv(pfx);
28492 IRTemp ctrlV = newTemp(Ity_V256);
28493 if (epartIsReg(modrm)) {
28494 UInt rE = eregOfRexRM(pfx, modrm);
28495 delta += 1;
28496 DIP("vpermilps %s,%s,%s\n",
28497 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
28498 assign(ctrlV, getYMMReg(rE));
28499 } else {
28500 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28501 delta += alen;
28502 DIP("vpermilps %s,%s,%s\n",
28503 dis_buf, nameYMMReg(rV), nameYMMReg(rG));
28504 assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
28506 IRTemp dataV = newTemp(Ity_V256);
28507 assign(dataV, getYMMReg(rV));
28508 IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
28509 putYMMReg(rG, mkexpr(resV));
28510 *uses_vvvv = True;
28511 goto decode_success;
28513 break;
28515 case 0x0D:
28516 /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
28517 if (have66noF2noF3(pfx)
28518 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
28519 UChar modrm = getUChar(delta);
28520 UInt rG = gregOfRexRM(pfx, modrm);
28521 UInt rV = getVexNvvvv(pfx);
28522 IRTemp ctrlV = newTemp(Ity_V128);
28523 if (epartIsReg(modrm)) {
28524 UInt rE = eregOfRexRM(pfx, modrm);
28525 delta += 1;
28526 DIP("vpermilpd %s,%s,%s\n",
28527 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
28528 assign(ctrlV, getXMMReg(rE));
28529 } else {
28530 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28531 delta += alen;
28532 DIP("vpermilpd %s,%s,%s\n",
28533 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
28534 assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
28536 IRTemp dataV = newTemp(Ity_V128);
28537 assign(dataV, getXMMReg(rV));
28538 IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
28539 putYMMRegLoAndZU(rG, mkexpr(resV));
28540 *uses_vvvv = True;
28541 goto decode_success;
28543 /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
28544 if (have66noF2noF3(pfx)
28545 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28546 UChar modrm = getUChar(delta);
28547 UInt rG = gregOfRexRM(pfx, modrm);
28548 UInt rV = getVexNvvvv(pfx);
28549 IRTemp ctrlV = newTemp(Ity_V256);
28550 if (epartIsReg(modrm)) {
28551 UInt rE = eregOfRexRM(pfx, modrm);
28552 delta += 1;
28553 DIP("vpermilpd %s,%s,%s\n",
28554 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
28555 assign(ctrlV, getYMMReg(rE));
28556 } else {
28557 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28558 delta += alen;
28559 DIP("vpermilpd %s,%s,%s\n",
28560 dis_buf, nameYMMReg(rV), nameYMMReg(rG));
28561 assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
28563 IRTemp dataV = newTemp(Ity_V256);
28564 assign(dataV, getYMMReg(rV));
28565 IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
28566 putYMMReg(rG, mkexpr(resV));
28567 *uses_vvvv = True;
28568 goto decode_success;
28570 break;
28572 case 0x0E:
28573 /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
28574 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28575 delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
28576 goto decode_success;
28578 /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
28579 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28580 delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
28581 goto decode_success;
28583 break;
28585 case 0x0F:
28586 /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
28587 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28588 delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
28589 goto decode_success;
28591 /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
28592 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28593 delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
28594 goto decode_success;
28596 break;
28598 case 0x13:
28599 /* VCVTPH2PS xmm2/m64, xmm1 = VEX.128.66.0F38.W0 13 /r */
28600 if (have66noF2noF3(pfx)
28601 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/
28602 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
28603 delta = dis_VCVTPH2PS( vbi, pfx, delta, /*is256bit=*/False );
28604 goto decode_success;
28606    /* VCVTPH2PS xmm2/m128, ymm1 = VEX.256.66.0F38.W0 13 /r */
28607 if (have66noF2noF3(pfx)
28608 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/
28609 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
28610 delta = dis_VCVTPH2PS( vbi, pfx, delta, /*is256bit=*/True );
28611 goto decode_success;
28613 break;
28615 case 0x16:
28616 /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
28617 if (have66noF2noF3(pfx)
28618 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28619 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28620 uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
28621 goto decode_success;
28623 break;
28625 case 0x17:
28626 /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
28627 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28628 delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
28629 goto decode_success;
28631 /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
28632 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28633 delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
28634 goto decode_success;
28636 break;
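      /* Reference sketch (illustration only) of the flag behaviour shared by
         VPTEST/VTESTPS/VTESTPD, which dis_xTESTy_{128,256} implement.  The
         final argument selects which bits take part: 0 = all bits (VPTEST),
         32/64 = only the sign bits of each 32/64-bit lane (VTESTPS/VTESTPD).
         The helper name is hypothetical.

            static void ref_ptest ( unsigned long long dstHi, unsigned long long dstLo,
                                    unsigned long long srcHi, unsigned long long srcLo,
                                    int* zf, int* cf )
            {
               /* ZF <- (src AND dst) == 0 ;  CF <- (src AND NOT dst) == 0 */
               *zf = ((srcHi &  dstHi) | (srcLo &  dstLo)) == 0;
               *cf = ((srcHi & ~dstHi) | (srcLo & ~dstLo)) == 0;
            }

         OF/AF/PF/SF are cleared. */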
28638 case 0x18:
28639 /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
28640 if (have66noF2noF3(pfx)
28641 && 0==getVexL(pfx)/*128*/
28642 && !epartIsReg(getUChar(delta))) {
28643 UChar modrm = getUChar(delta);
28644 UInt rG = gregOfRexRM(pfx, modrm);
28645 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28646 delta += alen;
28647 DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
28648 IRTemp t32 = newTemp(Ity_I32);
28649 assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28650 IRTemp t64 = newTemp(Ity_I64);
28651 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28652 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28653 putYMMRegLoAndZU(rG, res);
28654 goto decode_success;
28656 /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
28657 if (have66noF2noF3(pfx)
28658 && 1==getVexL(pfx)/*256*/
28659 && !epartIsReg(getUChar(delta))) {
28660 UChar modrm = getUChar(delta);
28661 UInt rG = gregOfRexRM(pfx, modrm);
28662 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28663 delta += alen;
28664 DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
28665 IRTemp t32 = newTemp(Ity_I32);
28666 assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28667 IRTemp t64 = newTemp(Ity_I64);
28668 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28669 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28670 mkexpr(t64), mkexpr(t64));
28671 putYMMReg(rG, res);
28672 goto decode_success;
28674 /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
28675 if (have66noF2noF3(pfx)
28676 && 0==getVexL(pfx)/*128*/
28677 && epartIsReg(getUChar(delta))) {
28678 UChar modrm = getUChar(delta);
28679 UInt rG = gregOfRexRM(pfx, modrm);
28680 UInt rE = eregOfRexRM(pfx, modrm);
28681 DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28682 IRTemp t32 = newTemp(Ity_I32);
28683 assign(t32, getXMMRegLane32(rE, 0));
28684 IRTemp t64 = newTemp(Ity_I64);
28685 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28686 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28687 putYMMRegLoAndZU(rG, res);
28688 delta++;
28689 goto decode_success;
28691 /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
28692 if (have66noF2noF3(pfx)
28693 && 1==getVexL(pfx)/*256*/
28694 && epartIsReg(getUChar(delta))) {
28695 UChar modrm = getUChar(delta);
28696 UInt rG = gregOfRexRM(pfx, modrm);
28697 UInt rE = eregOfRexRM(pfx, modrm);
28698 DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28699 IRTemp t32 = newTemp(Ity_I32);
28700 assign(t32, getXMMRegLane32(rE, 0));
28701 IRTemp t64 = newTemp(Ity_I64);
28702 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28703 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28704 mkexpr(t64), mkexpr(t64));
28705 putYMMReg(rG, res);
28706 delta++;
28707 goto decode_success;
28709 break;
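      /* Illustration only: VBROADCASTSS replicates one 32-bit element into
         every lane of the destination.  A hypothetical scalar model:

            static void ref_broadcast32 ( unsigned dst[], int nLanes, unsigned src )
            {
               for (int i = 0; i < nLanes; i++)
                  dst[i] = src;           /* nLanes = 4 for xmm, 8 for ymm */
            }

         The IR above builds the replicated value with Iop_32HLto64 followed
         by Iop_64HLtoV128 (or Iop_64x4toV256 for the ymm form). */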
28711 case 0x19:
28712 /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
28713 if (have66noF2noF3(pfx)
28714 && 1==getVexL(pfx)/*256*/
28715 && !epartIsReg(getUChar(delta))) {
28716 UChar modrm = getUChar(delta);
28717 UInt rG = gregOfRexRM(pfx, modrm);
28718 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28719 delta += alen;
28720 DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
28721 IRTemp t64 = newTemp(Ity_I64);
28722 assign(t64, loadLE(Ity_I64, mkexpr(addr)));
28723 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28724 mkexpr(t64), mkexpr(t64));
28725 putYMMReg(rG, res);
28726 goto decode_success;
28728 /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
28729 if (have66noF2noF3(pfx)
28730 && 1==getVexL(pfx)/*256*/
28731 && epartIsReg(getUChar(delta))) {
28732 UChar modrm = getUChar(delta);
28733 UInt rG = gregOfRexRM(pfx, modrm);
28734 UInt rE = eregOfRexRM(pfx, modrm);
28735 DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28736 IRTemp t64 = newTemp(Ity_I64);
28737 assign(t64, getXMMRegLane64(rE, 0));
28738 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28739 mkexpr(t64), mkexpr(t64));
28740 putYMMReg(rG, res);
28741 delta++;
28742 goto decode_success;
28744 break;
28746 case 0x1A:
28747 /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
28748 if (have66noF2noF3(pfx)
28749 && 1==getVexL(pfx)/*256*/
28750 && !epartIsReg(getUChar(delta))) {
28751 UChar modrm = getUChar(delta);
28752 UInt rG = gregOfRexRM(pfx, modrm);
28753 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28754 delta += alen;
28755 DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
28756 IRTemp t128 = newTemp(Ity_V128);
28757 assign(t128, loadLE(Ity_V128, mkexpr(addr)));
28758 putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
28759 goto decode_success;
28761 break;
28763 case 0x1C:
28764 /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
28765 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28766 delta = dis_AVX128_E_to_G_unary(
28767 uses_vvvv, vbi, pfx, delta,
28768 "vpabsb", math_PABS_XMM_pap1 );
28769 goto decode_success;
28771 /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
28772 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28773 delta = dis_AVX256_E_to_G_unary(
28774 uses_vvvv, vbi, pfx, delta,
28775 "vpabsb", math_PABS_YMM_pap1 );
28776 goto decode_success;
28778 break;
28780 case 0x1D:
28781 /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
28782 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28783 delta = dis_AVX128_E_to_G_unary(
28784 uses_vvvv, vbi, pfx, delta,
28785 "vpabsw", math_PABS_XMM_pap2 );
28786 goto decode_success;
28788 /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
28789 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28790 delta = dis_AVX256_E_to_G_unary(
28791 uses_vvvv, vbi, pfx, delta,
28792 "vpabsw", math_PABS_YMM_pap2 );
28793 goto decode_success;
28795 break;
28797 case 0x1E:
28798 /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
28799 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28800 delta = dis_AVX128_E_to_G_unary(
28801 uses_vvvv, vbi, pfx, delta,
28802 "vpabsd", math_PABS_XMM_pap4 );
28803 goto decode_success;
28805 /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
28806 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28807 delta = dis_AVX256_E_to_G_unary(
28808 uses_vvvv, vbi, pfx, delta,
28809 "vpabsd", math_PABS_YMM_pap4 );
28810 goto decode_success;
28812 break;
28814 case 0x20:
28815 /* VPMOVSXBW xmm2/m64, xmm1 */
28816 /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
28817 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28818 delta = dis_PMOVxXBW_128( vbi, pfx, delta,
28819 True/*isAvx*/, False/*!xIsZ*/ );
28820 goto decode_success;
28822 /* VPMOVSXBW xmm2/m128, ymm1 */
28823 /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
28824 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28825 delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
28826 goto decode_success;
28828 break;
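      /* Illustration only: the PMOVxX family widens the lowest source
         elements, either sign-extending (SX) or zero-extending (ZX, the
         xIsZ flag above).  For example VPMOVSXBW takes the low 8 bytes and
         produces 8 signed 16-bit words; a hypothetical scalar model:

            static void ref_pmovsxbw ( short dst[8], const signed char src[8] )
            {
               for (int i = 0; i < 8; i++)
                  dst[i] = (short)src[i];   /* sign extension, 8 -> 16 bits */
            }
      */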
28830 case 0x21:
28831 /* VPMOVSXBD xmm2/m32, xmm1 */
28832 /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
28833 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28834 delta = dis_PMOVxXBD_128( vbi, pfx, delta,
28835 True/*isAvx*/, False/*!xIsZ*/ );
28836 goto decode_success;
28838 /* VPMOVSXBD xmm2/m64, ymm1 */
28839 /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
28840 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28841 delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
28842 goto decode_success;
28844 break;
28846 case 0x22:
28847 /* VPMOVSXBQ xmm2/m16, xmm1 */
28848 /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
28849 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28850 delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
28851 goto decode_success;
28853 /* VPMOVSXBQ xmm2/m32, ymm1 */
28854 /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
28855 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28856 delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
28857 goto decode_success;
28859 break;
28861 case 0x23:
28862 /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
28863 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28864 delta = dis_PMOVxXWD_128( vbi, pfx, delta,
28865 True/*isAvx*/, False/*!xIsZ*/ );
28866 goto decode_success;
28868 /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
28869 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28870 delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
28871 goto decode_success;
28873 break;
28875 case 0x24:
28876 /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
28877 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28878 delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
28879 goto decode_success;
28881 /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
28882 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28883 delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
28884 goto decode_success;
28886 break;
28888 case 0x25:
28889 /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
28890 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28891 delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
28892 True/*isAvx*/, False/*!xIsZ*/ );
28893 goto decode_success;
28895 /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
28896 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28897 delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
28898 goto decode_success;
28900 break;
28902 case 0x28:
28903 /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
28904 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28905 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28906 uses_vvvv, vbi, pfx, delta,
28907 "vpmuldq", math_PMULDQ_128 );
28908 goto decode_success;
28910 /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
28911 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28912 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28913 uses_vvvv, vbi, pfx, delta,
28914 "vpmuldq", math_PMULDQ_256 );
28915 goto decode_success;
28917 break;
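      /* Illustration only: PMULDQ multiplies the even-numbered signed 32-bit
         lanes (lanes 0 and 2 of each 128-bit half) and produces full 64-bit
         products.  A hypothetical scalar model for one 128-bit half:

            static void ref_pmuldq ( long long dst[2], const int a[4], const int b[4] )
            {
               dst[0] = (long long)a[0] * b[0];   /* low  qword <- lane 0 * lane 0 */
               dst[1] = (long long)a[2] * b[2];   /* high qword <- lane 2 * lane 2 */
            }
      */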
28919 case 0x29:
28920 /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
28921 /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
28922 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28923 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28924 uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
28925 goto decode_success;
28927 /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
28928 /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
28929 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28930 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28931 uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
28932 goto decode_success;
28934 break;
28936 case 0x2A:
28937 /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
28938 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28939 && !epartIsReg(getUChar(delta))) {
28940 UChar modrm = getUChar(delta);
28941 UInt rD = gregOfRexRM(pfx, modrm);
28942 IRTemp tD = newTemp(Ity_V128);
28943 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28944 delta += alen;
28945 gen_SIGNAL_if_not_16_aligned(vbi, addr);
28946 assign(tD, loadLE(Ity_V128, mkexpr(addr)));
28947 DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
28948 putYMMRegLoAndZU(rD, mkexpr(tD));
28949 goto decode_success;
28951 /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
28952 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28953 && !epartIsReg(getUChar(delta))) {
28954 UChar modrm = getUChar(delta);
28955 UInt rD = gregOfRexRM(pfx, modrm);
28956 IRTemp tD = newTemp(Ity_V256);
28957 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28958 delta += alen;
28959 gen_SIGNAL_if_not_32_aligned(vbi, addr);
28960 assign(tD, loadLE(Ity_V256, mkexpr(addr)));
28961 DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
28962 putYMMReg(rD, mkexpr(tD));
28963 goto decode_success;
28965 break;
28967 case 0x2B:
28968 /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
28969 /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
28970 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28971 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
28972 uses_vvvv, vbi, pfx, delta, "vpackusdw",
28973 Iop_QNarrowBin32Sto16Ux8, NULL,
28974 False/*!invertLeftArg*/, True/*swapArgs*/ );
28975 goto decode_success;
28977 /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
28978 /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
28979 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28980 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28981 uses_vvvv, vbi, pfx, delta, "vpackusdw",
28982 math_VPACKUSDW_YMM );
28983 goto decode_success;
28985 break;
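      /* Illustration only: PACKUSDW narrows signed 32-bit lanes to unsigned
         16-bit lanes with saturation, i.e. each value is clamped to
         [0, 65535] before truncation -- which is what
         Iop_QNarrowBin32Sto16Ux8 expresses.  A hypothetical model of the
         per-lane clamp:

            static unsigned short sat_s32_to_u16 ( int x )
            {
               if (x < 0)       return 0;
               if (x > 0xFFFF)  return 0xFFFF;
               return (unsigned short)x;
            }
      */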
28987 case 0x2C:
28988 /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2C /r */
28989 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28990 && 0==getRexW(pfx)/*W0*/
28991 && !epartIsReg(getUChar(delta))) {
28992 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
28993 /*!isYMM*/False, Ity_I32, /*isLoad*/True );
28994 goto decode_success;
28996 /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2C /r */
28997 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28998 && 0==getRexW(pfx)/*W0*/
28999 && !epartIsReg(getUChar(delta))) {
29000 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
29001 /*isYMM*/True, Ity_I32, /*isLoad*/True );
29002 goto decode_success;
29004 break;
29006 case 0x2D:
29007 /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2D /r */
29008 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29009 && 0==getRexW(pfx)/*W0*/
29010 && !epartIsReg(getUChar(delta))) {
29011 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
29012 /*!isYMM*/False, Ity_I64, /*isLoad*/True );
29013 goto decode_success;
29015 /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2D /r */
29016 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29017 && 0==getRexW(pfx)/*W0*/
29018 && !epartIsReg(getUChar(delta))) {
29019 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
29020 /*isYMM*/True, Ity_I64, /*isLoad*/True );
29021 goto decode_success;
29023 break;
29025 case 0x2E:
29026 /* VMASKMOVPS xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2E /r */
29027 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29028 && 0==getRexW(pfx)/*W0*/
29029 && !epartIsReg(getUChar(delta))) {
29030 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
29031 /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
29032 goto decode_success;
29034 /* VMASKMOVPS ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2E /r */
29035 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29036 && 0==getRexW(pfx)/*W0*/
29037 && !epartIsReg(getUChar(delta))) {
29038 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
29039 /*isYMM*/True, Ity_I32, /*!isLoad*/False );
29040 goto decode_success;
29042 break;
29044 case 0x2F:
29045 /* VMASKMOVPD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2F /r */
29046 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29047 && 0==getRexW(pfx)/*W0*/
29048 && !epartIsReg(getUChar(delta))) {
29049 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
29050 /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
29051 goto decode_success;
29053 /* VMASKMOVPD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2F /r */
29054 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29055 && 0==getRexW(pfx)/*W0*/
29056 && !epartIsReg(getUChar(delta))) {
29057 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
29058 /*isYMM*/True, Ity_I64, /*!isLoad*/False );
29059 goto decode_success;
29061 break;
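      /* Illustration only: the VMASKMOV forms (2C-2F) are conditional
         per-lane loads and stores.  A lane participates iff the sign bit
         (MSB) of the corresponding mask element is set; masked-out load
         lanes are zeroed in the destination and masked-out store lanes
         leave memory untouched.  Hypothetical scalar model of a
         32-bit-element masked load:

            static void ref_maskmov_load32 ( unsigned dst[], const unsigned mem[],
                                             const unsigned mask[], int nLanes )
            {
               for (int i = 0; i < nLanes; i++)
                  dst[i] = (mask[i] & 0x80000000u) ? mem[i] : 0;
            }
      */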
29063 case 0x30:
29064 /* VPMOVZXBW xmm2/m64, xmm1 */
29065 /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
29066 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29067 delta = dis_PMOVxXBW_128( vbi, pfx, delta,
29068 True/*isAvx*/, True/*xIsZ*/ );
29069 goto decode_success;
29071 /* VPMOVZXBW xmm2/m128, ymm1 */
29072 /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
29073 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29074 delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
29075 goto decode_success;
29077 break;
29079 case 0x31:
29080 /* VPMOVZXBD xmm2/m32, xmm1 */
29081 /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
29082 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29083 delta = dis_PMOVxXBD_128( vbi, pfx, delta,
29084 True/*isAvx*/, True/*xIsZ*/ );
29085 goto decode_success;
29087 /* VPMOVZXBD xmm2/m64, ymm1 */
29088 /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
29089 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29090 delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
29091 goto decode_success;
29093 break;
29095 case 0x32:
29096 /* VPMOVZXBQ xmm2/m16, xmm1 */
29097 /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
29098 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29099 delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
29100 goto decode_success;
29102 /* VPMOVZXBQ xmm2/m32, ymm1 */
29103 /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
29104 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29105 delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
29106 goto decode_success;
29108 break;
29110 case 0x33:
29111 /* VPMOVZXWD xmm2/m64, xmm1 */
29112 /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
29113 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29114 delta = dis_PMOVxXWD_128( vbi, pfx, delta,
29115 True/*isAvx*/, True/*xIsZ*/ );
29116 goto decode_success;
29118 /* VPMOVZXWD xmm2/m128, ymm1 */
29119 /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
29120 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29121 delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
29122 goto decode_success;
29124 break;
29126 case 0x34:
29127 /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
29128 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29129 delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
29130 goto decode_success;
29132 /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
29133 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29134 delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
29135 goto decode_success;
29137 break;
29139 case 0x35:
29140 /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
29141 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29142 delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
29143 True/*isAvx*/, True/*xIsZ*/ );
29144 goto decode_success;
29146 /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
29147 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29148 delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
29149 goto decode_success;
29151 break;
29153 case 0x36:
29154 /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
29155 if (have66noF2noF3(pfx)
29156 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
29157 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
29158 uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
29159 goto decode_success;
29161 break;
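      /* Illustration only: VPERMD is a full 8-lane permute across the whole
         256-bit register; only the low 3 bits of each control element are
         used.  Hypothetical scalar model:

            static void ref_vpermd ( unsigned dst[8], const unsigned src[8],
                                     const unsigned ctrl[8] )
            {
               for (int i = 0; i < 8; i++)
                  dst[i] = src[ ctrl[i] & 7 ];
            }

         The same math_VPERMD helper also serves VPERMPS (opcode 16 above),
         since the bit-level behaviour is identical. */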
29163 case 0x37:
29164 /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
29165 /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
29166 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29167 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29168 uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
29169 goto decode_success;
29171 /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
29172 /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
29173 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29174 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29175 uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
29176 goto decode_success;
29178 break;
29180 case 0x38:
29181 /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
29182 /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
29183 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29184 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29185 uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
29186 goto decode_success;
29188 /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
29189 /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
29190 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29191 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29192 uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
29193 goto decode_success;
29195 break;
29197 case 0x39:
29198 /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
29199 /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
29200 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29201 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29202 uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
29203 goto decode_success;
29205 /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
29206 /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
29207 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29208 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29209 uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
29210 goto decode_success;
29212 break;
29214 case 0x3A:
29215 /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
29216 /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
29217 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29218 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29219 uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
29220 goto decode_success;
29222 /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
29223 /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
29224 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29225 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29226 uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
29227 goto decode_success;
29229 break;
29231 case 0x3B:
29232 /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
29233 /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
29234 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29235 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29236 uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
29237 goto decode_success;
29239 /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
29240 /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
29241 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29242 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29243 uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
29244 goto decode_success;
29246 break;
29248 case 0x3C:
29249 /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
29250 /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
29251 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29252 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29253 uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
29254 goto decode_success;
29256 /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
29257 /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
29258 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29259 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29260 uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
29261 goto decode_success;
29263 break;
29265 case 0x3D:
29266 /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
29267 /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
29268 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29269 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29270 uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
29271 goto decode_success;
29273 /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
29274 /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
29275 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29276 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29277 uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
29278 goto decode_success;
29280 break;
29282 case 0x3E:
29283 /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
29284 /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
29285 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29286 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29287 uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
29288 goto decode_success;
29290 /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
29291 /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
29292 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29293 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29294 uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
29295 goto decode_success;
29297 break;
29299 case 0x3F:
29300 /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
29301 /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
29302 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29303 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29304 uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
29305 goto decode_success;
29307 /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
29308 /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
29309 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29310 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29311 uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
29312 goto decode_success;
29314 break;
29316 case 0x40:
29317 /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
29318 /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
29319 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29320 delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29321 uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
29322 goto decode_success;
29324 /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
29325 /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
29326 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29327 delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29328 uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
29329 goto decode_success;
29331 break;
29333 case 0x41:
29334 /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
29335 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29336 delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
29337 goto decode_success;
29339 break;
29341 case 0x45:
29342 /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
29343 /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
29344 if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29345 delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
29346 Iop_Shr32, 1==getVexL(pfx) );
29347 *uses_vvvv = True;
29348 goto decode_success;
29350 /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
29351 /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
29352 if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
29353 delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
29354 Iop_Shr64, 1==getVexL(pfx) );
29355 *uses_vvvv = True;
29356 goto decode_success;
29358 break;
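      /* Illustration only: the variable-shift group (45/46/47) shifts each
         lane by the count held in the corresponding lane of the second
         source.  For the logical shifts (VPSRLV*, VPSLLV*) a count greater
         than or equal to the lane width yields 0; the arithmetic VPSRAVD
         instead behaves as a shift by 31.  Hypothetical model of one
         VPSRLVD lane:

            static unsigned ref_psrlvd_lane ( unsigned x, unsigned count )
            {
               return (count > 31) ? 0u : (x >> count);
            }
      */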
29360 case 0x46:
29361 /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
29362 /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
29363 if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29364 delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
29365 Iop_Sar32, 1==getVexL(pfx) );
29366 *uses_vvvv = True;
29367 goto decode_success;
29369 break;
29371 case 0x47:
29372 /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
29373 /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
29374 if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29375 delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
29376 Iop_Shl32, 1==getVexL(pfx) );
29377 *uses_vvvv = True;
29378 goto decode_success;
29380 /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
29381 /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
29382 if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
29383 delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
29384 Iop_Shl64, 1==getVexL(pfx) );
29385 *uses_vvvv = True;
29386 goto decode_success;
29388 break;
29390 case 0x58:
29391 /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
29392 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29393 && 0==getRexW(pfx)/*W0*/) {
29394 UChar modrm = getUChar(delta);
29395 UInt rG = gregOfRexRM(pfx, modrm);
29396 IRTemp t32 = newTemp(Ity_I32);
29397 if (epartIsReg(modrm)) {
29398 UInt rE = eregOfRexRM(pfx, modrm);
29399 delta++;
29400 DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29401 assign(t32, getXMMRegLane32(rE, 0));
29402 } else {
29403 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29404 delta += alen;
29405 DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
29406 assign(t32, loadLE(Ity_I32, mkexpr(addr)));
29408 IRTemp t64 = newTemp(Ity_I64);
29409 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29410 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29411 putYMMRegLoAndZU(rG, res);
29412 goto decode_success;
29414 /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
29415 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29416 && 0==getRexW(pfx)/*W0*/) {
29417 UChar modrm = getUChar(delta);
29418 UInt rG = gregOfRexRM(pfx, modrm);
29419 IRTemp t32 = newTemp(Ity_I32);
29420 if (epartIsReg(modrm)) {
29421 UInt rE = eregOfRexRM(pfx, modrm);
29422 delta++;
29423 DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29424 assign(t32, getXMMRegLane32(rE, 0));
29425 } else {
29426 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29427 delta += alen;
29428 DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
29429 assign(t32, loadLE(Ity_I32, mkexpr(addr)));
29431 IRTemp t64 = newTemp(Ity_I64);
29432 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29433 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29434 mkexpr(t64), mkexpr(t64));
29435 putYMMReg(rG, res);
29436 goto decode_success;
29438 break;
29440 case 0x59:
29441 /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
29442 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29443 && 0==getRexW(pfx)/*W0*/) {
29444 UChar modrm = getUChar(delta);
29445 UInt rG = gregOfRexRM(pfx, modrm);
29446 IRTemp t64 = newTemp(Ity_I64);
29447 if (epartIsReg(modrm)) {
29448 UInt rE = eregOfRexRM(pfx, modrm);
29449 delta++;
29450 DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29451 assign(t64, getXMMRegLane64(rE, 0));
29452 } else {
29453 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29454 delta += alen;
29455 DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
29456 assign(t64, loadLE(Ity_I64, mkexpr(addr)));
29458 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29459 putYMMRegLoAndZU(rG, res);
29460 goto decode_success;
29462 /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
29463 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29464 && 0==getRexW(pfx)/*W0*/) {
29465 UChar modrm = getUChar(delta);
29466 UInt rG = gregOfRexRM(pfx, modrm);
29467 IRTemp t64 = newTemp(Ity_I64);
29468 if (epartIsReg(modrm)) {
29469 UInt rE = eregOfRexRM(pfx, modrm);
29470 delta++;
29471 DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29472 assign(t64, getXMMRegLane64(rE, 0));
29473 } else {
29474 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29475 delta += alen;
29476 DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
29477 assign(t64, loadLE(Ity_I64, mkexpr(addr)));
29479 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29480 mkexpr(t64), mkexpr(t64));
29481 putYMMReg(rG, res);
29482 goto decode_success;
29484 break;
29486 case 0x5A:
29487 /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
29488 if (have66noF2noF3(pfx)
29489 && 1==getVexL(pfx)/*256*/
29490 && !epartIsReg(getUChar(delta))) {
29491 UChar modrm = getUChar(delta);
29492 UInt rG = gregOfRexRM(pfx, modrm);
29493 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29494 delta += alen;
29495 DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
29496 IRTemp t128 = newTemp(Ity_V128);
29497 assign(t128, loadLE(Ity_V128, mkexpr(addr)));
29498 putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
29499 goto decode_success;
29501 break;
29503 case 0x78:
29504 /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
29505 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29506 && 0==getRexW(pfx)/*W0*/) {
29507 UChar modrm = getUChar(delta);
29508 UInt rG = gregOfRexRM(pfx, modrm);
29509 IRTemp t8 = newTemp(Ity_I8);
29510 if (epartIsReg(modrm)) {
29511 UInt rE = eregOfRexRM(pfx, modrm);
29512 delta++;
29513 DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29514 assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
29515 } else {
29516 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29517 delta += alen;
29518 DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
29519 assign(t8, loadLE(Ity_I8, mkexpr(addr)));
29521 IRTemp t16 = newTemp(Ity_I16);
29522 assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
29523 IRTemp t32 = newTemp(Ity_I32);
29524 assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29525 IRTemp t64 = newTemp(Ity_I64);
29526 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29527 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29528 putYMMRegLoAndZU(rG, res);
29529 goto decode_success;
29531 /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
29532 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29533 && 0==getRexW(pfx)/*W0*/) {
29534 UChar modrm = getUChar(delta);
29535 UInt rG = gregOfRexRM(pfx, modrm);
29536 IRTemp t8 = newTemp(Ity_I8);
29537 if (epartIsReg(modrm)) {
29538 UInt rE = eregOfRexRM(pfx, modrm);
29539 delta++;
29540 DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29541 assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
29542 } else {
29543 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29544 delta += alen;
29545 DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
29546 assign(t8, loadLE(Ity_I8, mkexpr(addr)));
29548 IRTemp t16 = newTemp(Ity_I16);
29549 assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
29550 IRTemp t32 = newTemp(Ity_I32);
29551 assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29552 IRTemp t64 = newTemp(Ity_I64);
29553 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29554 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29555 mkexpr(t64), mkexpr(t64));
29556 putYMMReg(rG, res);
29557 goto decode_success;
29559 break;
29561 case 0x79:
29562 /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
29563 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29564 && 0==getRexW(pfx)/*W0*/) {
29565 UChar modrm = getUChar(delta);
29566 UInt rG = gregOfRexRM(pfx, modrm);
29567 IRTemp t16 = newTemp(Ity_I16);
29568 if (epartIsReg(modrm)) {
29569 UInt rE = eregOfRexRM(pfx, modrm);
29570 delta++;
29571 DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29572 assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
29573 } else {
29574 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29575 delta += alen;
29576 DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
29577 assign(t16, loadLE(Ity_I16, mkexpr(addr)));
29579 IRTemp t32 = newTemp(Ity_I32);
29580 assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29581 IRTemp t64 = newTemp(Ity_I64);
29582 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29583 IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29584 putYMMRegLoAndZU(rG, res);
29585 goto decode_success;
29587 /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
29588 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29589 && 0==getRexW(pfx)/*W0*/) {
29590 UChar modrm = getUChar(delta);
29591 UInt rG = gregOfRexRM(pfx, modrm);
29592 IRTemp t16 = newTemp(Ity_I16);
29593 if (epartIsReg(modrm)) {
29594 UInt rE = eregOfRexRM(pfx, modrm);
29595 delta++;
29596 DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29597 assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
29598 } else {
29599 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29600 delta += alen;
29601 DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
29602 assign(t16, loadLE(Ity_I16, mkexpr(addr)));
29604 IRTemp t32 = newTemp(Ity_I32);
29605 assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29606 IRTemp t64 = newTemp(Ity_I64);
29607 assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29608 IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29609 mkexpr(t64), mkexpr(t64));
29610 putYMMReg(rG, res);
29611 goto decode_success;
29613 break;
29615 case 0x8C:
29616 /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
29617 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29618 && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29619 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29620 /*!isYMM*/False, Ity_I32, /*isLoad*/True );
29621 goto decode_success;
29623 /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
29624 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29625 && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29626 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29627 /*isYMM*/True, Ity_I32, /*isLoad*/True );
29628 goto decode_success;
29630 /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
29631 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29632 && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29633 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29634 /*!isYMM*/False, Ity_I64, /*isLoad*/True );
29635 goto decode_success;
29637 /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
29638 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29639 && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29640 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29641 /*isYMM*/True, Ity_I64, /*isLoad*/True );
29642 goto decode_success;
29644 break;
29646 case 0x8E:
29647 /* VPMASKMOVD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
29648 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29649 && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29650 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29651 /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
29652 goto decode_success;
29654 /* VPMASKMOVD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
29655 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29656 && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29657 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29658 /*isYMM*/True, Ity_I32, /*!isLoad*/False );
29659 goto decode_success;
29661 /* VPMASKMOVQ xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
29662 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29663 && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29664 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29665 /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
29666 goto decode_success;
29668 /* VPMASKMOVQ ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
29669 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29670 && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29671 delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29672 /*isYMM*/True, Ity_I64, /*!isLoad*/False );
29673 goto decode_success;
29675 break;
29677 case 0x90:
29678 /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
29679 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29680 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29681 Long delta0 = delta;
29682 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
29683 /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
29684 if (delta != delta0)
29685 goto decode_success;
29687 /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
29688 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29689 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29690 Long delta0 = delta;
29691 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
29692 /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
29693 if (delta != delta0)
29694 goto decode_success;
29696 /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
29697 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29698 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29699 Long delta0 = delta;
29700 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
29701 /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
29702 if (delta != delta0)
29703 goto decode_success;
29705 /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
29706 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29707 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29708 Long delta0 = delta;
29709 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
29710 /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
29711 if (delta != delta0)
29712 goto decode_success;
29714 break;
29716 case 0x91:
29717 /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
29718 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29719 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29720 Long delta0 = delta;
29721 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
29722 /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
29723 if (delta != delta0)
29724 goto decode_success;
29726 /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
29727 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29728 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29729 Long delta0 = delta;
29730 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
29731 /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
29732 if (delta != delta0)
29733 goto decode_success;
29735 /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
29736 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29737 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29738 Long delta0 = delta;
29739 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
29740 /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
29741 if (delta != delta0)
29742 goto decode_success;
29744 /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
29745 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29746 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29747 Long delta0 = delta;
29748 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
29749 /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
29750 if (delta != delta0)
29751 goto decode_success;
29753 break;
29755 case 0x92:
29756 /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
29757 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29758 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29759 Long delta0 = delta;
29760 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
29761 /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
29762 if (delta != delta0)
29763 goto decode_success;
29765 /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
29766 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29767 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29768 Long delta0 = delta;
29769 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
29770 /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
29771 if (delta != delta0)
29772 goto decode_success;
29774 /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
29775 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29776 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29777 Long delta0 = delta;
29778 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
29779 /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
29780 if (delta != delta0)
29781 goto decode_success;
29783 /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
29784 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29785 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29786 Long delta0 = delta;
29787 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
29788 /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
29789 if (delta != delta0)
29790 goto decode_success;
29792 break;
29794 case 0x93:
29795 /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
29796 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29797 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29798 Long delta0 = delta;
29799 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
29800 /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
29801 if (delta != delta0)
29802 goto decode_success;
29804 /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
29805 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29806 && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29807 Long delta0 = delta;
29808 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
29809 /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
29810 if (delta != delta0)
29811 goto decode_success;
29813 /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
29814 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29815 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29816 Long delta0 = delta;
29817 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
29818 /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
29819 if (delta != delta0)
29820 goto decode_success;
29822 /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
29823 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29824 && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29825 Long delta0 = delta;
29826 delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
29827 /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
29828 if (delta != delta0)
29829 goto decode_success;
29831 break;
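      /* Note on the gather cases (90-93): dis_VGATHER refuses encodings it
         considers invalid (for example, AVX2 requires the index, mask and
         destination registers to be pairwise distinct) and in that case
         returns with delta unchanged; that is why each arm compares delta
         against delta0 before declaring success.  Semantically, an element
         is loaded only if the sign bit of its mask element is set, and the
         mask element is cleared once that element has been fetched.
         Hypothetical scalar model of one 32-bit gather element:

            static void ref_gather32_elem ( unsigned* dst, unsigned* mask,
                                            const unsigned* elemAddr )
            {
               if (*mask & 0x80000000u) {
                  *dst  = *elemAddr;   /* element was gathered */
                  *mask = 0;           /* architecturally, the mask bit is cleared */
               }
            }
      */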
29833 case 0x96 ... 0x9F:
29834 case 0xA6 ... 0xAF:
29835 case 0xB6 ... 0xBF:
29836 /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
29837 /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
29838 /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
29839 /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
29840 /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
29841 /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
29842 /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
29843 /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
29844 /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
29845 /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
29846 /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
29847 /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
29848 /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
29849 /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
29850 /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
29851 /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
29852 /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
29853 /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
29854 /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
29855 /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
29856 /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
29857 /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
29858 /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
29859 /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
29860 /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
29861 /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
29862 /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
29863 /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
29864 /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
29865 /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
29866 /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
29867 /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
29868 /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
29869 /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
29870 /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
29871 /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
29872 /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
29873 /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
29874 /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
29875 /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
29876 /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
29877 /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
29878 /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
29879 /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
29880 /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
29881 /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
29882 /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
29883 /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
29884 /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
29885 /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
29886 /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
29887 /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
29888 /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
29889 /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
29890 /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
29891 /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
29892 /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
29893 /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
29894 /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
29895 /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
29896 /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
29897 /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
29898 /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
29899 /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
29900 /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
29901 /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
29902 /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
29903 /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
29904 /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
29905 /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
29906 /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
29907 /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
29908 /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
29909 /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
29910 /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
29911 /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
29912 /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
29913 /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
29914 /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
29915 /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
29916 /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
29917 /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
29918 /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
29919 /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
29920 /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
29921 /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
29922 /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
29923 /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
29924 /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
29925 /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
29926 /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
29927 /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
29928 /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
29929 /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
29930 /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
29931 /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
29932 if (have66noF2noF3(pfx)) {
29933 delta = dis_FMA( vbi, pfx, delta, opc );
29934 *uses_vvvv = True;
29935 dres->hint = Dis_HintVerbose;
29936 goto decode_success;
29938 break;
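      /* Note on the FMA family (96-9F, A6-AF, B6-BF) handled by dis_FMA:
         the digits encode the operand ordering.  Writing the three operands
         as dst (xmm1), src2 (vvvv) and src3 (r/m), the variants compute

            132:  dst = dst  * src3 + src2
            213:  dst = src2 * dst  + src3
            231:  dst = src2 * src3 + dst

         with the obvious sign changes for FMSUB/FNMADD/FNMSUB and the
         alternating ADDSUB/SUBADD forms.  Architecturally each product is
         fused, i.e. not rounded before the addition. */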
29940 case 0xDB:
29941 case 0xDC:
29942 case 0xDD:
29943 case 0xDE:
29944 case 0xDF:
29945 /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
29946 /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
29947 /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
29948 /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
29949 /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
29950 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29951 delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
29952 if (opc != 0xDB) *uses_vvvv = True;
29953 goto decode_success;
29955 break;
29957 case 0xF2:
29958 /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
29959 /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
29960 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29961 Int size = getRexW(pfx) ? 8 : 4;
29962 IRType ty = szToITy(size);
29963 IRTemp dst = newTemp(ty);
29964 IRTemp src1 = newTemp(ty);
29965 IRTemp src2 = newTemp(ty);
29966 UChar rm = getUChar(delta);
29968 assign( src1, getIRegV(size,pfx) );
29969 if (epartIsReg(rm)) {
29970 assign( src2, getIRegE(size,pfx,rm) );
29971 DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
29972 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29973 delta++;
29974 } else {
29975 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29976 assign( src2, loadLE(ty, mkexpr(addr)) );
29977 DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29978 nameIRegG(size,pfx,rm));
29979 delta += alen;
29982 assign( dst, binop( mkSizedOp(ty,Iop_And8),
29983 unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
29984 mkexpr(src2) ) );
29985 putIRegG( size, pfx, rm, mkexpr(dst) );
29986 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
29987 ? AMD64G_CC_OP_ANDN64
29988 : AMD64G_CC_OP_ANDN32)) );
29989 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29990 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
29991 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
29992 *uses_vvvv = True;
29993 goto decode_success;
29995 break;
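      /* Illustration only: ANDN computes dst = ~src1 & src2, where src1 is
         the vvvv register and src2 is r/m, exactly as the IR above builds
         it.  The flag thunk stores the result in CC_DEP1 so that SF and ZF
         can be recomputed from it later; the ISA defines OF and CF to be 0.
         Hypothetical scalar model:

            static unsigned long long ref_andn64 ( unsigned long long a,
                                                   unsigned long long b )
            {
               return ~a & b;   /* a = vvvv operand, b = r/m operand */
            }
      */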
29997 case 0xF3:
29998 /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
29999 /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
30000 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
30001 && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
30002 Int size = getRexW(pfx) ? 8 : 4;
30003 IRType ty = szToITy(size);
30004 IRTemp src = newTemp(ty);
30005 IRTemp dst = newTemp(ty);
30006 UChar rm = getUChar(delta);
30008 if (epartIsReg(rm)) {
30009 assign( src, getIRegE(size,pfx,rm) );
30010 DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
30011 nameIRegV(size,pfx));
30012 delta++;
30013 } else {
30014 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30015 assign( src, loadLE(ty, mkexpr(addr)) );
30016 DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
30017 delta += alen;
30020 assign( dst, binop(mkSizedOp(ty,Iop_And8),
30021 binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
30022 mkexpr(src)), mkexpr(src)) );
30023 putIRegV( size, pfx, mkexpr(dst) );
30024 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
30025 ? AMD64G_CC_OP_BLSI64
30026 : AMD64G_CC_OP_BLSI32)) );
30027 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30028 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
30029 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
30030 *uses_vvvv = True;
30031 goto decode_success;
30033 /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
30034 /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
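/* Reader's note (illustrative): BLSMSK produces a mask up to and including
   the lowest set bit, dst = src ^ (src - 1); e.g. 0b1011000 -> 0b0001111. */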
30035 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
30036 && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
30037 Int size = getRexW(pfx) ? 8 : 4;
30038 IRType ty = szToITy(size);
30039 IRTemp src = newTemp(ty);
30040 IRTemp dst = newTemp(ty);
30041 UChar rm = getUChar(delta);
30043 if (epartIsReg(rm)) {
30044 assign( src, getIRegE(size,pfx,rm) );
30045 DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
30046 nameIRegV(size,pfx));
30047 delta++;
30048 } else {
30049 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30050 assign( src, loadLE(ty, mkexpr(addr)) );
30051 DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
30052 delta += alen;
30055 assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
30056 binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
30057 mkU(ty, 1)), mkexpr(src)) );
30058 putIRegV( size, pfx, mkexpr(dst) );
30059 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
30060 ? AMD64G_CC_OP_BLSMSK64
30061 : AMD64G_CC_OP_BLSMSK32)) );
30062 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30063 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
30064 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
30065 *uses_vvvv = True;
30066 goto decode_success;
30068 /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
30069 /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
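/* Reader's note (illustrative): BLSR clears the lowest set bit,
   dst = src & (src - 1); e.g. 0b1011000 -> 0b1010000. */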
30070 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
30071 && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
30072 Int size = getRexW(pfx) ? 8 : 4;
30073 IRType ty = szToITy(size);
30074 IRTemp src = newTemp(ty);
30075 IRTemp dst = newTemp(ty);
30076 UChar rm = getUChar(delta);
30078 if (epartIsReg(rm)) {
30079 assign( src, getIRegE(size,pfx,rm) );
30080 DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
30081 nameIRegV(size,pfx));
30082 delta++;
30083 } else {
30084 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30085 assign( src, loadLE(ty, mkexpr(addr)) );
30086 DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
30087 delta += alen;
30090 assign( dst, binop(mkSizedOp(ty,Iop_And8),
30091 binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
30092 mkU(ty, 1)), mkexpr(src)) );
30093 putIRegV( size, pfx, mkexpr(dst) );
30094 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
30095 ? AMD64G_CC_OP_BLSR64
30096 : AMD64G_CC_OP_BLSR32)) );
30097 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30098 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
30099 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
30100 *uses_vvvv = True;
30101 goto decode_success;
30103 break;
30105 case 0xF5:
30106 /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
30107 /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
30108 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30109 Int size = getRexW(pfx) ? 8 : 4;
30110 IRType ty = szToITy(size);
30111 IRTemp dst = newTemp(ty);
30112 IRTemp src1 = newTemp(ty);
30113 IRTemp src2 = newTemp(ty);
30114 IRTemp start = newTemp(Ity_I8);
30115 IRTemp cond = newTemp(Ity_I1);
30116 UChar rm = getUChar(delta);
30118 assign( src2, getIRegV(size,pfx) );
30119 if (epartIsReg(rm)) {
30120 assign( src1, getIRegE(size,pfx,rm) );
30121 DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
30122 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
30123 delta++;
30124 } else {
30125 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30126 assign( src1, loadLE(ty, mkexpr(addr)) );
30127 DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
30128 nameIRegG(size,pfx,rm));
30129 delta += alen;
30132 assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
30133 assign( cond, binop(Iop_CmpLT32U,
30134 unop(Iop_8Uto32, mkexpr(start)),
30135 mkU32(8*size)) );
30136 /* if (start < opsize) {
30137 if (start == 0)
30138 dst = 0;
30139 else
30140 dst = (src1 << (opsize-start)) u>> (opsize-start);
30141 } else {
30142 dst = src1;
30143 } */
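/* Worked example (illustrative, not from the source): with size == 4 and
   start == 8, src1 = 0xDEADBEEF gives dst = 0x000000EF -- bits 8 and above
   are zeroed, matching the shift-up/shift-down sequence built below. */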
30144 assign( dst,
30145 IRExpr_ITE(
30146 mkexpr(cond),
30147 IRExpr_ITE(
30148 binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
30149 mkU(ty, 0),
30150 binop(
30151 mkSizedOp(ty,Iop_Shr8),
30152 binop(
30153 mkSizedOp(ty,Iop_Shl8),
30154 mkexpr(src1),
30155 binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
30156 ),
30157 binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
30158 )
30159 ),
30160 mkexpr(src1)
30161 )
30162 );
30163 putIRegG( size, pfx, rm, mkexpr(dst) );
30164 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
30165 ? AMD64G_CC_OP_BLSR64
30166 : AMD64G_CC_OP_BLSR32)) );
30167 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30168 stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
30169 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
30170 *uses_vvvv = True;
30171 goto decode_success;
30173 /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
30174 /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
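/* Reader's note (illustrative): PDEP scatters the low-order bits of src
   into the positions of the set bits of mask, e.g.
   pdep(src=0b101, mask=0b11010) == 0b10010.  The heavy lifting is done by
   the amd64g_calculate_pdep helper called below. */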
30175 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30176 Int size = getRexW(pfx) ? 8 : 4;
30177 IRType ty = szToITy(size);
30178 IRTemp src = newTemp(ty);
30179 IRTemp mask = newTemp(ty);
30180 UChar rm = getUChar(delta);
30182 assign( src, getIRegV(size,pfx) );
30183 if (epartIsReg(rm)) {
30184 assign( mask, getIRegE(size,pfx,rm) );
30185 DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
30186 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30187 delta++;
30188 } else {
30189 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30190 assign( mask, loadLE(ty, mkexpr(addr)) );
30191 DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30192 nameIRegG(size,pfx,rm));
30193 delta += alen;
30196 IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
30197 widenUto64(mkexpr(mask)) );
30198 putIRegG( size, pfx, rm,
30199 narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
30200 "amd64g_calculate_pdep",
30201 &amd64g_calculate_pdep, args)) );
30202 *uses_vvvv = True;
30203 /* Flags aren't modified. */
30204 goto decode_success;
30206 /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
30207 /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
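/* Reader's note (illustrative): PEXT is the inverse gather -- it packs the
   bits of src selected by mask into the low-order bits of the result, e.g.
   pext(src=0b10010, mask=0b11010) == 0b101.  It is implemented via the
   amd64g_calculate_pext helper below. */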
30208 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30209 Int size = getRexW(pfx) ? 8 : 4;
30210 IRType ty = szToITy(size);
30211 IRTemp src = newTemp(ty);
30212 IRTemp mask = newTemp(ty);
30213 UChar rm = getUChar(delta);
30215 assign( src, getIRegV(size,pfx) );
30216 if (epartIsReg(rm)) {
30217 assign( mask, getIRegE(size,pfx,rm) );
30218 DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
30219 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30220 delta++;
30221 } else {
30222 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30223 assign( mask, loadLE(ty, mkexpr(addr)) );
30224 DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30225 nameIRegG(size,pfx,rm));
30226 delta += alen;
30229 /* First mask off the bits not set in mask; they are ignored,
30230 so it is fine if they contain undefined values. */
30231 IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
30232 mkexpr(src), mkexpr(mask));
30233 IRExpr** args = mkIRExprVec_2( widenUto64(masked),
30234 widenUto64(mkexpr(mask)) );
30235 putIRegG( size, pfx, rm,
30236 narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
30237 "amd64g_calculate_pext",
30238 &amd64g_calculate_pext, args)) );
30239 *uses_vvvv = True;
30240 /* Flags aren't modified. */
30241 goto decode_success;
30243 break;
30245 case 0xF6:
30246 /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
30247 /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
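/* Reader's note (illustrative): MULX forms the full unsigned product of
   RDX (or EDX) and the r/m operand; the low half goes to the vvvv register
   and the high half to the ModRM reg field, and no flags are touched.
   That is what the MullU64/MullU32 and the two register writes below do. */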
30248 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30249 Int size = getRexW(pfx) ? 8 : 4;
30250 IRType ty = szToITy(size);
30251 IRTemp src1 = newTemp(ty);
30252 IRTemp src2 = newTemp(ty);
30253 IRTemp res = newTemp(size == 8 ? Ity_I128 : Ity_I64);
30254 UChar rm = getUChar(delta);
30256 assign( src1, getIRegRDX(size) );
30257 if (epartIsReg(rm)) {
30258 assign( src2, getIRegE(size,pfx,rm) );
30259 DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
30260 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30261 delta++;
30262 } else {
30263 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30264 assign( src2, loadLE(ty, mkexpr(addr)) );
30265 DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30266 nameIRegG(size,pfx,rm));
30267 delta += alen;
30270 assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
30271 mkexpr(src1), mkexpr(src2)) );
30272 putIRegV( size, pfx,
30273 unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
30274 putIRegG( size, pfx, rm,
30275 unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
30276 mkexpr(res)) );
30277 *uses_vvvv = True;
30278 /* Flags aren't modified. */
30279 goto decode_success;
30281 break;
30283 case 0xF7:
30284 /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
30285 /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
30286 if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30287 delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
30288 goto decode_success;
30290 /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
30291 /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
30292 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30293 delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
30294 goto decode_success;
30296 /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
30297 /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
30298 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30299 delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
30300 goto decode_success;
30302 /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
30303 /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
30304 if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30305 Int size = getRexW(pfx) ? 8 : 4;
30306 IRType ty = szToITy(size);
30307 IRTemp dst = newTemp(ty);
30308 IRTemp src1 = newTemp(ty);
30309 IRTemp src2 = newTemp(ty);
30310 IRTemp stle = newTemp(Ity_I16);
30311 IRTemp start = newTemp(Ity_I8);
30312 IRTemp len = newTemp(Ity_I8);
30313 UChar rm = getUChar(delta);
30315 assign( src2, getIRegV(size,pfx) );
30316 if (epartIsReg(rm)) {
30317 assign( src1, getIRegE(size,pfx,rm) );
30318 DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
30319 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
30320 delta++;
30321 } else {
30322 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30323 assign( src1, loadLE(ty, mkexpr(addr)) );
30324 DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
30325 nameIRegG(size,pfx,rm));
30326 delta += alen;
30329 assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
30330 assign( start, unop( Iop_16to8, mkexpr(stle) ) );
30331 assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
30332 /* if (start+len < opsize) {
30333 if (len != 0)
30334 dst = (src1 << (opsize-start-len)) u>> (opsize-len);
30335 else
30336 dst = 0;
30337 } else {
30338 if (start < opsize)
30339 dst = src1 u>> start;
30340 else
30341 dst = 0;
30342 } */
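/* Worked example (illustrative): start = 4, len = 8 on src1 = 0x12345678
   extracts bits [11:4], giving 0x67 -- i.e. (src1 >> start) masked to len
   bits, which is what the shift pair below computes. */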
30343 assign( dst,
30344 IRExpr_ITE(
30345 binop(Iop_CmpLT32U,
30346 binop(Iop_Add32,
30347 unop(Iop_8Uto32, mkexpr(start)),
30348 unop(Iop_8Uto32, mkexpr(len))),
30349 mkU32(8*size)),
30350 IRExpr_ITE(
30351 binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
30352 mkU(ty, 0),
30353 binop(mkSizedOp(ty,Iop_Shr8),
30354 binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
30355 binop(Iop_Sub8,
30356 binop(Iop_Sub8, mkU8(8*size),
30357 mkexpr(start)),
30358 mkexpr(len))),
30359 binop(Iop_Sub8, mkU8(8*size),
30360 mkexpr(len)))
30361 ),
30362 IRExpr_ITE(
30363 binop(Iop_CmpLT32U,
30364 unop(Iop_8Uto32, mkexpr(start)),
30365 mkU32(8*size)),
30366 binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
30367 mkexpr(start)),
30368 mkU(ty, 0)
30369 )
30370 )
30371 );
30372 putIRegG( size, pfx, rm, mkexpr(dst) );
30373 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8
30374 ? AMD64G_CC_OP_ANDN64
30375 : AMD64G_CC_OP_ANDN32)) );
30376 stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30377 stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
30378 stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
30379 *uses_vvvv = True;
30380 goto decode_success;
30382 break;
30384 default:
30385 break;
30386 }
30389 //decode_failure:
30390 return deltaIN;
30392 decode_success:
30393 return delta;
30394 }
30396 /* operand format:
30397 * [0] = dst
30398 * [n] = srcn
30399 */
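/* Reader's note (illustrative): this helper parses the AMD FMA4-style
   operand set, in which a fourth XMM register is encoded in the top four
   bits of the trailing immediate byte -- hence the getUChar(delta + 1) >> 4
   below for the count == 4 case. */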
30400 static Long decode_vregW(Int count, Long delta, UChar modrm, Prefix pfx,
30401 const VexAbiInfo* vbi, IRTemp *v, UInt *dst, Int swap)
30402 {
30403 v[0] = newTemp(Ity_V128);
30404 v[1] = newTemp(Ity_V128);
30405 v[2] = newTemp(Ity_V128);
30406 v[3] = newTemp(Ity_V128);
30407 IRTemp addr = IRTemp_INVALID;
30408 Int alen = 0;
30409 HChar dis_buf[50];
30411 *dst = gregOfRexRM(pfx, modrm);
30412 assign( v[0], getXMMReg(*dst) );
30414 if ( epartIsReg( modrm ) ) {
30415 UInt ereg = eregOfRexRM(pfx, modrm);
30416 assign(swap ? v[count-1] : v[count-2], getXMMReg(ereg) );
30417 DIS(dis_buf, "%s", nameXMMReg(ereg));
30418 } else {
30419 Bool extra_byte = (getUChar(delta - 3) & 0xF) != 9;
30420 addr = disAMode(&alen, vbi, pfx, delta, dis_buf, extra_byte);
30421 assign(swap ? v[count-1] : v[count-2], loadLE(Ity_V128, mkexpr(addr)));
30422 delta += alen - 1;
30423 }
30425 UInt vvvv = getVexNvvvv(pfx);
30426 switch(count) {
30427 case 2:
30428 DIP( "%s,%s", nameXMMReg(*dst), dis_buf );
30429 break;
30430 case 3:
30431 assign( swap ? v[1] : v[2], getXMMReg(vvvv) );
30432 DIP( "%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv), dis_buf );
30433 break;
30434 case 4:
30436 assign( v[1], getXMMReg(vvvv) );
30437 UInt src2 = getUChar(delta + 1) >> 4;
30438 assign( swap ? v[2] : v[3], getXMMReg(src2) );
30439 DIP( "%s,%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv),
30440 nameXMMReg(src2), dis_buf );
30442 break;
30443 }
30444 return delta + 1;
30445 }
30447 static Long dis_FMA4 (Prefix pfx, Long delta, UChar opc,
30448 Bool* uses_vvvv, const VexAbiInfo* vbi )
30449 {
30450 UInt dst;
30451 *uses_vvvv = True;
30453 UChar modrm = getUChar(delta);
30455 Bool zero_64F = False;
30456 Bool zero_96F = False;
30457 UInt is_F32 = ((opc & 0x01) == 0x00) ? 1 : 0;
30458 Bool neg = (opc & 0xF0) == 0x70;
30459 Bool alt = (opc & 0xF0) == 0x50;
30460 Bool sub = alt ? (opc & 0x0E) != 0x0E : (opc & 0x0C) == 0x0C;
30462 IRTemp operand[4];
30463 switch(opc & 0xF) {
30464 case 0x0A: zero_96F = (opc >> 4) != 0x05; break;
30465 case 0x0B: zero_64F = (opc >> 4) != 0x05; break;
30466 case 0x0E: zero_96F = (opc >> 4) != 0x05; break;
30467 case 0x0F: zero_64F = (opc >> 4) != 0x05; break;
30468 default: break;
30469 }
30470 DIP("vfm%s", neg ? "n" : "");
30471 if(alt) DIP("%s", sub ? "add" : "sub");
30472 DIP("%s", sub ? "sub" : "add");
30473 DIP("%c ", (zero_64F || zero_96F) ? 's' : 'p');
30474 DIP("%c ", is_F32 ? 's' : 'd');
30475 delta = decode_vregW(4, delta, modrm, pfx, vbi, operand, &dst, getRexW(pfx));
30476 DIP("\n");
30477 IRExpr *src[3];
30479 void (*putXMM[2])(UInt,Int,IRExpr*) = {&putXMMRegLane64F, &putXMMRegLane32F};
30481 IROp size_op[] = {Iop_V128to64, Iop_V128HIto64, Iop_64to32, Iop_64HIto32};
30482 IROp neg_op[] = {Iop_NegF64, Iop_NegF32};
30483 int i, j;
30484 for(i = 0; i < is_F32 * 2 + 2; i++) {
30485 for(j = 0; j < 3; j++) {
30486 if(is_F32) {
30487 src[j] = unop(Iop_ReinterpI32asF32,
30488 unop(size_op[i%2+2],
30489 unop(size_op[i/2],
30490 mkexpr(operand[j + 1])
30491 )));
30493 } else {
30494 src[j] = unop(Iop_ReinterpI64asF64,
30495 unop(size_op[i%2],
30496 mkexpr(operand[j + 1])
30497 ));
30498 }
30499 }
30500 putXMM[is_F32](dst, i, IRExpr_Qop(is_F32 ? Iop_MAddF32 : Iop_MAddF64,
30501 get_FAKE_roundingmode(),
30502 neg ? unop(neg_op[is_F32], src[0])
30503 : src[0],
30504 src[1],
30505 sub ? unop(neg_op[is_F32], src[2])
30506 : src[2]
30507 ));
30508 if(alt) {
30509 sub = !sub;
30510 }
30511 }
30513 /* Zero out top bits of ymm/xmm register. */
30514 putYMMRegLane128( dst, 1, mkV128(0) );
30516 if(zero_64F || zero_96F) {
30517 putXMMRegLane64( dst, 1, IRExpr_Const(IRConst_U64(0)));
30518 }
30520 if(zero_96F) {
30521 putXMMRegLane32( dst, 1, IRExpr_Const(IRConst_U32(0)));
30522 }
30524 return delta+1;
30525 }
30527 /*------------------------------------------------------------*/
30528 /*--- ---*/
30529 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX ---*/
30530 /*--- ---*/
30531 /*------------------------------------------------------------*/
30533 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
30535 vassert(imm8 < 256);
30536 IRTemp s3, s2, s1, s0;
30537 s3 = s2 = s1 = s0 = IRTemp_INVALID;
30538 breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
30539 # define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
30540 : ((_nn)==2) ? s2 : s3)
30541 IRTemp res = newTemp(Ity_V128);
30542 assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
30543 SEL((imm8 >> 4) & 3),
30544 SEL((imm8 >> 2) & 3),
30545 SEL((imm8 >> 0) & 3) ));
30546 # undef SEL
30547 return res;
30550 /* Handles 128 and 256 bit versions of VCVTPS2PH. */
30551 static Long dis_VCVTPS2PH ( const VexAbiInfo* vbi, Prefix pfx,
30552 Long delta, Bool is256bit )
30554 /* This is a width-halving store or reg-reg move that converts the data
30555 as it is transferred. */
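/* Reader's note (illustrative): the F16C conversion narrows 4 (128-bit) or
   8 (256-bit) single-precision lanes to half precision.  imm8 bit 2 selects
   MXCSR rounding; otherwise imm8[1:0] supplies the rounding mode directly,
   as set up for 'rm' below. */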
30556 UChar modrm = getUChar(delta);
30557 UInt rG = gregOfRexRM(pfx, modrm);
30558 IRTemp rm = newTemp(Ity_I32);
30559 IROp op = is256bit ? Iop_F32toF16x8 : Iop_F32toF16x4;
30560 IRExpr* srcG = (is256bit ? getYMMReg : getXMMReg)(rG);
30562 /* (imm & 3) contains an Intel-encoded rounding mode. Because that encoding
30563 is the same as the encoding for IRRoundingMode, we can use that value
30564 directly in the IR as a rounding mode. */
30566 if (epartIsReg(modrm)) {
30567 UInt rE = eregOfRexRM(pfx, modrm);
30568 delta += 1;
30569 UInt imm = getUChar(delta);
30570 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30571 IRExpr* res = binop(op, mkexpr(rm), srcG);
30572 if (!is256bit)
30573 res = unop(Iop_64UtoV128, res);
30574 putYMMRegLoAndZU(rE, res);
30575 DIP("vcvtps2ph $%u,%s,%s\n",
30576 imm, (is256bit ? nameYMMReg : nameXMMReg)(rG), nameXMMReg(rE));
30577 } else {
30578 Int alen = 0;
30579 HChar dis_buf[50];
30580 IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30581 delta += alen;
30582 UInt imm = getUChar(delta);
30583 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30584 IRExpr* res = binop(op, mkexpr(rm), srcG);
30585 storeLE(mkexpr(addr), res);
30586 DIP("vcvtps2ph $%u,%s,%s\n",
30587 imm, (is256bit ? nameYMMReg : nameXMMReg)(rG), dis_buf);
30589 delta++;
30590 /* doesn't use vvvv */
30591 return delta;
30594 __attribute__((noinline))
30595 static
30596 Long dis_ESC_0F3A__VEX (
30597 /*MB_OUT*/DisResult* dres,
30598 /*OUT*/ Bool* uses_vvvv,
30599 const VexArchInfo* archinfo,
30600 const VexAbiInfo* vbi,
30601 Prefix pfx, Int sz, Long deltaIN
30604 IRTemp addr = IRTemp_INVALID;
30605 Int alen = 0;
30606 HChar dis_buf[50];
30607 Long delta = deltaIN;
30608 UChar opc = getUChar(delta);
30609 delta++;
30610 *uses_vvvv = False;
30612 switch (opc) {
30614 case 0x00:
30615 case 0x01:
30616 /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
30617 /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
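/* Reader's note (illustrative): each 2-bit field of imm8 selects which
   source qword lands in the corresponding destination lane, so imm8 == 0x1B
   reverses the four 64-bit lanes.  The Qop over the broken-up lanes below
   implements exactly this selection. */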
30618 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
30619 && 1==getRexW(pfx)/*W1*/) {
30620 UChar modrm = getUChar(delta);
30621 UInt imm8 = 0;
30622 UInt rG = gregOfRexRM(pfx, modrm);
30623 IRTemp sV = newTemp(Ity_V256);
30624 const HChar *name = opc == 0 ? "vpermq" : "vpermpd";
30625 if (epartIsReg(modrm)) {
30626 UInt rE = eregOfRexRM(pfx, modrm);
30627 delta += 1;
30628 imm8 = getUChar(delta);
30629 DIP("%s $%u,%s,%s\n",
30630 name, imm8, nameYMMReg(rE), nameYMMReg(rG));
30631 assign(sV, getYMMReg(rE));
30632 } else {
30633 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30634 delta += alen;
30635 imm8 = getUChar(delta);
30636 DIP("%s $%u,%s,%s\n",
30637 name, imm8, dis_buf, nameYMMReg(rG));
30638 assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30640 delta++;
30641 IRTemp s[4];
30642 s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
30643 breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
30644 IRTemp dV = newTemp(Ity_V256);
30645 assign(dV, IRExpr_Qop(Iop_64x4toV256,
30646 mkexpr(s[(imm8 >> 6) & 3]),
30647 mkexpr(s[(imm8 >> 4) & 3]),
30648 mkexpr(s[(imm8 >> 2) & 3]),
30649 mkexpr(s[(imm8 >> 0) & 3])));
30650 putYMMReg(rG, mkexpr(dV));
30651 goto decode_success;
30653 break;
30655 case 0x02:
30656 /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
30657 if (have66noF2noF3(pfx)
30658 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30659 UChar modrm = getUChar(delta);
30660 UInt imm8 = 0;
30661 UInt rG = gregOfRexRM(pfx, modrm);
30662 UInt rV = getVexNvvvv(pfx);
30663 IRTemp sV = newTemp(Ity_V128);
30664 IRTemp dV = newTemp(Ity_V128);
30665 UInt i;
30666 IRTemp s[4], d[4];
30667 assign(sV, getXMMReg(rV));
30668 if (epartIsReg(modrm)) {
30669 UInt rE = eregOfRexRM(pfx, modrm);
30670 delta += 1;
30671 imm8 = getUChar(delta);
30672 DIP("vpblendd $%u,%s,%s,%s\n",
30673 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
30674 assign(dV, getXMMReg(rE));
30675 } else {
30676 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30677 delta += alen;
30678 imm8 = getUChar(delta);
30679 DIP("vpblendd $%u,%s,%s,%s\n",
30680 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
30681 assign(dV, loadLE(Ity_V128, mkexpr(addr)));
30683 delta++;
30684 for (i = 0; i < 4; i++) {
30685 s[i] = IRTemp_INVALID;
30686 d[i] = IRTemp_INVALID;
30688 breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
30689 breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
30690 for (i = 0; i < 4; i++)
30691 putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
30692 putYMMRegLane128(rG, 1, mkV128(0));
30693 *uses_vvvv = True;
30694 goto decode_success;
30696 /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
30697 if (have66noF2noF3(pfx)
30698 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30699 UChar modrm = getUChar(delta);
30700 UInt imm8 = 0;
30701 UInt rG = gregOfRexRM(pfx, modrm);
30702 UInt rV = getVexNvvvv(pfx);
30703 IRTemp sV = newTemp(Ity_V256);
30704 IRTemp dV = newTemp(Ity_V256);
30705 UInt i;
30706 IRTemp s[8], d[8];
30707 assign(sV, getYMMReg(rV));
30708 if (epartIsReg(modrm)) {
30709 UInt rE = eregOfRexRM(pfx, modrm);
30710 delta += 1;
30711 imm8 = getUChar(delta);
30712 DIP("vpblendd $%u,%s,%s,%s\n",
30713 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30714 assign(dV, getYMMReg(rE));
30715 } else {
30716 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30717 delta += alen;
30718 imm8 = getUChar(delta);
30719 DIP("vpblendd $%u,%s,%s,%s\n",
30720 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30721 assign(dV, loadLE(Ity_V256, mkexpr(addr)));
30723 delta++;
30724 for (i = 0; i < 8; i++) {
30725 s[i] = IRTemp_INVALID;
30726 d[i] = IRTemp_INVALID;
30728 breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
30729 &s[3], &s[2], &s[1], &s[0] );
30730 breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
30731 &d[3], &d[2], &d[1], &d[0] );
30732 for (i = 0; i < 8; i++)
30733 putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
30734 *uses_vvvv = True;
30735 goto decode_success;
30737 break;
30739 case 0x04:
30740 /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
30741 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30742 UChar modrm = getUChar(delta);
30743 UInt imm8 = 0;
30744 UInt rG = gregOfRexRM(pfx, modrm);
30745 IRTemp sV = newTemp(Ity_V256);
30746 if (epartIsReg(modrm)) {
30747 UInt rE = eregOfRexRM(pfx, modrm);
30748 delta += 1;
30749 imm8 = getUChar(delta);
30750 DIP("vpermilps $%u,%s,%s\n",
30751 imm8, nameYMMReg(rE), nameYMMReg(rG));
30752 assign(sV, getYMMReg(rE));
30753 } else {
30754 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30755 delta += alen;
30756 imm8 = getUChar(delta);
30757 DIP("vpermilps $%u,%s,%s\n",
30758 imm8, dis_buf, nameYMMReg(rG));
30759 assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30761 delta++;
30762 IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
30763 breakupV256toV128s( sV, &sVhi, &sVlo );
30764 IRTemp dVhi = math_VPERMILPS_128( sVhi, imm8 );
30765 IRTemp dVlo = math_VPERMILPS_128( sVlo, imm8 );
30766 IRExpr* res = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
30767 putYMMReg(rG, res);
30768 goto decode_success;
30770 /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
30771 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30772 UChar modrm = getUChar(delta);
30773 UInt imm8 = 0;
30774 UInt rG = gregOfRexRM(pfx, modrm);
30775 IRTemp sV = newTemp(Ity_V128);
30776 if (epartIsReg(modrm)) {
30777 UInt rE = eregOfRexRM(pfx, modrm);
30778 delta += 1;
30779 imm8 = getUChar(delta);
30780 DIP("vpermilps $%u,%s,%s\n",
30781 imm8, nameXMMReg(rE), nameXMMReg(rG));
30782 assign(sV, getXMMReg(rE));
30783 } else {
30784 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30785 delta += alen;
30786 imm8 = getUChar(delta);
30787 DIP("vpermilps $%u,%s,%s\n",
30788 imm8, dis_buf, nameXMMReg(rG));
30789 assign(sV, loadLE(Ity_V128, mkexpr(addr)));
30791 delta++;
30792 putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
30793 goto decode_success;
30795 break;
30797 case 0x05:
30798 /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
30799 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30800 UChar modrm = getUChar(delta);
30801 UInt imm8 = 0;
30802 UInt rG = gregOfRexRM(pfx, modrm);
30803 IRTemp sV = newTemp(Ity_V128);
30804 if (epartIsReg(modrm)) {
30805 UInt rE = eregOfRexRM(pfx, modrm);
30806 delta += 1;
30807 imm8 = getUChar(delta);
30808 DIP("vpermilpd $%u,%s,%s\n",
30809 imm8, nameXMMReg(rE), nameXMMReg(rG));
30810 assign(sV, getXMMReg(rE));
30811 } else {
30812 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30813 delta += alen;
30814 imm8 = getUChar(delta);
30815 DIP("vpermilpd $%u,%s,%s\n",
30816 imm8, dis_buf, nameXMMReg(rG));
30817 assign(sV, loadLE(Ity_V128, mkexpr(addr)));
30819 delta++;
30820 IRTemp s1 = newTemp(Ity_I64);
30821 IRTemp s0 = newTemp(Ity_I64);
30822 assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
30823 assign(s0, unop(Iop_V128to64, mkexpr(sV)));
30824 IRTemp dV = newTemp(Ity_V128);
30825 assign(dV, binop(Iop_64HLtoV128,
30826 mkexpr((imm8 & (1<<1)) ? s1 : s0),
30827 mkexpr((imm8 & (1<<0)) ? s1 : s0)));
30828 putYMMRegLoAndZU(rG, mkexpr(dV));
30829 goto decode_success;
30831 /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
30832 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30833 UChar modrm = getUChar(delta);
30834 UInt imm8 = 0;
30835 UInt rG = gregOfRexRM(pfx, modrm);
30836 IRTemp sV = newTemp(Ity_V256);
30837 if (epartIsReg(modrm)) {
30838 UInt rE = eregOfRexRM(pfx, modrm);
30839 delta += 1;
30840 imm8 = getUChar(delta);
30841 DIP("vpermilpd $%u,%s,%s\n",
30842 imm8, nameYMMReg(rE), nameYMMReg(rG));
30843 assign(sV, getYMMReg(rE));
30844 } else {
30845 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30846 delta += alen;
30847 imm8 = getUChar(delta);
30848 DIP("vpermilpd $%u,%s,%s\n",
30849 imm8, dis_buf, nameYMMReg(rG));
30850 assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30852 delta++;
30853 IRTemp s3, s2, s1, s0;
30854 s3 = s2 = s1 = s0 = IRTemp_INVALID;
30855 breakupV256to64s(sV, &s3, &s2, &s1, &s0);
30856 IRTemp dV = newTemp(Ity_V256);
30857 assign(dV, IRExpr_Qop(Iop_64x4toV256,
30858 mkexpr((imm8 & (1<<3)) ? s3 : s2),
30859 mkexpr((imm8 & (1<<2)) ? s3 : s2),
30860 mkexpr((imm8 & (1<<1)) ? s1 : s0),
30861 mkexpr((imm8 & (1<<0)) ? s1 : s0)));
30862 putYMMReg(rG, mkexpr(dV));
30863 goto decode_success;
30865 break;
30867 case 0x06:
30868 /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
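/* Reader's note (illustrative): imm8[1:0] picks the 128-bit lane written to
   the low half of the destination and imm8[5:4] the lane for the high half;
   bits 3 and 7 force the respective half to zero, as the two conditional
   stores at the end of this case show. */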
30869 if (have66noF2noF3(pfx)
30870 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30871 UChar modrm = getUChar(delta);
30872 UInt imm8 = 0;
30873 UInt rG = gregOfRexRM(pfx, modrm);
30874 UInt rV = getVexNvvvv(pfx);
30875 IRTemp s00 = newTemp(Ity_V128);
30876 IRTemp s01 = newTemp(Ity_V128);
30877 IRTemp s10 = newTemp(Ity_V128);
30878 IRTemp s11 = newTemp(Ity_V128);
30879 assign(s00, getYMMRegLane128(rV, 0));
30880 assign(s01, getYMMRegLane128(rV, 1));
30881 if (epartIsReg(modrm)) {
30882 UInt rE = eregOfRexRM(pfx, modrm);
30883 delta += 1;
30884 imm8 = getUChar(delta);
30885 DIP("vperm2f128 $%u,%s,%s,%s\n",
30886 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30887 assign(s10, getYMMRegLane128(rE, 0));
30888 assign(s11, getYMMRegLane128(rE, 1));
30889 } else {
30890 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30891 delta += alen;
30892 imm8 = getUChar(delta);
30893 DIP("vperm2f128 $%u,%s,%s,%s\n",
30894 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30895 assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
30896 mkexpr(addr), mkU64(0))));
30897 assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
30898 mkexpr(addr), mkU64(16))));
30900 delta++;
30901 # define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
30902 : ((_nn)==2) ? s10 : s11)
30903 putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
30904 putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
30905 # undef SEL
30906 if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
30907 if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
30908 *uses_vvvv = True;
30909 goto decode_success;
30911 break;
30913 case 0x08:
30914 /* VROUNDPS imm8, xmm2/m128, xmm1 */
30915 /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
30916 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30917 UChar modrm = getUChar(delta);
30918 UInt rG = gregOfRexRM(pfx, modrm);
30919 IRTemp src = newTemp(Ity_V128);
30920 IRTemp s0 = IRTemp_INVALID;
30921 IRTemp s1 = IRTemp_INVALID;
30922 IRTemp s2 = IRTemp_INVALID;
30923 IRTemp s3 = IRTemp_INVALID;
30924 IRTemp rm = newTemp(Ity_I32);
30925 Int imm = 0;
30927 modrm = getUChar(delta);
30929 if (epartIsReg(modrm)) {
30930 UInt rE = eregOfRexRM(pfx, modrm);
30931 assign( src, getXMMReg( rE ) );
30932 imm = getUChar(delta+1);
30933 if (imm & ~15) break;
30934 delta += 1+1;
30935 DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
30936 } else {
30937 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30938 assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
30939 imm = getUChar(delta+alen);
30940 if (imm & ~15) break;
30941 delta += alen+1;
30942 DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
30945 /* (imm & 3) contains an Intel-encoded rounding mode. Because
30946 that encoding is the same as the encoding for IRRoundingMode,
30947 we can use that value directly in the IR as a rounding
30948 mode. */
30949 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30951 breakupV128to32s( src, &s3, &s2, &s1, &s0 );
30952 putYMMRegLane128( rG, 1, mkV128(0) );
30953 # define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
30954 unop(Iop_ReinterpI32asF32, mkexpr(s)))
30955 putYMMRegLane32F( rG, 3, CVT(s3) );
30956 putYMMRegLane32F( rG, 2, CVT(s2) );
30957 putYMMRegLane32F( rG, 1, CVT(s1) );
30958 putYMMRegLane32F( rG, 0, CVT(s0) );
30959 # undef CVT
30960 goto decode_success;
30962 /* VROUNDPS imm8, ymm2/m256, ymm1 */
30963 /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
30964 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30965 UChar modrm = getUChar(delta);
30966 UInt rG = gregOfRexRM(pfx, modrm);
30967 IRTemp src = newTemp(Ity_V256);
30968 IRTemp s0 = IRTemp_INVALID;
30969 IRTemp s1 = IRTemp_INVALID;
30970 IRTemp s2 = IRTemp_INVALID;
30971 IRTemp s3 = IRTemp_INVALID;
30972 IRTemp s4 = IRTemp_INVALID;
30973 IRTemp s5 = IRTemp_INVALID;
30974 IRTemp s6 = IRTemp_INVALID;
30975 IRTemp s7 = IRTemp_INVALID;
30976 IRTemp rm = newTemp(Ity_I32);
30977 Int imm = 0;
30979 modrm = getUChar(delta);
30981 if (epartIsReg(modrm)) {
30982 UInt rE = eregOfRexRM(pfx, modrm);
30983 assign( src, getYMMReg( rE ) );
30984 imm = getUChar(delta+1);
30985 if (imm & ~15) break;
30986 delta += 1+1;
30987 DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
30988 } else {
30989 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30990 assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
30991 imm = getUChar(delta+alen);
30992 if (imm & ~15) break;
30993 delta += alen+1;
30994 DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
30997 /* (imm & 3) contains an Intel-encoded rounding mode. Because
30998 that encoding is the same as the encoding for IRRoundingMode,
30999 we can use that value directly in the IR as a rounding
31000 mode. */
31001 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
31003 breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
31004 # define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
31005 unop(Iop_ReinterpI32asF32, mkexpr(s)))
31006 putYMMRegLane32F( rG, 7, CVT(s7) );
31007 putYMMRegLane32F( rG, 6, CVT(s6) );
31008 putYMMRegLane32F( rG, 5, CVT(s5) );
31009 putYMMRegLane32F( rG, 4, CVT(s4) );
31010 putYMMRegLane32F( rG, 3, CVT(s3) );
31011 putYMMRegLane32F( rG, 2, CVT(s2) );
31012 putYMMRegLane32F( rG, 1, CVT(s1) );
31013 putYMMRegLane32F( rG, 0, CVT(s0) );
31014 # undef CVT
31015 goto decode_success;
31017 break;
31019 case 0x09:
31020 /* VROUNDPD imm8, xmm2/m128, xmm1 */
31021 /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
31022 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31023 UChar modrm = getUChar(delta);
31024 UInt rG = gregOfRexRM(pfx, modrm);
31025 IRTemp src = newTemp(Ity_V128);
31026 IRTemp s0 = IRTemp_INVALID;
31027 IRTemp s1 = IRTemp_INVALID;
31028 IRTemp rm = newTemp(Ity_I32);
31029 Int imm = 0;
31031 modrm = getUChar(delta);
31033 if (epartIsReg(modrm)) {
31034 UInt rE = eregOfRexRM(pfx, modrm);
31035 assign( src, getXMMReg( rE ) );
31036 imm = getUChar(delta+1);
31037 if (imm & ~15) break;
31038 delta += 1+1;
31039 DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
31040 } else {
31041 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31042 assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
31043 imm = getUChar(delta+alen);
31044 if (imm & ~15) break;
31045 delta += alen+1;
31046 DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
31049 /* (imm & 3) contains an Intel-encoded rounding mode. Because
31050 that encoding is the same as the encoding for IRRoundingMode,
31051 we can use that value directly in the IR as a rounding
31052 mode. */
31053 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
31055 breakupV128to64s( src, &s1, &s0 );
31056 putYMMRegLane128( rG, 1, mkV128(0) );
31057 # define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
31058 unop(Iop_ReinterpI64asF64, mkexpr(s)))
31059 putYMMRegLane64F( rG, 1, CVT(s1) );
31060 putYMMRegLane64F( rG, 0, CVT(s0) );
31061 # undef CVT
31062 goto decode_success;
31064 /* VROUNDPD imm8, ymm2/m256, ymm1 */
31065 /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
31066 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31067 UChar modrm = getUChar(delta);
31068 UInt rG = gregOfRexRM(pfx, modrm);
31069 IRTemp src = newTemp(Ity_V256);
31070 IRTemp s0 = IRTemp_INVALID;
31071 IRTemp s1 = IRTemp_INVALID;
31072 IRTemp s2 = IRTemp_INVALID;
31073 IRTemp s3 = IRTemp_INVALID;
31074 IRTemp rm = newTemp(Ity_I32);
31075 Int imm = 0;
31077 modrm = getUChar(delta);
31079 if (epartIsReg(modrm)) {
31080 UInt rE = eregOfRexRM(pfx, modrm);
31081 assign( src, getYMMReg( rE ) );
31082 imm = getUChar(delta+1);
31083 if (imm & ~15) break;
31084 delta += 1+1;
31085 DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
31086 } else {
31087 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31088 assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
31089 imm = getUChar(delta+alen);
31090 if (imm & ~15) break;
31091 delta += alen+1;
31092 DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
31095 /* (imm & 3) contains an Intel-encoded rounding mode. Because
31096 that encoding is the same as the encoding for IRRoundingMode,
31097 we can use that value directly in the IR as a rounding
31098 mode. */
31099 assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
31101 breakupV256to64s( src, &s3, &s2, &s1, &s0 );
31102 # define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
31103 unop(Iop_ReinterpI64asF64, mkexpr(s)))
31104 putYMMRegLane64F( rG, 3, CVT(s3) );
31105 putYMMRegLane64F( rG, 2, CVT(s2) );
31106 putYMMRegLane64F( rG, 1, CVT(s1) );
31107 putYMMRegLane64F( rG, 0, CVT(s0) );
31108 # undef CVT
31109 goto decode_success;
31111 break;
31113 case 0x0A:
31114 case 0x0B:
31115 /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
31116 /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
31117 /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
31118 /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
31119 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31120 UChar modrm = getUChar(delta);
31121 UInt rG = gregOfRexRM(pfx, modrm);
31122 UInt rV = getVexNvvvv(pfx);
31123 Bool isD = opc == 0x0B;
31124 IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
31125 IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
31126 Int imm = 0;
31128 if (epartIsReg(modrm)) {
31129 UInt rE = eregOfRexRM(pfx, modrm);
31130 assign( src,
31131 isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
31132 imm = getUChar(delta+1);
31133 if (imm & ~15) break;
31134 delta += 1+1;
31135 DIP( "vrounds%c $%d,%s,%s,%s\n",
31136 isD ? 'd' : 's',
31137 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
31138 } else {
31139 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31140 assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
31141 imm = getUChar(delta+alen);
31142 if (imm & ~15) break;
31143 delta += alen+1;
31144 DIP( "vrounds%c $%d,%s,%s,%s\n",
31145 isD ? 'd' : 's',
31146 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
31149 /* (imm & 3) contains an Intel-encoded rounding mode. Because
31150 that encoding is the same as the encoding for IRRoundingMode,
31151 we can use that value directly in the IR as a rounding
31152 mode. */
31153 assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
31154 (imm & 4) ? get_sse_roundingmode()
31155 : mkU32(imm & 3),
31156 mkexpr(src)) );
31158 if (isD)
31159 putXMMRegLane64F( rG, 0, mkexpr(res) );
31160 else {
31161 putXMMRegLane32F( rG, 0, mkexpr(res) );
31162 putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
31163 }
31164 putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
31165 putYMMRegLane128( rG, 1, mkV128(0) );
31166 *uses_vvvv = True;
31167 goto decode_success;
31169 break;
31171 case 0x0C:
31172 /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
31173 /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
31174 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31175 UChar modrm = getUChar(delta);
31176 UInt imm8;
31177 UInt rG = gregOfRexRM(pfx, modrm);
31178 UInt rV = getVexNvvvv(pfx);
31179 IRTemp sV = newTemp(Ity_V256);
31180 IRTemp sE = newTemp(Ity_V256);
31181 assign ( sV, getYMMReg(rV) );
31182 if (epartIsReg(modrm)) {
31183 UInt rE = eregOfRexRM(pfx, modrm);
31184 delta += 1;
31185 imm8 = getUChar(delta);
31186 DIP("vblendps $%u,%s,%s,%s\n",
31187 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31188 assign(sE, getYMMReg(rE));
31189 } else {
31190 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31191 delta += alen;
31192 imm8 = getUChar(delta);
31193 DIP("vblendps $%u,%s,%s,%s\n",
31194 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31195 assign(sE, loadLE(Ity_V256, mkexpr(addr)));
31197 delta++;
31198 putYMMReg( rG,
31199 mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
31200 *uses_vvvv = True;
31201 goto decode_success;
31203 /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
31204 /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
31205 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31206 UChar modrm = getUChar(delta);
31207 UInt imm8;
31208 UInt rG = gregOfRexRM(pfx, modrm);
31209 UInt rV = getVexNvvvv(pfx);
31210 IRTemp sV = newTemp(Ity_V128);
31211 IRTemp sE = newTemp(Ity_V128);
31212 assign ( sV, getXMMReg(rV) );
31213 if (epartIsReg(modrm)) {
31214 UInt rE = eregOfRexRM(pfx, modrm);
31215 delta += 1;
31216 imm8 = getUChar(delta);
31217 DIP("vblendps $%u,%s,%s,%s\n",
31218 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31219 assign(sE, getXMMReg(rE));
31220 } else {
31221 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31222 delta += alen;
31223 imm8 = getUChar(delta);
31224 DIP("vblendps $%u,%s,%s,%s\n",
31225 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31226 assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31228 delta++;
31229 putYMMRegLoAndZU( rG,
31230 mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
31231 *uses_vvvv = True;
31232 goto decode_success;
31234 break;
31236 case 0x0D:
31237 /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
31238 /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
31239 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31240 UChar modrm = getUChar(delta);
31241 UInt imm8;
31242 UInt rG = gregOfRexRM(pfx, modrm);
31243 UInt rV = getVexNvvvv(pfx);
31244 IRTemp sV = newTemp(Ity_V256);
31245 IRTemp sE = newTemp(Ity_V256);
31246 assign ( sV, getYMMReg(rV) );
31247 if (epartIsReg(modrm)) {
31248 UInt rE = eregOfRexRM(pfx, modrm);
31249 delta += 1;
31250 imm8 = getUChar(delta);
31251 DIP("vblendpd $%u,%s,%s,%s\n",
31252 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31253 assign(sE, getYMMReg(rE));
31254 } else {
31255 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31256 delta += alen;
31257 imm8 = getUChar(delta);
31258 DIP("vblendpd $%u,%s,%s,%s\n",
31259 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31260 assign(sE, loadLE(Ity_V256, mkexpr(addr)));
31262 delta++;
31263 putYMMReg( rG,
31264 mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
31265 *uses_vvvv = True;
31266 goto decode_success;
31268 /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
31269 /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
31270 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31271 UChar modrm = getUChar(delta);
31272 UInt imm8;
31273 UInt rG = gregOfRexRM(pfx, modrm);
31274 UInt rV = getVexNvvvv(pfx);
31275 IRTemp sV = newTemp(Ity_V128);
31276 IRTemp sE = newTemp(Ity_V128);
31277 assign ( sV, getXMMReg(rV) );
31278 if (epartIsReg(modrm)) {
31279 UInt rE = eregOfRexRM(pfx, modrm);
31280 delta += 1;
31281 imm8 = getUChar(delta);
31282 DIP("vblendpd $%u,%s,%s,%s\n",
31283 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31284 assign(sE, getXMMReg(rE));
31285 } else {
31286 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31287 delta += alen;
31288 imm8 = getUChar(delta);
31289 DIP("vblendpd $%u,%s,%s,%s\n",
31290 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31291 assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31293 delta++;
31294 putYMMRegLoAndZU( rG,
31295 mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
31296 *uses_vvvv = True;
31297 goto decode_success;
31299 break;
31301 case 0x0E:
31302 /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
31303 /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
31304 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31305 UChar modrm = getUChar(delta);
31306 UInt imm8;
31307 UInt rG = gregOfRexRM(pfx, modrm);
31308 UInt rV = getVexNvvvv(pfx);
31309 IRTemp sV = newTemp(Ity_V128);
31310 IRTemp sE = newTemp(Ity_V128);
31311 assign ( sV, getXMMReg(rV) );
31312 if (epartIsReg(modrm)) {
31313 UInt rE = eregOfRexRM(pfx, modrm);
31314 delta += 1;
31315 imm8 = getUChar(delta);
31316 DIP("vpblendw $%u,%s,%s,%s\n",
31317 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31318 assign(sE, getXMMReg(rE));
31319 } else {
31320 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31321 delta += alen;
31322 imm8 = getUChar(delta);
31323 DIP("vpblendw $%u,%s,%s,%s\n",
31324 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31325 assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31327 delta++;
31328 putYMMRegLoAndZU( rG,
31329 mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
31330 *uses_vvvv = True;
31331 goto decode_success;
31333 /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
31334 /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
31335 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31336 UChar modrm = getUChar(delta);
31337 UInt imm8;
31338 UInt rG = gregOfRexRM(pfx, modrm);
31339 UInt rV = getVexNvvvv(pfx);
31340 IRTemp sV = newTemp(Ity_V256);
31341 IRTemp sE = newTemp(Ity_V256);
31342 IRTemp sVhi, sVlo, sEhi, sElo;
31343 sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
31344 assign ( sV, getYMMReg(rV) );
31345 if (epartIsReg(modrm)) {
31346 UInt rE = eregOfRexRM(pfx, modrm);
31347 delta += 1;
31348 imm8 = getUChar(delta);
31349 DIP("vpblendw $%u,%s,%s,%s\n",
31350 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31351 assign(sE, getYMMReg(rE));
31352 } else {
31353 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31354 delta += alen;
31355 imm8 = getUChar(delta);
31356 DIP("vpblendw $%u,%s,%s,%s\n",
31357 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31358 assign(sE, loadLE(Ity_V256, mkexpr(addr)));
31360 delta++;
31361 breakupV256toV128s( sV, &sVhi, &sVlo );
31362 breakupV256toV128s( sE, &sEhi, &sElo );
31363 putYMMReg( rG, binop( Iop_V128HLtoV256,
31364 mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
31365 mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
31366 *uses_vvvv = True;
31367 goto decode_success;
31369 break;
31371 case 0x0F:
31372 /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
31373 /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
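/* Reader's note (illustrative): VPALIGNR concatenates the two 128-bit
   sources and extracts a byte-shifted 128-bit window at offset imm8; the
   256-bit form further below applies the same operation independently to
   each 128-bit lane. */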
31374 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31375 UChar modrm = getUChar(delta);
31376 UInt rG = gregOfRexRM(pfx, modrm);
31377 UInt rV = getVexNvvvv(pfx);
31378 IRTemp sV = newTemp(Ity_V128);
31379 IRTemp dV = newTemp(Ity_V128);
31380 UInt imm8;
31382 assign( dV, getXMMReg(rV) );
31384 if ( epartIsReg( modrm ) ) {
31385 UInt rE = eregOfRexRM(pfx, modrm);
31386 assign( sV, getXMMReg(rE) );
31387 imm8 = getUChar(delta+1);
31388 delta += 1+1;
31389 DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameXMMReg(rE),
31390 nameXMMReg(rV), nameXMMReg(rG));
31391 } else {
31392 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31393 assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
31394 imm8 = getUChar(delta+alen);
31395 delta += alen+1;
31396 DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
31397 nameXMMReg(rV), nameXMMReg(rG));
31400 IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
31401 putYMMRegLoAndZU( rG, mkexpr(res) );
31402 *uses_vvvv = True;
31403 goto decode_success;
31405 /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
31406 /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
31407 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31408 UChar modrm = getUChar(delta);
31409 UInt rG = gregOfRexRM(pfx, modrm);
31410 UInt rV = getVexNvvvv(pfx);
31411 IRTemp sV = newTemp(Ity_V256);
31412 IRTemp dV = newTemp(Ity_V256);
31413 IRTemp sHi, sLo, dHi, dLo;
31414 sHi = sLo = dHi = dLo = IRTemp_INVALID;
31415 UInt imm8;
31417 assign( dV, getYMMReg(rV) );
31419 if ( epartIsReg( modrm ) ) {
31420 UInt rE = eregOfRexRM(pfx, modrm);
31421 assign( sV, getYMMReg(rE) );
31422 imm8 = getUChar(delta+1);
31423 delta += 1+1;
31424 DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameYMMReg(rE),
31425 nameYMMReg(rV), nameYMMReg(rG));
31426 } else {
31427 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31428 assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
31429 imm8 = getUChar(delta+alen);
31430 delta += alen+1;
31431 DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
31432 nameYMMReg(rV), nameYMMReg(rG));
31435 breakupV256toV128s( dV, &dHi, &dLo );
31436 breakupV256toV128s( sV, &sHi, &sLo );
31437 putYMMReg( rG, binop( Iop_V128HLtoV256,
31438 mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
31439 mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
31440 );
31441 *uses_vvvv = True;
31442 goto decode_success;
31444 break;
31446 case 0x14:
31447 /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
31448 if (have66noF2noF3(pfx)
31449 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31450 delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
31451 goto decode_success;
31453 break;
31455 case 0x15:
31456 /* VPEXTRW imm8, reg/m16, xmm2 */
31457 /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
31458 if (have66noF2noF3(pfx)
31459 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31460 delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
31461 goto decode_success;
31463 break;
31465 case 0x16:
31466 /* VPEXTRD imm8, r32/m32, xmm2 */
31467 /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
31468 if (have66noF2noF3(pfx)
31469 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31470 delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
31471 goto decode_success;
31473 /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
31474 if (have66noF2noF3(pfx)
31475 && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
31476 delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
31477 goto decode_success;
31479 break;
31481 case 0x17:
31482 /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
31483 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31484 delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
31485 goto decode_success;
31487 break;
31489 case 0x18:
31490 /* VINSERTF128 r/m, rV, rD
31491 ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
31492 /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
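/* Reader's note (illustrative): the destination is a copy of rV with one
   128-bit lane, chosen by imm8 bit 0, replaced by the 128-bit source --
   hence the two lane copies followed by a single lane overwrite below. */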
31493 if (have66noF2noF3(pfx)
31494 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31495 UChar modrm = getUChar(delta);
31496 UInt ib = 0;
31497 UInt rG = gregOfRexRM(pfx, modrm);
31498 UInt rV = getVexNvvvv(pfx);
31499 IRTemp t128 = newTemp(Ity_V128);
31500 if (epartIsReg(modrm)) {
31501 UInt rE = eregOfRexRM(pfx, modrm);
31502 delta += 1;
31503 assign(t128, getXMMReg(rE));
31504 ib = getUChar(delta);
31505 DIP("vinsertf128 $%u,%s,%s,%s\n",
31506 ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31507 } else {
31508 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31509 assign(t128, loadLE(Ity_V128, mkexpr(addr)));
31510 delta += alen;
31511 ib = getUChar(delta);
31512 DIP("vinsertf128 $%u,%s,%s,%s\n",
31513 ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31515 delta++;
31516 putYMMRegLane128(rG, 0, getYMMRegLane128(rV, 0));
31517 putYMMRegLane128(rG, 1, getYMMRegLane128(rV, 1));
31518 putYMMRegLane128(rG, ib & 1, mkexpr(t128));
31519 *uses_vvvv = True;
31520 goto decode_success;
31522 break;
31524 case 0x19:
31525 /* VEXTRACTF128 $lane_no, rS, r/m
31526 ::: r/m:V128 = a lane of rS:V256 (RM format) */
31527 /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
31528 if (have66noF2noF3(pfx)
31529 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31530 UChar modrm = getUChar(delta);
31531 UInt ib = 0;
31532 UInt rS = gregOfRexRM(pfx, modrm);
31533 IRTemp t128 = newTemp(Ity_V128);
31534 if (epartIsReg(modrm)) {
31535 UInt rD = eregOfRexRM(pfx, modrm);
31536 delta += 1;
31537 ib = getUChar(delta);
31538 assign(t128, getYMMRegLane128(rS, ib & 1));
31539 putYMMRegLoAndZU(rD, mkexpr(t128));
31540 DIP("vextractf128 $%u,%s,%s\n",
31541 ib, nameXMMReg(rS), nameYMMReg(rD));
31542 } else {
31543 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31544 delta += alen;
31545 ib = getUChar(delta);
31546 assign(t128, getYMMRegLane128(rS, ib & 1));
31547 storeLE(mkexpr(addr), mkexpr(t128));
31548 DIP("vextractf128 $%u,%s,%s\n",
31549 ib, nameYMMReg(rS), dis_buf);
31551 delta++;
31552 /* doesn't use vvvv */
31553 goto decode_success;
31555 break;
31557 case 0x1D:
31558 /* VCVTPS2PH imm8, xmm2, xmm1/m64 = VEX.128.66.0F3A.W0 1D /r ib */
31559 if (have66noF2noF3(pfx)
31560 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/
31561 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
31562 delta = dis_VCVTPS2PH( vbi, pfx, delta, /*is256bit=*/False );
31563 goto decode_success;
31565 /* VCVTPS2PH imm8, ymm2, ymm1/m128 = VEX.256.66.0F3A.W0 1D /r ib */
31566 if (have66noF2noF3(pfx)
31567 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/
31568 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
31569 delta = dis_VCVTPS2PH( vbi, pfx, delta, /*is256bit=*/True );
31570 goto decode_success;
31572 break;
31574 case 0x20:
31575 /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
31576 if (have66noF2noF3(pfx)
31577 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31578 UChar modrm = getUChar(delta);
31579 UInt rG = gregOfRexRM(pfx, modrm);
31580 UInt rV = getVexNvvvv(pfx);
31581 Int imm8;
31582 IRTemp src_u8 = newTemp(Ity_I8);
31584 if ( epartIsReg( modrm ) ) {
31585 UInt rE = eregOfRexRM(pfx,modrm);
31586 imm8 = (Int)(getUChar(delta+1) & 15);
31587 assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
31588 delta += 1+1;
31589 DIP( "vpinsrb $%d,%s,%s,%s\n",
31590 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
31591 } else {
31592 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31593 imm8 = (Int)(getUChar(delta+alen) & 15);
31594 assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
31595 delta += alen+1;
31596 DIP( "vpinsrb $%d,%s,%s,%s\n",
31597 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31600 IRTemp src_vec = newTemp(Ity_V128);
31601 assign(src_vec, getXMMReg( rV ));
31602 IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
31603 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31604 *uses_vvvv = True;
31605 goto decode_success;
31607 break;
31609 case 0x21:
31610 /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
31611 = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
31612 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31613 UChar modrm = getUChar(delta);
31614 UInt rG = gregOfRexRM(pfx, modrm);
31615 UInt rV = getVexNvvvv(pfx);
31616 UInt imm8;
31617 IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
31618 const IRTemp inval = IRTemp_INVALID;
31620 if ( epartIsReg( modrm ) ) {
31621 UInt rE = eregOfRexRM(pfx, modrm);
31622 IRTemp vE = newTemp(Ity_V128);
31623 assign( vE, getXMMReg(rE) );
31624 IRTemp dsE[4] = { inval, inval, inval, inval };
31625 breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
31626 imm8 = getUChar(delta+1);
31627 d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
31628 delta += 1+1;
31629 DIP( "insertps $%u, %s,%s\n",
31630 imm8, nameXMMReg(rE), nameXMMReg(rG) );
31631 } else {
31632 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31633 assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
31634 imm8 = getUChar(delta+alen);
31635 delta += alen+1;
31636 DIP( "insertps $%u, %s,%s\n",
31637 imm8, dis_buf, nameXMMReg(rG) );
31640 IRTemp vV = newTemp(Ity_V128);
31641 assign( vV, getXMMReg(rV) );
31643 putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
31644 *uses_vvvv = True;
31645 goto decode_success;
31647 break;
31649 case 0x22:
31650 /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
31651 if (have66noF2noF3(pfx)
31652 && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31653 UChar modrm = getUChar(delta);
31654 UInt rG = gregOfRexRM(pfx, modrm);
31655 UInt rV = getVexNvvvv(pfx);
31656 Int imm8_10;
31657 IRTemp src_u32 = newTemp(Ity_I32);
31659 if ( epartIsReg( modrm ) ) {
31660 UInt rE = eregOfRexRM(pfx,modrm);
31661 imm8_10 = (Int)(getUChar(delta+1) & 3);
31662 assign( src_u32, getIReg32( rE ) );
31663 delta += 1+1;
31664 DIP( "vpinsrd $%d,%s,%s,%s\n",
31665 imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
31666 } else {
31667 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31668 imm8_10 = (Int)(getUChar(delta+alen) & 3);
31669 assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
31670 delta += alen+1;
31671 DIP( "vpinsrd $%d,%s,%s,%s\n",
31672 imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31675 IRTemp src_vec = newTemp(Ity_V128);
31676 assign(src_vec, getXMMReg( rV ));
31677 IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
31678 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31679 *uses_vvvv = True;
31680 goto decode_success;
31682 /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
31683 if (have66noF2noF3(pfx)
31684 && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
31685 UChar modrm = getUChar(delta);
31686 UInt rG = gregOfRexRM(pfx, modrm);
31687 UInt rV = getVexNvvvv(pfx);
31688 Int imm8_0;
31689 IRTemp src_u64 = newTemp(Ity_I64);
31691 if ( epartIsReg( modrm ) ) {
31692 UInt rE = eregOfRexRM(pfx,modrm);
31693 imm8_0 = (Int)(getUChar(delta+1) & 1);
31694 assign( src_u64, getIReg64( rE ) );
31695 delta += 1+1;
31696 DIP( "vpinsrq $%d,%s,%s,%s\n",
31697 imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
31698 } else {
31699 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31700 imm8_0 = (Int)(getUChar(delta+alen) & 1);
31701 assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
31702 delta += alen+1;
31703 DIP( "vpinsrq $%d,%s,%s,%s\n",
31704 imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31707 IRTemp src_vec = newTemp(Ity_V128);
31708 assign(src_vec, getXMMReg( rV ));
31709 IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
31710 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31711 *uses_vvvv = True;
31712 goto decode_success;
31714 break;
31716 case 0x38:
31717 /* VINSERTI128 r/m, rV, rD
31718 ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
31719 /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
31720 if (have66noF2noF3(pfx)
31721 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31722 UChar modrm = getUChar(delta);
31723 UInt ib = 0;
31724 UInt rG = gregOfRexRM(pfx, modrm);
31725 UInt rV = getVexNvvvv(pfx);
31726 IRTemp t128 = newTemp(Ity_V128);
31727 if (epartIsReg(modrm)) {
31728 UInt rE = eregOfRexRM(pfx, modrm);
31729 delta += 1;
31730 assign(t128, getXMMReg(rE));
31731 ib = getUChar(delta);
31732 DIP("vinserti128 $%u,%s,%s,%s\n",
31733 ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31734 } else {
31735 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31736 assign(t128, loadLE(Ity_V128, mkexpr(addr)));
31737 delta += alen;
31738 ib = getUChar(delta);
31739 DIP("vinserti128 $%u,%s,%s,%s\n",
31740 ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31742 delta++;
31743 putYMMRegLane128(rG, 0, getYMMRegLane128(rV, 0));
31744 putYMMRegLane128(rG, 1, getYMMRegLane128(rV, 1));
31745 putYMMRegLane128(rG, ib & 1, mkexpr(t128));
31746 *uses_vvvv = True;
31747 goto decode_success;
31749 break;
31751 case 0x39:
31752 /* VEXTRACTI128 $lane_no, rS, r/m
31753 ::: r/m:V128 = a lane of rS:V256 (RM format) */
31754 /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
31755 if (have66noF2noF3(pfx)
31756 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31757 UChar modrm = getUChar(delta);
31758 UInt ib = 0;
31759 UInt rS = gregOfRexRM(pfx, modrm);
31760 IRTemp t128 = newTemp(Ity_V128);
31761 if (epartIsReg(modrm)) {
31762 UInt rD = eregOfRexRM(pfx, modrm);
31763 delta += 1;
31764 ib = getUChar(delta);
31765 assign(t128, getYMMRegLane128(rS, ib & 1));
31766 putYMMRegLoAndZU(rD, mkexpr(t128));
31767 DIP("vextracti128 $%u,%s,%s\n",
31768 ib, nameYMMReg(rS), nameXMMReg(rD));
31769 } else {
31770 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31771 delta += alen;
31772 ib = getUChar(delta);
31773 assign(t128, getYMMRegLane128(rS, ib & 1));
31774 storeLE(mkexpr(addr), mkexpr(t128));
31775 DIP("vextracti128 $%u,%s,%s\n",
31776 ib, nameYMMReg(rS), dis_buf);
31778 delta++;
31779 /* doesn't use vvvv */
31780 goto decode_success;
31782 break;
31784 case 0x40:
31785 /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
31786 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31787 UChar modrm = getUChar(delta);
31788 UInt rG = gregOfRexRM(pfx, modrm);
31789 UInt rV = getVexNvvvv(pfx);
31790 IRTemp dst_vec = newTemp(Ity_V128);
31791 Int imm8;
31792 if (epartIsReg( modrm )) {
31793 UInt rE = eregOfRexRM(pfx,modrm);
31794 imm8 = (Int)getUChar(delta+1);
31795 assign( dst_vec, getXMMReg( rE ) );
31796 delta += 1+1;
31797 DIP( "vdpps $%d,%s,%s,%s\n",
31798 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31799 } else {
31800 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31801 imm8 = (Int)getUChar(delta+alen);
31802 assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31803 delta += alen+1;
31804 DIP( "vdpps $%d,%s,%s,%s\n",
31805 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31808 IRTemp src_vec = newTemp(Ity_V128);
31809 assign(src_vec, getXMMReg( rV ));
31810 IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
31811 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31812 *uses_vvvv = True;
31813 goto decode_success;
31815 /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
31816 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31817 UChar modrm = getUChar(delta);
31818 UInt rG = gregOfRexRM(pfx, modrm);
31819 UInt rV = getVexNvvvv(pfx);
31820 IRTemp dst_vec = newTemp(Ity_V256);
31821 Int imm8;
31822 if (epartIsReg( modrm )) {
31823 UInt rE = eregOfRexRM(pfx,modrm);
31824 imm8 = (Int)getUChar(delta+1);
31825 assign( dst_vec, getYMMReg( rE ) );
31826 delta += 1+1;
31827 DIP( "vdpps $%d,%s,%s,%s\n",
31828 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
31829 } else {
31830 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31831 imm8 = (Int)getUChar(delta+alen);
31832 assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
31833 delta += alen+1;
31834 DIP( "vdpps $%d,%s,%s,%s\n",
31835 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
31838 IRTemp src_vec = newTemp(Ity_V256);
31839 assign(src_vec, getYMMReg( rV ));
31840 IRTemp s0, s1, d0, d1;
31841 s0 = s1 = d0 = d1 = IRTemp_INVALID;
31842 breakupV256toV128s( dst_vec, &d1, &d0 );
31843 breakupV256toV128s( src_vec, &s1, &s0 );
31844 putYMMReg( rG, binop( Iop_V128HLtoV256,
31845 mkexpr( math_DPPS_128(s1, d1, imm8) ),
31846 mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
31847 *uses_vvvv = True;
31848 goto decode_success;
31850 break;
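      /* For reference: a sketch of the per-lane step math_DPPS_128 is
         assumed to perform, per the SSE4.1 DPPS definition, for 32-bit
         float lanes s[] and d[]:
            prod[i] = (imm8 & (0x10 << i)) ? s[i] * d[i] : 0.0f;  // imm8[7:4]
            sum     = prod[0] + prod[1] + prod[2] + prod[3];
            res[i]  = (imm8 & (1 << i)) ? sum : 0.0f;             // imm8[3:0]
         The 256-bit form above just applies this independently to the low
         and high 128-bit halves with the same imm8. */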
31852 case 0x41:
31853 /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
31854 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31855 UChar modrm = getUChar(delta);
31856 UInt rG = gregOfRexRM(pfx, modrm);
31857 UInt rV = getVexNvvvv(pfx);
31858 IRTemp dst_vec = newTemp(Ity_V128);
31859 Int imm8;
31860 if (epartIsReg( modrm )) {
31861 UInt rE = eregOfRexRM(pfx,modrm);
31862 imm8 = (Int)getUChar(delta+1);
31863 assign( dst_vec, getXMMReg( rE ) );
31864 delta += 1+1;
31865 DIP( "vdppd $%d,%s,%s,%s\n",
31866 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31867 } else {
31868 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31869 imm8 = (Int)getUChar(delta+alen);
31870 assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31871 delta += alen+1;
31872 DIP( "vdppd $%d,%s,%s,%s\n",
31873 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31876 IRTemp src_vec = newTemp(Ity_V128);
31877 assign(src_vec, getXMMReg( rV ));
31878 IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
31879 putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31880 *uses_vvvv = True;
31881 goto decode_success;
31883 break;
31885 case 0x42:
31886 /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
31887 /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
31888 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31889 UChar modrm = getUChar(delta);
31890 Int imm8;
31891 IRTemp src_vec = newTemp(Ity_V128);
31892 IRTemp dst_vec = newTemp(Ity_V128);
31893 UInt rG = gregOfRexRM(pfx, modrm);
31894 UInt rV = getVexNvvvv(pfx);
31896 assign( dst_vec, getXMMReg(rV) );
31898 if ( epartIsReg( modrm ) ) {
31899 UInt rE = eregOfRexRM(pfx, modrm);
31901 imm8 = (Int)getUChar(delta+1);
31902 assign( src_vec, getXMMReg(rE) );
31903 delta += 1+1;
31904 DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31905 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31906 } else {
31907 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31908 1/* imm8 is 1 byte after the amode */ );
31909 assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31910 imm8 = (Int)getUChar(delta+alen);
31911 delta += alen+1;
31912 DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31913 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31916 putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
31917 src_vec, imm8) ) );
31918 *uses_vvvv = True;
31919 goto decode_success;
31921 /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
31922 /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
31923 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31924 UChar modrm = getUChar(delta);
31925 Int imm8;
31926 IRTemp src_vec = newTemp(Ity_V256);
31927 IRTemp dst_vec = newTemp(Ity_V256);
31928 UInt rG = gregOfRexRM(pfx, modrm);
31929 UInt rV = getVexNvvvv(pfx);
31930 IRTemp sHi, sLo, dHi, dLo;
31931 sHi = sLo = dHi = dLo = IRTemp_INVALID;
31933 assign( dst_vec, getYMMReg(rV) );
31935 if ( epartIsReg( modrm ) ) {
31936 UInt rE = eregOfRexRM(pfx, modrm);
31938 imm8 = (Int)getUChar(delta+1);
31939 assign( src_vec, getYMMReg(rE) );
31940 delta += 1+1;
31941 DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31942 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
31943 } else {
31944 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31945 1/* imm8 is 1 byte after the amode */ );
31946 assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
31947 imm8 = (Int)getUChar(delta+alen);
31948 delta += alen+1;
31949 DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31950 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
31953 breakupV256toV128s( dst_vec, &dHi, &dLo );
31954 breakupV256toV128s( src_vec, &sHi, &sLo );
31955 putYMMReg( rG, binop( Iop_V128HLtoV256,
31956 mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
31957 mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
31958 *uses_vvvv = True;
31959 goto decode_success;
31961 break;
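      /* For reference: why the high half is handed imm8 >> 3.  The 256-bit
         MPSADBW uses imm8[2:0] for the low 128-bit lane and imm8[5:3] for
         the high lane.  Within one lane, math_MPSADBW_128(d, s, imm8) is
         assumed to implement the SSE4.1 definition:
            blk = s.bytes[(imm8 & 3)*4 .. (imm8 & 3)*4 + 3];  // fixed block of s
            off = (imm8 & 4) ? 4 : 0;                         // window start in d
            for (j = 0; j < 8; j++)
               res.word[j] =   |d.byte[off+j+0] - blk[0]|
                             + |d.byte[off+j+1] - blk[1]|
                             + |d.byte[off+j+2] - blk[2]|
                             + |d.byte[off+j+3] - blk[3]|;
      */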
31963 case 0x44:
31964 /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
31965 /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
31966 /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
31967 * Carry-less multiplication of selected XMM quadwords into XMM
31968 * registers (a.k.a. multiplication of polynomials over GF(2))
31969 */
31970 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31971 UChar modrm = getUChar(delta);
31972 Int imm8;
31973 IRTemp sV = newTemp(Ity_V128);
31974 IRTemp dV = newTemp(Ity_V128);
31975 UInt rG = gregOfRexRM(pfx, modrm);
31976 UInt rV = getVexNvvvv(pfx);
31978 assign( dV, getXMMReg(rV) );
31980 if ( epartIsReg( modrm ) ) {
31981 UInt rE = eregOfRexRM(pfx, modrm);
31982 imm8 = (Int)getUChar(delta+1);
31983 assign( sV, getXMMReg(rE) );
31984 delta += 1+1;
31985 DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
31986 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31987 } else {
31988 addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31989 1/* imm8 is 1 byte after the amode */ );
31990 assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
31991 imm8 = (Int)getUChar(delta+alen);
31992 delta += alen+1;
31993 DIP( "vpclmulqdq $%d, %s,%s,%s\n",
31994 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31997 putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
31998 *uses_vvvv = True;
31999 goto decode_success;
32001 break;
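      /* For reference: a sketch of the carry-less multiply math_PCLMULQDQ
         is assumed to perform.  imm8 bit 0 picks the low/high quadword of
         the V operand, imm8 bit 4 picks the quadword of the E operand, and
         the two 64-bit values a, b are multiplied over GF(2):
            r = 0;                                      // 128-bit accumulator
            for (i = 0; i < 64; i++)
               if ((a >> i) & 1) r ^= (uint128)b << i;  // XOR instead of add
      */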
32003 case 0x46:
32004 /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 46 /r ib */
32005 if (have66noF2noF3(pfx)
32006 && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
32007 UChar modrm = getUChar(delta);
32008 UInt imm8 = 0;
32009 UInt rG = gregOfRexRM(pfx, modrm);
32010 UInt rV = getVexNvvvv(pfx);
32011 IRTemp s00 = newTemp(Ity_V128);
32012 IRTemp s01 = newTemp(Ity_V128);
32013 IRTemp s10 = newTemp(Ity_V128);
32014 IRTemp s11 = newTemp(Ity_V128);
32015 assign(s00, getYMMRegLane128(rV, 0));
32016 assign(s01, getYMMRegLane128(rV, 1));
32017 if (epartIsReg(modrm)) {
32018 UInt rE = eregOfRexRM(pfx, modrm);
32019 delta += 1;
32020 imm8 = getUChar(delta);
32021 DIP("vperm2i128 $%u,%s,%s,%s\n",
32022 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
32023 assign(s10, getYMMRegLane128(rE, 0));
32024 assign(s11, getYMMRegLane128(rE, 1));
32025 } else {
32026 addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
32027 delta += alen;
32028 imm8 = getUChar(delta);
32029 DIP("vperm2i128 $%u,%s,%s,%s\n",
32030 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
32031 assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
32032 mkexpr(addr), mkU64(0))));
32033 assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
32034 mkexpr(addr), mkU64(16))));
32036 delta++;
32037 # define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
32038 : ((_nn)==2) ? s10 : s11)
32039 putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
32040 putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
32041 # undef SEL
32042 if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
32043 if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
32044 *uses_vvvv = True;
32045 goto decode_success;
32047 break;
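      /* For reference, the selection implemented above, written out: with
         S = { V.lane0, V.lane1, E.lane0, E.lane1 },
            dst.lane0 = S[imm8[1:0]], forced to zero if imm8 bit 3 is set;
            dst.lane1 = S[imm8[5:4]], forced to zero if imm8 bit 7 is set;
         which is why the zeroing writes come after the SEL() stores. */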
32049 case 0x4A:
32050 /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
32051 ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
32052 /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
32053 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32054 delta = dis_VBLENDV_128 ( vbi, pfx, delta,
32055 "vblendvps", 4, Iop_SarN32x4 );
32056 *uses_vvvv = True;
32057 goto decode_success;
32059 /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
32060 ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
32061 /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
32062 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
32063 delta = dis_VBLENDV_256 ( vbi, pfx, delta,
32064 "vblendvps", 4, Iop_SarN32x4 );
32065 *uses_vvvv = True;
32066 goto decode_success;
32068 break;
32070 case 0x4B:
32071 /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
32072 ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
32073 /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
32074 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32075 delta = dis_VBLENDV_128 ( vbi, pfx, delta,
32076 "vblendvpd", 8, Iop_SarN64x2 );
32077 *uses_vvvv = True;
32078 goto decode_success;
32080 /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
32081 ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
32082 /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
32083 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
32084 delta = dis_VBLENDV_256 ( vbi, pfx, delta,
32085 "vblendvpd", 8, Iop_SarN64x2 );
32086 *uses_vvvv = True;
32087 goto decode_success;
32089 break;
32091 case 0x4C:
32092 /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
32093 ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
32094 /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
32095 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32096 delta = dis_VBLENDV_128 ( vbi, pfx, delta,
32097 "vpblendvb", 1, Iop_SarN8x16 );
32098 *uses_vvvv = True;
32099 goto decode_success;
32101 /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
32102 ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
32103 /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
32104 if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
32105 delta = dis_VBLENDV_256 ( vbi, pfx, delta,
32106 "vpblendvb", 1, Iop_SarN8x16 );
32107 *uses_vvvv = True;
32108 goto decode_success;
32110 break;
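      /* For reference: a sketch of the blend dis_VBLENDV_{128,256} are
         assumed to perform, per the AVX definition.  For each element of
         the given width, the top bit of the corresponding element of the
         is4-selected register chooses E (bit set) or V (bit clear); the
         SarNxx op named above smears that top bit across the element to
         build the mask:
            mask = is4_reg >>s (element_bits - 1);   // arithmetic shift
            res  = (E & mask) | (V & ~mask);
      */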
32112 case 0x60:
32113 case 0x61:
32114 case 0x62:
32115 case 0x63:
32116 /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
32117 VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
32118 VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
32119 VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
32120 (selected special cases that actually occur in glibc,
32121 not by any means a complete implementation.)
32122 */
32123 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32124 Long delta0 = delta;
32125 delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
32126 if (delta > delta0) goto decode_success;
32127 /* else fall through; dis_PCMPxSTRx failed to decode it */
32129 break;
32131 case 0x5C ... 0x5F:
32132 case 0x68 ... 0x6F:
32133 case 0x78 ... 0x7F:
32134 /* AMD FMA4: VFMADDSUB/VFMSUBADD (5C..5F), VFMADD/VFMSUB (68..6F), VFNMADD/VFNMSUB (78..7F), in PS/PD/SS/SD variants; all handed to dis_FMA4 */
32135 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32136 Long delta0 = delta;
32137 delta = dis_FMA4( pfx, delta, opc, uses_vvvv, vbi );
32138 if (delta > delta0) {
32139 dres->hint = Dis_HintVerbose;
32140 goto decode_success;
32142 /* else fall through; dis_FMA4 failed to decode it */
32144 break;
32146 case 0xDF:
32147 /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
32148 if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
32149 delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
32150 goto decode_success;
32152 break;
32154 case 0xF0:
32155 /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
32156 /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */
32157 if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
32158 Int size = getRexW(pfx) ? 8 : 4;
32159 IRType ty = szToITy(size);
32160 IRTemp src = newTemp(ty);
32161 UChar rm = getUChar(delta);
32162 UChar imm8;
32164 if (epartIsReg(rm)) {
32165 imm8 = getUChar(delta+1);
32166 assign( src, getIRegE(size,pfx,rm) );
32167 DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
32168 nameIRegG(size,pfx,rm));
32169 delta += 2;
32170 } else {
32171 addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
32172 imm8 = getUChar(delta+alen);
32173 assign( src, loadLE(ty, mkexpr(addr)) );
32174 DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
32175 delta += alen + 1;
32177 imm8 &= 8*size-1;
32179 /* dst = (src >>u imm8) | (src << (size-imm8)) */
32180 putIRegG( size, pfx, rm,
32181 imm8 == 0 ? mkexpr(src)
32182 : binop( mkSizedOp(ty,Iop_Or8),
32183 binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
32184 mkU8(imm8) ),
32185 binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
32186 mkU8(8*size-imm8) ) ) );
32187 /* Flags aren't modified. */
32188 goto decode_success;
32190 break;
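      /* For reference: the rotate computed above, as plain C (a sketch;
         rorx64 is an illustrative name, not a decoder function):
            uint64_t rorx64 ( uint64_t src, unsigned imm8 ) {
               imm8 &= 63;
               return imm8 == 0 ? src
                                : (src >> imm8) | (src << (64 - imm8));
            }
         RORX takes its count from the immediate only and, unlike ROR,
         neither reads nor writes any flags. */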
32192 default:
32193 break;
32197 //decode_failure:
32198 return deltaIN;
32200 decode_success:
32201 return delta;
32205 /*------------------------------------------------------------*/
32206 /*--- ---*/
32207 /*--- Disassemble a single instruction ---*/
32208 /*--- ---*/
32209 /*------------------------------------------------------------*/
32211 /* Disassemble a single instruction into IR. The instruction is
32212 located in host memory at &guest_code[delta]. */
32214 static
32215 DisResult disInstr_AMD64_WRK (
32216 /*OUT*/Bool* expect_CAS,
32217 Long delta64,
32218 const VexArchInfo* archinfo,
32219 const VexAbiInfo* vbi,
32220 Bool sigill_diag
32221 )
32222 {
32223 IRTemp t1, t2;
32224 UChar pre;
32225 Int n, n_prefixes;
32226 DisResult dres;
32228 /* The running delta */
32229 Long delta = delta64;
32231 /* Holds eip at the start of the insn, so that we can print
32232 consistent error messages for unimplemented insns. */
32233 Long delta_start = delta;
32235 /* sz denotes the nominal data-op size of the insn; we change it to
32236 2 if an 0x66 prefix is seen and 8 if REX.W is 1. In case of
32237 conflict REX.W takes precedence. */
32238 Int sz = 4;
32240 /* pfx holds the summary of prefixes. */
32241 Prefix pfx = PFX_EMPTY;
32243 /* Holds the computed opcode-escape indication. */
32244 Escape esc = ESC_NONE;
32246 /* Set result defaults. */
32247 dres.whatNext = Dis_Continue;
32248 dres.len = 0;
32249 dres.jk_StopHere = Ijk_INVALID;
32250 dres.hint = Dis_HintNone;
32251 *expect_CAS = False;
32253 vassert(guest_RIP_next_assumed == 0);
32254 vassert(guest_RIP_next_mustcheck == False);
32256 t1 = t2 = IRTemp_INVALID;
32258 DIP("\t0x%llx: ", guest_RIP_bbstart+delta);
32260 /* Spot "Special" instructions (see comment at top of file). */
32262 const UChar* code = guest_code + delta;
32263 /* Spot the 16-byte preamble:
32264 48C1C703 rolq $3, %rdi
32265 48C1C70D rolq $13, %rdi
32266 48C1C73D rolq $61, %rdi
32267 48C1C733 rolq $51, %rdi
32268 */
32269 if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
32270 && code[ 3] == 0x03 &&
32271 code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
32272 && code[ 7] == 0x0D &&
32273 code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
32274 && code[11] == 0x3D &&
32275 code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
32276 && code[15] == 0x33) {
32277 /* Got a "Special" instruction preamble. Which one is it? */
32278 if (code[16] == 0x48 && code[17] == 0x87
32279 && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
32280 /* %RDX = client_request ( %RAX ) */
32281 DIP("%%rdx = client_request ( %%rax )\n");
32282 delta += 19;
32283 jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
32284 vassert(dres.whatNext == Dis_StopHere);
32285 goto decode_success;
32287 else
32288 if (code[16] == 0x48 && code[17] == 0x87
32289 && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
32290 /* %RAX = guest_NRADDR */
32291 DIP("%%rax = guest_NRADDR\n");
32292 delta += 19;
32293 putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
32294 goto decode_success;
32296 else
32297 if (code[16] == 0x48 && code[17] == 0x87
32298 && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
32299 /* call-noredir *%RAX */
32300 DIP("call-noredir *%%rax\n");
32301 delta += 19;
32302 t1 = newTemp(Ity_I64);
32303 assign(t1, getIRegRAX(8));
32304 t2 = newTemp(Ity_I64);
32305 assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
32306 putIReg64(R_RSP, mkexpr(t2));
32307 storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
32308 jmp_treg(&dres, Ijk_NoRedir, t1);
32309 vassert(dres.whatNext == Dis_StopHere);
32310 goto decode_success;
32312 else
32313 if (code[16] == 0x48 && code[17] == 0x87
32314 && code[18] == 0xff /* xchgq %rdi,%rdi */) {
32315 /* IR injection */
32316 DIP("IR injection\n");
32317 vex_inject_ir(irsb, Iend_LE);
32319 // Invalidate the current insn. The reason is that the IRop we're
32320 // injecting here can change. In which case the translation has to
32321 // be redone. For ease of handling, we simply invalidate all the
32322 // time.
32323 stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
32324 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(19)));
32326 delta += 19;
32328 stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
32329 dres.whatNext = Dis_StopHere;
32330 dres.jk_StopHere = Ijk_InvalICache;
32331 goto decode_success;
32333 /* We don't know what it is. */
32334 goto decode_failure;
32335 /*NOTREACHED*/
32339 /* Eat prefixes, summarising the result in pfx and sz, and rejecting
32340 as many invalid combinations as possible. */
32341 n_prefixes = 0;
32342 while (True) {
32343 if (n_prefixes > 7) goto decode_failure;
32344 pre = getUChar(delta);
32345 switch (pre) {
32346 case 0x66: pfx |= PFX_66; break;
32347 case 0x67: pfx |= PFX_ASO; break;
32348 case 0xF2: pfx |= PFX_F2; break;
32349 case 0xF3: pfx |= PFX_F3; break;
32350 case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
32351 case 0x2E: pfx |= PFX_CS; break;
32352 case 0x3E: pfx |= PFX_DS; break;
32353 case 0x26: pfx |= PFX_ES; break;
32354 case 0x64: pfx |= PFX_FS; break;
32355 case 0x65: pfx |= PFX_GS; break;
32356 case 0x36: pfx |= PFX_SS; break;
32357 case 0x40 ... 0x4F:
32358 pfx |= PFX_REX;
32359 if (pre & (1<<3)) pfx |= PFX_REXW;
32360 if (pre & (1<<2)) pfx |= PFX_REXR;
32361 if (pre & (1<<1)) pfx |= PFX_REXX;
32362 if (pre & (1<<0)) pfx |= PFX_REXB;
32363 break;
32364 default:
32365 goto not_a_legacy_prefix;
32367 n_prefixes++;
32368 delta++;
32371 not_a_legacy_prefix:
32372 /* We've used up all the non-VEX prefixes. Parse and validate a
32373 VEX prefix if that's appropriate. */
32374 if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
32375 /* Used temporarily for holding VEX prefixes. */
32376 UChar vex0 = getUChar(delta);
32377 if (vex0 == 0xC4) {
32378 /* 3-byte VEX */
32379 UChar vex1 = getUChar(delta+1);
32380 UChar vex2 = getUChar(delta+2);
32381 delta += 3;
32382 pfx |= PFX_VEX;
32383 /* Snarf contents of byte 1 */
32384 /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
32385 /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
32386 /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
32387 /* m-mmmm */
32388 switch (vex1 & 0x1F) {
32389 case 1: esc = ESC_0F; break;
32390 case 2: esc = ESC_0F38; break;
32391 case 3: esc = ESC_0F3A; break;
32392 /* Any other m-mmmm field will #UD */
32393 default: goto decode_failure;
32395 /* Snarf contents of byte 2 */
32396 /* W */ pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
32397 /* ~v3 */ pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
32398 /* ~v2 */ pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
32399 /* ~v1 */ pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
32400 /* ~v0 */ pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
32401 /* L */ pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
32402 /* pp */
32403 switch (vex2 & 3) {
32404 case 0: break;
32405 case 1: pfx |= PFX_66; break;
32406 case 2: pfx |= PFX_F3; break;
32407 case 3: pfx |= PFX_F2; break;
32408 default: vassert(0);
32411 else if (vex0 == 0xC5) {
32412 /* 2-byte VEX */
32413 UChar vex1 = getUChar(delta+1);
32414 delta += 2;
32415 pfx |= PFX_VEX;
32416 /* Snarf contents of byte 1 */
32417 /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
32418 /* ~v3 */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
32419 /* ~v2 */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
32420 /* ~v1 */ pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
32421 /* ~v0 */ pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
32422 /* L */ pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
32423 /* pp */
32424 switch (vex1 & 3) {
32425 case 0: break;
32426 case 1: pfx |= PFX_66; break;
32427 case 2: pfx |= PFX_F3; break;
32428 case 3: pfx |= PFX_F2; break;
32429 default: vassert(0);
32431 /* implied: */
32432 esc = ESC_0F;
32434 /* Can't have both VEX and REX */
32435 if ((pfx & PFX_VEX) && (pfx & PFX_REX))
32436 goto decode_failure; /* can't have both */
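   /* For reference, the VEX byte layouts unpacked above:
         3-byte VEX:  C4  [R' X' B' m-mmmm]  [W vvvv' L pp]
         2-byte VEX:  C5  [R' vvvv' L pp]            (0F escape implied)
      R/X/B and vvvv are stored inverted, hence the "? 0 : PFX_..." tests;
      m-mmmm selects the 0F / 0F38 / 0F3A escape; pp encodes an implied
      none / 66 / F3 / F2 prefix. */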
32439 /* Dump invalid combinations */
32440 n = 0;
32441 if (pfx & PFX_F2) n++;
32442 if (pfx & PFX_F3) n++;
32443 if (n > 1)
32444 goto decode_failure; /* can't have both */
32446 n = 0;
32447 if (pfx & PFX_CS) n++;
32448 if (pfx & PFX_DS) n++;
32449 if (pfx & PFX_ES) n++;
32450 if (pfx & PFX_FS) n++;
32451 if (pfx & PFX_GS) n++;
32452 if (pfx & PFX_SS) n++;
32453 if (n > 1)
32454 goto decode_failure; /* multiple seg overrides == illegal */
32456 /* We have a %fs prefix. Reject it if there's no evidence in 'vbi'
32457 that we should accept it. */
32458 if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_const)
32459 goto decode_failure;
32461 /* Ditto for %gs prefixes. */
32462 if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_const)
32463 goto decode_failure;
32465 /* Set up sz. */
32466 sz = 4;
32467 if (pfx & PFX_66) sz = 2;
32468 if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
32470 /* Now we should be looking at the primary opcode byte or the
32471 leading escapes. Check that any LOCK prefix is actually
32472 allowed. */
32473 if (haveLOCK(pfx)) {
32474 if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
32475 DIP("lock ");
32476 } else {
32477 *expect_CAS = False;
32478 goto decode_failure;
32482 /* Eat up opcode escape bytes, until we're really looking at the
32483 primary opcode byte. But only if there's no VEX present. */
32484 if (!(pfx & PFX_VEX)) {
32485 vassert(esc == ESC_NONE);
32486 pre = getUChar(delta);
32487 if (pre == 0x0F) {
32488 delta++;
32489 pre = getUChar(delta);
32490 switch (pre) {
32491 case 0x38: esc = ESC_0F38; delta++; break;
32492 case 0x3A: esc = ESC_0F3A; delta++; break;
32493 default: esc = ESC_0F; break;
32498 /* So now we're really really looking at the primary opcode
32499 byte. */
32500 Long delta_at_primary_opcode = delta;
32502 if (!(pfx & PFX_VEX)) {
32503 /* Handle non-VEX prefixed instructions. "Legacy" (non-VEX) SSE
32504 instructions preserve the upper 128 bits of YMM registers;
32505 iow we can simply ignore the presence of the upper halves of
32506 these registers. */
32507 switch (esc) {
32508 case ESC_NONE:
32509 delta = dis_ESC_NONE( &dres, expect_CAS,
32510 archinfo, vbi, pfx, sz, delta );
32511 break;
32512 case ESC_0F:
32513 delta = dis_ESC_0F ( &dres, expect_CAS,
32514 archinfo, vbi, pfx, sz, delta );
32515 break;
32516 case ESC_0F38:
32517 delta = dis_ESC_0F38( &dres,
32518 archinfo, vbi, pfx, sz, delta );
32519 break;
32520 case ESC_0F3A:
32521 delta = dis_ESC_0F3A( &dres,
32522 archinfo, vbi, pfx, sz, delta );
32523 break;
32524 default:
32525 vassert(0);
32527 } else {
32528 /* VEX prefixed instruction */
32529 /* Sloppy Intel wording: "An instruction encoded with a VEX.128
32530 prefix that loads a YMM register operand ..." zeroes out bits
32531 128 and above of the register. */
32532 Bool uses_vvvv = False;
32533 switch (esc) {
32534 case ESC_0F:
32535 delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
32536 archinfo, vbi, pfx, sz, delta );
32537 break;
32538 case ESC_0F38:
32539 delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
32540 archinfo, vbi, pfx, sz, delta );
32541 break;
32542 case ESC_0F3A:
32543 delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
32544 archinfo, vbi, pfx, sz, delta );
32545 break;
32546 case ESC_NONE:
32547 /* The presence of a VEX prefix, by Intel definition,
32548 always implies at least an 0F escape. */
32549 goto decode_failure;
32550 default:
32551 vassert(0);
32553 /* If the insn doesn't use VEX.vvvv then it must be all ones.
32554 Check this. */
32555 if (!uses_vvvv) {
32556 if (getVexNvvvv(pfx) != 0)
32557 goto decode_failure;
32561 vassert(delta - delta_at_primary_opcode >= 0);
32562 vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
32564 /* Use delta == delta_at_primary_opcode to denote decode failure.
32565 This implies that any successful decode must use at least one
32566 byte up. */
32567 if (delta == delta_at_primary_opcode)
32568 goto decode_failure;
32569 else
32570 goto decode_success; /* \o/ */
32573 decode_failure:
32574 /* All decode failures end up here. */
32575 if (sigill_diag) {
32576 vex_printf("vex amd64->IR: unhandled instruction bytes: "
32577 "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
32578 getUChar(delta_start+0),
32579 getUChar(delta_start+1),
32580 getUChar(delta_start+2),
32581 getUChar(delta_start+3),
32582 getUChar(delta_start+4),
32583 getUChar(delta_start+5),
32584 getUChar(delta_start+6),
32585 getUChar(delta_start+7),
32586 getUChar(delta_start+8),
32587 getUChar(delta_start+9) );
32588 vex_printf("vex amd64->IR: REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
32589 haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
32590 getRexX(pfx), getRexB(pfx));
32591 vex_printf("vex amd64->IR: VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
32592 haveVEX(pfx) ? 1 : 0, getVexL(pfx),
32593 getVexNvvvv(pfx),
32594 esc==ESC_NONE ? "NONE" :
32595 esc==ESC_0F ? "0F" :
32596 esc==ESC_0F38 ? "0F38" :
32597 esc==ESC_0F3A ? "0F3A" : "???");
32598 vex_printf("vex amd64->IR: PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
32599 have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
32600 haveF3(pfx) ? 1 : 0);
32603 /* Tell the dispatcher that this insn cannot be decoded, and so has
32604 not been executed, and (is currently) the next to be executed.
32605 RIP should be up-to-date since it was made so at the start of each
32606 insn, but nevertheless be paranoid and update it again right
32607 now. */
32608 stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
32609 jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
32610 vassert(dres.whatNext == Dis_StopHere);
32611 dres.len = 0;
32612 /* We also need to say that a CAS is not expected now, regardless
32613 of what it might have been set to at the start of the function,
32614 since the IR that we've emitted just above (to synthesise a
32615 SIGILL) does not involve any CAS, and presumably no other IR has
32616 been emitted for this (non-decoded) insn. */
32617 *expect_CAS = False;
32618 return dres;
32621 decode_success:
32622 /* All decode successes end up here. */
32623 switch (dres.whatNext) {
32624 case Dis_Continue:
32625 stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
32626 break;
32627 case Dis_StopHere:
32628 break;
32629 default:
32630 vassert(0);
32633 DIP("\n");
32634 dres.len = toUInt(delta - delta_start);
32635 return dres;
32638 #undef DIP
32639 #undef DIS
32642 /*------------------------------------------------------------*/
32643 /*--- Top-level fn ---*/
32644 /*------------------------------------------------------------*/
32646 /* Disassemble a single instruction into IR. The instruction
32647 is located in host memory at &guest_code[delta]. */
32649 DisResult disInstr_AMD64 ( IRSB* irsb_IN,
32650 const UChar* guest_code_IN,
32651 Long delta,
32652 Addr guest_IP,
32653 VexArch guest_arch,
32654 const VexArchInfo* archinfo,
32655 const VexAbiInfo* abiinfo,
32656 VexEndness host_endness_IN,
32657 Bool sigill_diag_IN )
32658 {
32659 Int i, x1, x2;
32660 Bool expect_CAS, has_CAS;
32661 DisResult dres;
32663 /* Set globals (see top of this file) */
32664 vassert(guest_arch == VexArchAMD64);
32665 guest_code = guest_code_IN;
32666 irsb = irsb_IN;
32667 host_endness = host_endness_IN;
32668 guest_RIP_curr_instr = guest_IP;
32669 guest_RIP_bbstart = guest_IP - delta;
32671 /* We'll consult these after doing disInstr_AMD64_WRK. */
32672 guest_RIP_next_assumed = 0;
32673 guest_RIP_next_mustcheck = False;
32675 x1 = irsb_IN->stmts_used;
32676 expect_CAS = False;
32677 dres = disInstr_AMD64_WRK ( &expect_CAS,
32678 delta, archinfo, abiinfo, sigill_diag_IN );
32679 x2 = irsb_IN->stmts_used;
32680 vassert(x2 >= x1);
32682 /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
32683 got it right. Failure of this assertion is serious and denotes
32684 a bug in disInstr. */
32685 if (guest_RIP_next_mustcheck
32686 && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
32687 vex_printf("\n");
32688 vex_printf("assumed next %%rip = 0x%llx\n",
32689 guest_RIP_next_assumed );
32690 vex_printf(" actual next %%rip = 0x%llx\n",
32691 guest_RIP_curr_instr + dres.len );
32692 vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
32695 /* See comment at the top of disInstr_AMD64_WRK for meaning of
32696 expect_CAS. Here, we (sanity-)check for the presence/absence of
32697 IRCAS as directed by the returned expect_CAS value. */
32698 has_CAS = False;
32699 for (i = x1; i < x2; i++) {
32700 if (irsb_IN->stmts[i]->tag == Ist_CAS)
32701 has_CAS = True;
32704 if (expect_CAS != has_CAS) {
32705 /* inconsistency detected. re-disassemble the instruction so as
32706 to generate a useful error message; then assert. */
32707 vex_traceflags |= VEX_TRACE_FE;
32708 dres = disInstr_AMD64_WRK ( &expect_CAS,
32709 delta, archinfo, abiinfo, sigill_diag_IN );
32710 for (i = x1; i < x2; i++) {
32711 vex_printf("\t\t");
32712 ppIRStmt(irsb_IN->stmts[i]);
32713 vex_printf("\n");
32715 /* Failure of this assertion is serious and denotes a bug in
32716 disInstr. */
32717 vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
32720 return dres;
32724 /*------------------------------------------------------------*/
32725 /*--- Unused stuff ---*/
32726 /*------------------------------------------------------------*/
32728 // A potentially more Memcheck-friendly version of gen_LZCNT, if
32729 // this should ever be needed.
32731 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
32733 // /* Scheme is simple: propagate the most significant 1-bit into all
32734 // lower positions in the word. This gives a word of the form
32735 // 0---01---1. Now invert it, giving a word of the form
32736 // 1---10---0, then do a population-count idiom (to count the 1s,
32737 // which is the number of leading zeroes, or the word size if the
32738 // original word was 0).
32739 // */
32740 // Int i;
32741 // IRTemp t[7];
32742 // for (i = 0; i < 7; i++) {
32743 // t[i] = newTemp(ty);
32744 // }
32745 // if (ty == Ity_I64) {
32746 // assign(t[0], binop(Iop_Or64, mkexpr(src),
32747 // binop(Iop_Shr64, mkexpr(src), mkU8(1))));
32748 // assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
32749 // binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
32750 // assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
32751 // binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
32752 // assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
32753 // binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
32754 // assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
32755 // binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
32756 // assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
32757 // binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
32758 // assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
32759 // return gen_POPCOUNT(ty, t[6]);
32760 // }
32761 // if (ty == Ity_I32) {
32762 // assign(t[0], binop(Iop_Or32, mkexpr(src),
32763 // binop(Iop_Shr32, mkexpr(src), mkU8(1))));
32764 // assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
32765 // binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
32766 // assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
32767 // binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
32768 // assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
32769 // binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
32770 // assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
32771 // binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
32772 // assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
32773 // return gen_POPCOUNT(ty, t[5]);
32774 // }
32775 // if (ty == Ity_I16) {
32776 // assign(t[0], binop(Iop_Or16, mkexpr(src),
32777 // binop(Iop_Shr16, mkexpr(src), mkU8(1))));
32778 // assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
32779 // binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
32780 // assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
32781 // binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
32782 // assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
32783 // binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
32784 // assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
32785 // return gen_POPCOUNT(ty, t[4]);
32786 // }
32787 // vassert(0);
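// For reference, the same propagate-then-popcount idiom in plain C (a
// sketch; clz32 and popcount32 are illustrative names only):
//
//   static uint32_t clz32 ( uint32_t x )
//   {
//      x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
//      x |= x >> 8;  x |= x >> 16;       /* smear the top 1 downwards */
//      return popcount32(~x);            /* 32 if x was 0 */
//   }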
32791 /*--------------------------------------------------------------------*/
32792 /*--- end guest_amd64_toIR.c ---*/
32793 /*--------------------------------------------------------------------*/