2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
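   For illustration (hypothetical client code, not part of this file or of
   Valgrind itself), a fragment that triggers case 1 (branch on an undefined
   value) and case 3 (undefined address in a load), both of which are
   reported via complainIfUndefined once the enclosing block is
   instrumented:

      int x;            // never written: its shadow V bits are all 1s
      if (x > 0)        // case 1: branch condition depends on undefined bits
         y = p[x];      // case 3: load address depends on undefined bits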
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
137 struct _MCEnv;
139 // See below for comments explaining what this is for.
140 typedef
141 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
142 HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
159 typedef
160 enum {
161 // Use the cheaper, less-exact variant.
162 DLcheap=4,
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
166 DLauto,
167 // Use the more expensive, more-exact variant.
168 DLexpensive
170 DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
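   For example (an illustrative reading of the scheme, with hypothetical tmp
   numbers): if t7 is defined by an Add64 and the analysis records
   tmpHowUsed[t7] == HuPCa, then under DLauto the cheaper Add64 scheme is
   adequate for that instance, since only the all-defined vs not-all-defined
   status of t7 can ever matter.  Any other kind of use pushes t7 to HuOth,
   the safe end, for which the expensive scheme may be selected instead.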
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to waste space in tables
342 tracking them.
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
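   To illustrate with hypothetical tmp numbers: suppose orig tmp t5 is
   currently shadowed by t17.  After the definedness of t17 has been tested
   (see complainIfUndefined below), the shadow must be forced to 'defined',
   but SSA forbids assigning to t17 again.  So a fresh tmp, say t23, is
   allocated via newShadowTmpV, tmpMap[t5].shadowV is updated to t23, and
   't23 = <all zeroes>' is emitted; subsequent findShadowTmpV(t5) calls then
   return t23.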
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256).
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
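/* Illustrative sketch only (a hypothetical helper, not used anywhere in
   this file): how the expression macros and assignNew are typically
   combined.  Every sub-expression is bound to its own shadow tmp so the
   instrumented IR stays flat:

      static IRAtom* example_worst_then_pcast ( MCEnv* mce,
                                                IRAtom* vatom1, IRAtom* vatom2 )
      {
         // worst-of (UifU) of the two I32 shadows ...
         IRAtom* vres = assignNew('V', mce, Ity_I32,
                                  binop(Iop_Or32, vatom1, vatom2));
         // ... then smear to all-ones if any bit is undefined
         return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vres));
      }
*/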
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128 loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
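/* Worked example of the two families above (V bits: 0 = defined,
   1 = undefined).  For 8-bit shadows a1# = 0x0F (low nibble undefined)
   and a2# = 0x30 (bits 4..5 undefined):

      UifU8(a1#, a2#) = a1# | a2# = 0x3F    -- undefined if either is
      DifD8(a1#, a2#) = a1# & a2# = 0x00    -- defined if either is
*/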
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive than their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
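/* Worked example of the Right-family smear (V bits: 1 = undefined).  For
   mkRight32 with a1 = 0x00040000, i.e. only bit 18 undefined:

      after i=1  : 0x00060000
      after i=2  : 0x00078000
      after i=4  : 0x0007F800
      after i=8  : 0x0007FFF8
      after i=16 : 0x0007FFFF

   that is, every bit at or below the highest undefined bit ends up
   undefined -- the mirror image of what the Left ops above compute. */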
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
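/* Worked example for the 'improvement' terms (V bits: 1 = undefined).
   Take 8-bit data = 0x0A with vbits = 0x0C, so bits 2..3 of data are
   undefined:

      ImproveAND8:  data | vbits = 0x0E    -- 0 exactly where data holds a
                                              defined 0, which forces the
                                              corresponding AND result bits
                                              to be defined
      ImproveOR8:  ~data | vbits = 0xFD    -- 0 exactly where data holds a
                                              defined 1, which forces the
                                              corresponding OR result bits
                                              to be defined

   When And/Or ops are instrumented (later in this file), these terms are
   combined via DifD with the usual UifU term for the two operands. */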
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
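/* Worked example: with src_ty == dst_ty == Ity_I32 the fast-track case
   reduces mkPCastTo to a single Iop_CmpwNEZ32, so

      PCast32(0x00000000) = 0x00000000    -- every input bit defined
      PCast32(0x00000100) = 0xFFFFFFFF    -- one undefined bit poisons all

   In the widening cases the single collapsed bit is simply replicated;
   for example PCast-to-V128 of a nonzero I32 shadow yields 128 one-bits. */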
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144 VG_(tool_panic)("mkOCastTo");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
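/* Worked example of the straight-line OCast trick for Ity_I8, where the
   computation is (vbits - (vbits >>u 1)) >>s 7:

      vbits = 0xFF : 0xFF - 0x7F = 0x80 ; 0x80 >>s 7 = 0xFF  (no bit defined)
      vbits = 0xFE : 0xFE - 0x7F = 0x7F ; 0x7F >>s 7 = 0x00  (bit 0 defined)
      vbits = 0x00 : 0x00 - 0x00 = 0x00 ; 0x00 >>s 7 = 0x00  (all defined)

   Only the all-ones input keeps the sign bit set after the subtraction,
   so only that input produces the all-ones ("all undefined") result. */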
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_I1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec) )
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx,                        // 0 iff bit defined
1182               vyy,                        // 0 iff bit defined
1183               Not<sz>(Xor<sz>( xx, yy )) ) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast), the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
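/* Worked example for Ity_I8 (V bits: 1 = undefined).  Let xx = 0x0F with
   vxx = 0xF0 (high nibble unknown) and yy = 0x00 with vyy = 0x00:

      naive          = vxx | vyy          = 0xF0
      vec            = naive | ~(xx ^ yy) = 0xF0
      OCast(vec)     = 0x00               (some bit of vec is 0)
      improved       = naive & OCast(vec) = 0x00
      PCastTo<1>(..) = defined

   Bit 0 is defined in both operands and differs (1 vs 0), so the EQ/NE
   outcome is knowable despite xx's undefined top nibble; the naive
   UifU-only scheme would have reported the comparison as undefined. */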
1290 /* Check if we can know, despite the uncertain bits, that xx is greater than yy.
1291 Notice that it's xx > yy and not the other way around. This is Intel syntax
1292 with destination first. It will appear reversed in gdb disassembly (AT&T
1293 syntax).
1295 static IRAtom* expensiveCmpGT ( MCEnv* mce,
1296 IROp opGT,
1297 IRAtom* vxx, IRAtom* vyy,
1298 IRAtom* xx, IRAtom* yy )
1300 IROp opAND, opOR, opXOR, opNOT, opSHL;
1301 IRType ty;
1302 unsigned int word_size;
1303 Bool is_signed;
1305 tl_assert(isShadowAtom(mce,vxx));
1306 tl_assert(isShadowAtom(mce,vyy));
1307 tl_assert(isOriginalAtom(mce,xx));
1308 tl_assert(isOriginalAtom(mce,yy));
1309 tl_assert(sameKindedAtoms(vxx,xx));
1310 tl_assert(sameKindedAtoms(vyy,yy));
1312 switch (opGT) {
1313 case Iop_CmpGT64Sx2:
1314 case Iop_CmpGT64Ux2:
1315 opSHL = Iop_ShlN64x2;
1316 word_size = 64;
1317 break;
1318 case Iop_CmpGT32Sx4:
1319 case Iop_CmpGT32Ux4:
1320 opSHL = Iop_ShlN32x4;
1321 word_size = 32;
1322 break;
1323 case Iop_CmpGT16Sx8:
1324 case Iop_CmpGT16Ux8:
1325 opSHL = Iop_ShlN16x8;
1326 word_size = 16;
1327 break;
1328 case Iop_CmpGT8Sx16:
1329 case Iop_CmpGT8Ux16:
1330 opSHL = Iop_ShlN8x16;
1331 word_size = 8;
1332 break;
1333 default:
1334 VG_(tool_panic)("expensiveCmpGT");
1337 switch (opGT) {
1338 case Iop_CmpGT64Sx2:
1339 case Iop_CmpGT32Sx4:
1340 case Iop_CmpGT16Sx8:
1341 case Iop_CmpGT8Sx16:
1342 is_signed = True;
1343 break;
1344 case Iop_CmpGT64Ux2:
1345 case Iop_CmpGT32Ux4:
1346 case Iop_CmpGT16Ux8:
1347 case Iop_CmpGT8Ux16:
1348 is_signed = False;
1349 break;
1350 default:
1351 VG_(tool_panic)("expensiveCmpGT");
1354 ty = Ity_V128;
1355 opAND = Iop_AndV128;
1356 opOR = Iop_OrV128;
1357 opXOR = Iop_XorV128;
1358 opNOT = Iop_NotV128;
1360 IRAtom *MSBs;
1361 if (is_signed) {
1362 // For unsigned it's easy to make the min and max: Just set the unknown
1363 // bits all to 0s or 1s. For signed it's harder because having a 1 in the
1364 // MSB makes a number smaller, not larger! We can work around this by
1365 // flipping the MSB before and after computing the min and max values.
1366 IRAtom *all_ones = mkV128(0xffff);
1367 MSBs = assignNew('V', mce, ty, binop(opSHL, all_ones, mkU8(word_size-1)));
1368 xx = assignNew('V', mce, ty, binop(opXOR, xx, MSBs));
1369 yy = assignNew('V', mce, ty, binop(opXOR, yy, MSBs));
1370 // From here on out, we're dealing with MSB-flipped integers.
1372 // We can combine xx and vxx to create two values: the largest that xx could
1373 // possibly be and the smallest that xx could possibly be. Likewise, we can
1374 // do the same for yy. We'll call those max_xx and min_xx and max_yy and
1375 // min_yy.
1376 IRAtom *not_vxx = assignNew('V', mce, ty, unop(opNOT, vxx));
1377 IRAtom *not_vyy = assignNew('V', mce, ty, unop(opNOT, vyy));
1378 IRAtom *max_xx = assignNew('V', mce, ty, binop(opOR, xx, vxx));
1379 IRAtom *min_xx = assignNew('V', mce, ty, binop(opAND, xx, not_vxx));
1380 IRAtom *max_yy = assignNew('V', mce, ty, binop(opOR, yy, vyy));
1381 IRAtom *min_yy = assignNew('V', mce, ty, binop(opAND, yy, not_vyy));
1382 if (is_signed) {
1383 // Unflip the MSBs.
1384 max_xx = assignNew('V', mce, ty, binop(opXOR, max_xx, MSBs));
1385 min_xx = assignNew('V', mce, ty, binop(opXOR, min_xx, MSBs));
1386 max_yy = assignNew('V', mce, ty, binop(opXOR, max_yy, MSBs));
1387 min_yy = assignNew('V', mce, ty, binop(opXOR, min_yy, MSBs));
1389 IRAtom *min_xx_gt_max_yy = assignNew('V', mce, ty, binop(opGT, min_xx, max_yy));
1390 IRAtom *max_xx_gt_min_yy = assignNew('V', mce, ty, binop(opGT, max_xx, min_yy));
1391 // If min_xx is greater than max_yy then xx is surely greater than yy so we know
1392 // our answer for sure. If max_xx is not greater than min_yy then xx can't
1393 possibly be greater than yy so again we know the answer for sure. For all
1394 // other cases, we can't know.
1396 // So the result is defined if:
1398 // min_xx_gt_max_yy | ~max_xx_gt_min_yy
1400 // Because defined in vbits is 0s and not 1s, we need to invert that:
1402 // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy)
1404 // We can use DeMorgan's Law to simplify the above:
1406 // ~min_xx_gt_max_yy & max_xx_gt_min_yy
1407 IRAtom *not_min_xx_gt_max_yy = assignNew('V', mce, ty, unop(opNOT, min_xx_gt_max_yy));
1408 return assignNew('V', mce, ty, binop(opAND, not_min_xx_gt_max_yy, max_xx_gt_min_yy));
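/* Worked example for one unsigned 8-bit lane (conceptually -- the code
   above operates on whole V128 values, lane by lane).  Let xx = 0x08 with
   vxx = 0x04 (bit 2 unknown, so xx is really 8 or 12), and yy = 0x03 with
   vyy = 0x00:

      min_xx = xx & ~vxx = 0x08       max_xx = xx | vxx = 0x0C
      min_yy = yy & ~vyy = 0x03       max_yy = yy | vyy = 0x03

   Since min_xx (8) > max_yy (3), xx > yy holds for every value the
   unknown bit could take, so the lane's result is marked defined.  Had
   the [min,max] ranges overlapped, neither test would be conclusive and
   the lane would be marked undefined. */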
1411 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1413 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1415 CmpORD32S(x,y) = 1<<3 if x <s y
1416 = 1<<2 if x >s y
1417 = 1<<1 if x == y
1419 and similarly the unsigned variant. The default interpretation is:
1421 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1422 & (7<<1)
1424 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1425 are zero and therefore defined (viz, zero).
1427 Also deal with a special case better:
1429 CmpORD32S(x,0)
1431 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1432 will be defined even if the rest of x isn't. In which case we do:
1434 CmpORD32S#(x,x#,0,{impliedly 0}#)
1435 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1436 | (x# >>u 31) << 3 -- LT# = x#[31]
1438 Analogous handling for CmpORD64{S,U}.
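   Worked example of the CmpORD32S(x,0) case: suppose x's low byte is
   undefined but all other bits, including the sign bit, are defined, so
   x# = 0x000000FF.  Then

      LT# = (x# >>u 31) << 3   = 0       -- bit 3 defined: x[31] is known
      GT# = PCast(x#) & (1<<2) = 1<<2    -- bit 2 undefined

   and the EQ bit (bit 1) is delegated to expensiveCmpEQorNE in doCmpORD
   below, so it comes out defined whenever some defined bit of x is
   nonzero.  This is the scheme sketched above, with the EQ# part made
   more precise.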
1440 static Bool isZeroU32 ( IRAtom* e )
1442 return
1443 toBool( e->tag == Iex_Const
1444 && e->Iex.Const.con->tag == Ico_U32
1445 && e->Iex.Const.con->Ico.U32 == 0 );
1448 static Bool isZeroU64 ( IRAtom* e )
1450 return
1451 toBool( e->tag == Iex_Const
1452 && e->Iex.Const.con->tag == Ico_U64
1453 && e->Iex.Const.con->Ico.U64 == 0 );
1456 static IRAtom* doCmpORD ( MCEnv* mce,
1457 IROp cmp_op,
1458 IRAtom* xxhash, IRAtom* yyhash,
1459 IRAtom* xx, IRAtom* yy )
1461 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1462 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1463 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1464 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1465 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1466 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1467 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1468 IRType ty = m64 ? Ity_I64 : Ity_I32;
1469 Int width = m64 ? 64 : 32;
1471 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1473 tl_assert(isShadowAtom(mce,xxhash));
1474 tl_assert(isShadowAtom(mce,yyhash));
1475 tl_assert(isOriginalAtom(mce,xx));
1476 tl_assert(isOriginalAtom(mce,yy));
1477 tl_assert(sameKindedAtoms(xxhash,xx));
1478 tl_assert(sameKindedAtoms(yyhash,yy));
1479 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1480 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1482 if (0) {
1483 ppIROp(cmp_op); VG_(printf)(" ");
1484 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1487 if (syned && isZero(yy)) {
1488 /* fancy interpretation */
1489 /* if yy is zero, then it must be fully defined (zero#). */
1490 tl_assert(isZero(yyhash));
1491 // This is still inaccurate, but I don't think it matters, since
1492 // nobody writes code of the form
1493 // "is <partially-undefined-value> signedly greater than zero?".
1494 // We therefore simply declare "x >s 0" to be undefined if any bit in
1495 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1496 // the highest order bit is a defined 1 then x is negative so it
1497 // doesn't matter whether the remaining bits are defined or not.
1498 IRAtom* t_0_gt_0_0
1499 = assignNew(
1500 'V', mce,ty,
1501 binop(
1502 opAND,
1503 mkPCastTo(mce,ty, xxhash),
1504 m64 ? mkU64(1<<2) : mkU32(1<<2)
1506 // For "x <s 0", we can just copy the definedness of the top bit of x
1507 // and we have a precise result.
1508 IRAtom* t_lt_0_0_0
1509 = assignNew(
1510 'V', mce,ty,
1511 binop(
1512 opSHL,
1513 assignNew(
1514 'V', mce,ty,
1515 binop(opSHR, xxhash, mkU8(width-1))),
1516 mkU8(3)
1518 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1519 IRAtom* t_0_0_eq_0
1520 = assignNew(
1521 'V', mce,ty,
1522 binop(
1523 opSHL,
1524 assignNew('V', mce,ty,
1525 unop(
1526 op1UtoWS,
1527 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1529 mkU8(1)
1531 return
1532 binop(
1533 opOR,
1534 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1535 t_0_0_eq_0
1537 } else {
1538 /* standard interpretation */
1539 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1540 return
1541 binop(
1542 opAND,
1543 mkPCastTo( mce,ty,
1544 mkUifU(mce,ty, xxhash,yyhash)),
1545 sevenLeft1
1551 /*------------------------------------------------------------*/
1552 /*--- Emit a test and complaint if something is undefined. ---*/
1553 /*------------------------------------------------------------*/
1555 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1558 /* Set the annotations on a dirty helper to indicate that the stack
1559 pointer and instruction pointers might be read. This is the
1560 behaviour of all 'emit-a-complaint' style functions we might
1561 call. */
1563 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1564 di->nFxState = 2;
1565 di->fxState[0].fx = Ifx_Read;
1566 di->fxState[0].offset = mce->layout->offset_SP;
1567 di->fxState[0].size = mce->layout->sizeof_SP;
1568 di->fxState[0].nRepeats = 0;
1569 di->fxState[0].repeatLen = 0;
1570 di->fxState[1].fx = Ifx_Read;
1571 di->fxState[1].offset = mce->layout->offset_IP;
1572 di->fxState[1].size = mce->layout->sizeof_IP;
1573 di->fxState[1].nRepeats = 0;
1574 di->fxState[1].repeatLen = 0;
1578 /* Check the supplied *original* |atom| for undefinedness, and emit a
1579 complaint if so. Once that happens, mark it as defined. This is
1580 possible because the atom is either a tmp or literal. If it's a
1581 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1582 be defined. In fact as mentioned above, we will have to allocate a
1583 new tmp to carry the new 'defined' shadow value, and update the
1584 original->tmp mapping accordingly; we cannot simply assign a new
1585 value to an existing shadow tmp as this breaks SSAness.
1587 The checks are performed, any resulting complaint emitted, and
1588 |atom|'s shadow temp set to 'defined', ONLY in the case that
1589 |guard| evaluates to True at run-time. If it evaluates to False
1590 then no action is performed. If |guard| is NULL (the usual case)
1591 then it is assumed to be always-true, and hence these actions are
1592 performed unconditionally.
1594 This routine does not generate code to check the definedness of
1595 |guard|. The caller is assumed to have taken care of that already.
1597 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1599 IRAtom* vatom;
1600 IRType ty;
1601 Int sz;
1602 IRDirty* di;
1603 IRAtom* cond;
1604 IRAtom* origin;
1605 void* fn;
1606 const HChar* nm;
1607 IRExpr** args;
1608 Int nargs;
1610 // Don't do V bit tests if we're not reporting undefined value errors.
1611 if (MC_(clo_mc_level) == 1)
1612 return;
1614 if (guard)
1615 tl_assert(isOriginalAtom(mce, guard));
1617 /* Since the original expression is atomic, there's no duplicated
1618 work generated by making multiple V-expressions for it. So we
1619 don't really care about the possibility that someone else may
1620 also create a V-interpretation for it. */
1621 tl_assert(isOriginalAtom(mce, atom));
1622 vatom = expr2vbits( mce, atom, HuOth );
1623 tl_assert(isShadowAtom(mce, vatom));
1624 tl_assert(sameKindedAtoms(atom, vatom));
1626 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1628 /* sz is only used for constructing the error message */
1629 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1631 cond = mkPCastTo( mce, Ity_I1, vatom );
1632 /* cond will be 0 if all defined, and 1 if any not defined. */
1634 /* Get the origin info for the value we are about to check. At
1635 least, if we are doing origin tracking. If not, use a dummy
1636 zero origin. */
1637 if (MC_(clo_mc_level) == 3) {
1638 origin = schemeE( mce, atom );
1639 if (mce->hWordTy == Ity_I64) {
1640 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1642 } else {
1643 origin = NULL;
1646 fn = NULL;
1647 nm = NULL;
1648 args = NULL;
1649 nargs = -1;
1651 switch (sz) {
1652 case 0:
1653 if (origin) {
1654 fn = &MC_(helperc_value_check0_fail_w_o);
1655 nm = "MC_(helperc_value_check0_fail_w_o)";
1656 args = mkIRExprVec_1(origin);
1657 nargs = 1;
1658 } else {
1659 fn = &MC_(helperc_value_check0_fail_no_o);
1660 nm = "MC_(helperc_value_check0_fail_no_o)";
1661 args = mkIRExprVec_0();
1662 nargs = 0;
1664 break;
1665 case 1:
1666 if (origin) {
1667 fn = &MC_(helperc_value_check1_fail_w_o);
1668 nm = "MC_(helperc_value_check1_fail_w_o)";
1669 args = mkIRExprVec_1(origin);
1670 nargs = 1;
1671 } else {
1672 fn = &MC_(helperc_value_check1_fail_no_o);
1673 nm = "MC_(helperc_value_check1_fail_no_o)";
1674 args = mkIRExprVec_0();
1675 nargs = 0;
1677 break;
1678 case 4:
1679 if (origin) {
1680 fn = &MC_(helperc_value_check4_fail_w_o);
1681 nm = "MC_(helperc_value_check4_fail_w_o)";
1682 args = mkIRExprVec_1(origin);
1683 nargs = 1;
1684 } else {
1685 fn = &MC_(helperc_value_check4_fail_no_o);
1686 nm = "MC_(helperc_value_check4_fail_no_o)";
1687 args = mkIRExprVec_0();
1688 nargs = 0;
1690 break;
1691 case 8:
1692 if (origin) {
1693 fn = &MC_(helperc_value_check8_fail_w_o);
1694 nm = "MC_(helperc_value_check8_fail_w_o)";
1695 args = mkIRExprVec_1(origin);
1696 nargs = 1;
1697 } else {
1698 fn = &MC_(helperc_value_check8_fail_no_o);
1699 nm = "MC_(helperc_value_check8_fail_no_o)";
1700 args = mkIRExprVec_0();
1701 nargs = 0;
1703 break;
1704 case 2:
1705 case 16:
1706 if (origin) {
1707 fn = &MC_(helperc_value_checkN_fail_w_o);
1708 nm = "MC_(helperc_value_checkN_fail_w_o)";
1709 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1710 nargs = 2;
1711 } else {
1712 fn = &MC_(helperc_value_checkN_fail_no_o);
1713 nm = "MC_(helperc_value_checkN_fail_no_o)";
1714 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1715 nargs = 1;
1717 break;
1718 default:
1719 VG_(tool_panic)("unexpected szB");
1722 tl_assert(fn);
1723 tl_assert(nm);
1724 tl_assert(args);
1725 tl_assert(nargs >= 0 && nargs <= 2);
1726 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1727 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1729 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1730 VG_(fnptr_to_fnentry)( fn ), args );
1731 di->guard = cond; // and cond is PCast-to-1(atom#)
1733 /* If the complaint is to be issued under a guard condition, AND
1734 that into the guard condition for the helper call. */
1735 if (guard) {
1736 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1737 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1738 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1739 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1742 setHelperAnns( mce, di );
1743 stmt( 'V', mce, IRStmt_Dirty(di));
1745 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1746 defined -- but only in the case where the guard evaluates to
1747 True at run-time. Do the update by setting the orig->shadow
1748 mapping for tmp to reflect the fact that this shadow is getting
1749 a new value. */
1750 tl_assert(isIRAtom(vatom));
1751 /* sameKindedAtoms ... */
1752 if (vatom->tag == Iex_RdTmp) {
1753 tl_assert(atom->tag == Iex_RdTmp);
1754 if (guard == NULL) {
1755 // guard is 'always True', hence update unconditionally
1756 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1757 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1758 definedOfType(ty));
1759 } else {
1760 // update the temp only conditionally. Do this by copying
1761 // its old value when the guard is False.
1762 // The old value ..
1763 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1764 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1765 IRAtom* new_tmpV
1766 = assignNew('V', mce, shadowTypeV(ty),
1767 IRExpr_ITE(guard, definedOfType(ty),
1768 mkexpr(old_tmpV)));
1769 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1775 /*------------------------------------------------------------*/
1776 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1777 /*------------------------------------------------------------*/
1779 /* Examine the always-defined sections declared in layout to see if
1780 the (offset,size) section is within one. Note, it is an error to
1781 partially fall into such a region: (offset,size) should either be
1782 completely in such a region or completely not-in such a region.
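/* For example (offsets purely illustrative): if alwaysDefd contains
   the section (offset=168, size=8), covering bytes 168..175, then
   queries (168,8) and (172,4) yield True, (160,8) yields False
   (assuming no other section overlaps it), and (164,8), which
   straddles the boundary, causes a panic. */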
1784 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1786 Int minoffD, maxoffD, i;
1787 Int minoff = offset;
1788 Int maxoff = minoff + size - 1;
1789 tl_assert((minoff & ~0xFFFF) == 0);
1790 tl_assert((maxoff & ~0xFFFF) == 0);
1792 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1793 minoffD = mce->layout->alwaysDefd[i].offset;
1794 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1795 tl_assert((minoffD & ~0xFFFF) == 0);
1796 tl_assert((maxoffD & ~0xFFFF) == 0);
1798 if (maxoff < minoffD || maxoffD < minoff)
1799 continue; /* no overlap */
1800 if (minoff >= minoffD && maxoff <= maxoffD)
1801 return True; /* completely contained in an always-defd section */
1803 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1805 return False; /* could not find any containing section */
1809 /* Generate into bb suitable actions to shadow this Put. If the state
1810 slice is marked 'always defined', do nothing. Otherwise, write the
1811 supplied V bits to the shadow state. We can pass in either an
1812 original atom or a V-atom, but not both. In the former case the
1813 relevant V-bits are then generated from the original.
1814 We assume here that the definedness of GUARD has already been checked.
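/* Sketch of the effect, assuming the slice is not always-defined:
   for a Put at guest offset O of value V, the instrumented block
   additionally gets

      PUT(O + layout->total_sizeB) = V#

   that is, the V bits are written at the same offset within the
   shadow area of the guest state. With a guard, the value written
   is ITE(guard, V#, <current shadow at that offset>). */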
1816 static
1817 void do_shadow_PUT ( MCEnv* mce, Int offset,
1818 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1820 IRType ty;
1822 // Don't do shadow PUTs if we're not doing undefined value checking.
1823 // Their absence lets Vex's optimiser remove all the shadow computation
1824 // that they depend on, which includes GETs of the shadow registers.
1825 if (MC_(clo_mc_level) == 1)
1826 return;
1828 if (atom) {
1829 tl_assert(!vatom);
1830 tl_assert(isOriginalAtom(mce, atom));
1831 vatom = expr2vbits( mce, atom, HuOth );
1832 } else {
1833 tl_assert(vatom);
1834 tl_assert(isShadowAtom(mce, vatom));
1837 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1838 tl_assert(ty != Ity_I1);
1839 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1840 /* later: no ... */
1841 /* emit code to emit a complaint if any of the vbits are 1. */
1842 /* complainIfUndefined(mce, atom); */
1843 } else {
1844 /* Do a plain shadow Put. */
1845 if (guard) {
1846 /* If the guard expression evaluates to false we simply Put the value
1847 that is already stored in the guest state slot */
1848 IRAtom *cond, *iffalse;
1850 cond = assignNew('V', mce, Ity_I1, guard);
1851 iffalse = assignNew('V', mce, ty,
1852 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1853 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1855 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1860 /* Generate into bb suitable actions to shadow this PutI (passed in
1861 in pieces as an IRPutI).
1863 static
1864 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1866 IRAtom* vatom;
1867 IRType ty, tyS;
1868 Int arrSize;
1869 IRRegArray* descr = puti->descr;
1870 IRAtom* ix = puti->ix;
1871 Int bias = puti->bias;
1872 IRAtom* atom = puti->data;
1874 // Don't do shadow PUTIs if we're not doing undefined value checking.
1875 // Their absence lets Vex's optimiser remove all the shadow computation
1876 // that they depend on, which includes GETIs of the shadow registers.
1877 if (MC_(clo_mc_level) == 1)
1878 return;
1880 tl_assert(isOriginalAtom(mce,atom));
1881 vatom = expr2vbits( mce, atom, HuOth );
1882 tl_assert(sameKindedAtoms(atom, vatom));
1883 ty = descr->elemTy;
1884 tyS = shadowTypeV(ty);
1885 arrSize = descr->nElems * sizeofIRType(ty);
1886 tl_assert(ty != Ity_I1);
1887 tl_assert(isOriginalAtom(mce,ix));
1888 complainIfUndefined(mce, ix, NULL);
1889 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1890 /* later: no ... */
1891 /* emit code to emit a complaint if any of the vbits are 1. */
1892 /* complainIfUndefined(mce, atom); */
1893 } else {
1894 /* Do a cloned version of the Put that refers to the shadow
1895 area. */
1896 IRRegArray* new_descr
1897 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1898 tyS, descr->nElems);
1899 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1904 /* Return an expression which contains the V bits corresponding to the
1905 given GET (passed in in pieces).
1907 static
1908 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1910 IRType tyS = shadowTypeV(ty);
1911 tl_assert(ty != Ity_I1);
1912 tl_assert(ty != Ity_I128);
1913 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1914 /* Always defined, return all zeroes of the relevant type */
1915 return definedOfType(tyS);
1916 } else {
1917 /* return a cloned version of the Get that refers to the shadow
1918 area. */
1919 /* FIXME: this isn't an atom! */
1920 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1925 /* Return an expression which contains the V bits corresponding to the
1926 given GETI (passed in in pieces).
1928 static
1929 IRExpr* shadow_GETI ( MCEnv* mce,
1930 IRRegArray* descr, IRAtom* ix, Int bias )
1932 IRType ty = descr->elemTy;
1933 IRType tyS = shadowTypeV(ty);
1934 Int arrSize = descr->nElems * sizeofIRType(ty);
1935 tl_assert(ty != Ity_I1);
1936 tl_assert(isOriginalAtom(mce,ix));
1937 complainIfUndefined(mce, ix, NULL);
1938 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1939 /* Always defined, return all zeroes of the relevant type */
1940 return definedOfType(tyS);
1941 } else {
1942 /* return a cloned version of the Get that refers to the shadow
1943 area. */
1944 IRRegArray* new_descr
1945 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1946 tyS, descr->nElems);
1947 return IRExpr_GetI( new_descr, ix, bias );
1952 /*------------------------------------------------------------*/
1953 /*--- Generating approximations for unknown operations, ---*/
1954 /*--- using lazy-propagate semantics ---*/
1955 /*------------------------------------------------------------*/
1957 /* Lazy propagation of undefinedness from two values, resulting in the
1958 specified shadow type.
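/* In outline (a sketch, not the exact IR emitted): the general
   recipe is

      result# = PCastTo(finalVty, UifU32(PCast32(va1), PCast32(va2)))

   i.e. squash each argument's definedness to "any bit undefined?",
   union the two, and smear the union across the final type. The
   special cases below avoid some of the PCasts when the types
   already line up. */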
1960 static
1961 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1963 IRAtom* at;
1964 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1965 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1966 tl_assert(isShadowAtom(mce,va1));
1967 tl_assert(isShadowAtom(mce,va2));
1969 /* The general case is inefficient because PCast is an expensive
1970 operation. Here are some special cases which use PCast only
1971 once rather than twice. */
1973 /* I64 x I64 -> I64 */
1974 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1975 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1976 at = mkUifU(mce, Ity_I64, va1, va2);
1977 at = mkPCastTo(mce, Ity_I64, at);
1978 return at;
1981 /* I64 x I64 -> I32 */
1982 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1983 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1984 at = mkUifU(mce, Ity_I64, va1, va2);
1985 at = mkPCastTo(mce, Ity_I32, at);
1986 return at;
1989 /* I32 x I32 -> I32 */
1990 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1991 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1992 at = mkUifU(mce, Ity_I32, va1, va2);
1993 at = mkPCastTo(mce, Ity_I32, at);
1994 return at;
1997 if (0) {
1998 VG_(printf)("mkLazy2 ");
1999 ppIRType(t1);
2000 VG_(printf)("_");
2001 ppIRType(t2);
2002 VG_(printf)("_");
2003 ppIRType(finalVty);
2004 VG_(printf)("\n");
2007 /* General case: force everything via 32-bit intermediaries. */
2008 at = mkPCastTo(mce, Ity_I32, va1);
2009 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2010 at = mkPCastTo(mce, finalVty, at);
2011 return at;
2015 /* 3-arg version of the above. */
2016 static
2017 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
2018 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
2020 IRAtom* at;
2021 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2022 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2023 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2024 tl_assert(isShadowAtom(mce,va1));
2025 tl_assert(isShadowAtom(mce,va2));
2026 tl_assert(isShadowAtom(mce,va3));
2028 /* The general case is inefficient because PCast is an expensive
2029 operation. Here are some special cases which use PCast only
2030 twice rather than three times. */
2032 /* I32 x I64 x I64 -> I64 */
2033 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2034 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2035 && finalVty == Ity_I64) {
2036 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
2037 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2038 mode indication which is fully defined, this should get
2039 folded out later. */
2040 at = mkPCastTo(mce, Ity_I64, va1);
2041 /* Now fold in 2nd and 3rd args. */
2042 at = mkUifU(mce, Ity_I64, at, va2);
2043 at = mkUifU(mce, Ity_I64, at, va3);
2044 /* and PCast once again. */
2045 at = mkPCastTo(mce, Ity_I64, at);
2046 return at;
2049 /* I32 x I8 x I64 -> I64 */
2050 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
2051 && finalVty == Ity_I64) {
2052 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
2053 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
2054 * rounding mode indication which is fully defined, this should
2055 * get folded out later.
2057 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2058 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2059 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2060 at = mkUifU(mce, Ity_I64, at, va3);
2061 /* and PCast once again. */
2062 at = mkPCastTo(mce, Ity_I64, at);
2063 return at;
2066 /* I32 x I64 x I64 -> I32 */
2067 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2068 && finalVty == Ity_I32) {
2069 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
2070 at = mkPCastTo(mce, Ity_I64, va1);
2071 at = mkUifU(mce, Ity_I64, at, va2);
2072 at = mkUifU(mce, Ity_I64, at, va3);
2073 at = mkPCastTo(mce, Ity_I32, at);
2074 return at;
2077 /* I32 x I32 x I32 -> I32 */
2078 /* 32-bit FP idiom, as (eg) happens on ARM */
2079 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
2080 && finalVty == Ity_I32) {
2081 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
2082 at = va1;
2083 at = mkUifU(mce, Ity_I32, at, va2);
2084 at = mkUifU(mce, Ity_I32, at, va3);
2085 at = mkPCastTo(mce, Ity_I32, at);
2086 return at;
2089 /* I32 x I16 x I16 -> I16 */
2090 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
2091 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
2092 && finalVty == Ity_I16) {
2093 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
2094 at = mkPCastTo(mce, Ity_I16, va1);
2095 at = mkUifU(mce, Ity_I16, at, va2);
2096 at = mkUifU(mce, Ity_I16, at, va3);
2097 at = mkPCastTo(mce, Ity_I16, at);
2098 return at;
2101 /* I32 x I128 x I128 -> I128 */
2102 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2103 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
2104 && finalVty == Ity_I128) {
2105 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
2106 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2107 mode indication which is fully defined, this should get
2108 folded out later. */
2109 at = mkPCastTo(mce, Ity_I128, va1);
2110 /* Now fold in 2nd and 3rd args. */
2111 at = mkUifU(mce, Ity_I128, at, va2);
2112 at = mkUifU(mce, Ity_I128, at, va3);
2113 /* and PCast once again. */
2114 at = mkPCastTo(mce, Ity_I128, at);
2115 return at;
2118 /* I32 x I8 x I128 -> I128 */
2119 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2120 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2121 && finalVty == Ity_I128) {
2122 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2123 /* Use I64 as an intermediate type, which means PCasting all 3
2124 args to I64 to start with. 1st arg is typically a rounding
2125 mode indication which is fully defined, so we hope that it
2126 will get folded out later. */
2127 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2128 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2129 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2130 /* Now UifU all three together. */
2131 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2132 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2133 /* and PCast once again. */
2134 at = mkPCastTo(mce, Ity_I128, at);
2135 return at;
2137 if (1) {
2138 VG_(printf)("mkLazy3: ");
2139 ppIRType(t1);
2140 VG_(printf)(" x ");
2141 ppIRType(t2);
2142 VG_(printf)(" x ");
2143 ppIRType(t3);
2144 VG_(printf)(" -> ");
2145 ppIRType(finalVty);
2146 VG_(printf)("\n");
2149 tl_assert(0);
2150 /* General case: force everything via 32-bit intermediaries. */
2152 at = mkPCastTo(mce, Ity_I32, va1);
2153 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2154 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2155 at = mkPCastTo(mce, finalVty, at);
2156 return at;
2161 /* 4-arg version of the above. */
2162 static
2163 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2164 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2166 IRAtom* at;
2167 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2168 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2169 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2170 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2171 tl_assert(isShadowAtom(mce,va1));
2172 tl_assert(isShadowAtom(mce,va2));
2173 tl_assert(isShadowAtom(mce,va3));
2174 tl_assert(isShadowAtom(mce,va4));
2176 /* The general case is inefficient because PCast is an expensive
2177 operation. Here are some special cases which use fewer PCasts
2178 than the fully general scheme would. */
2180 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2182 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2183 && finalVty == Ity_I128) {
2184 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2185 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2186 mode indication which is fully defined, this should get
2187 folded out later. */
2188 at = mkPCastTo(mce, Ity_I128, va1);
2189 /* Now fold in 2nd, 3rd, 4th args. */
2190 at = mkUifU(mce, Ity_I128, at, va2);
2191 at = mkUifU(mce, Ity_I128, at, va3);
2192 at = mkUifU(mce, Ity_I128, at, va4);
2193 /* and PCast once again. */
2194 at = mkPCastTo(mce, Ity_I128, at);
2195 return at;
2198 /* I32 x I64 x I64 x I64 -> I64 */
2199 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2200 && finalVty == Ity_I64) {
2201 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2202 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2203 mode indication which is fully defined, this should get
2204 folded out later. */
2205 at = mkPCastTo(mce, Ity_I64, va1);
2206 /* Now fold in 2nd, 3rd, 4th args. */
2207 at = mkUifU(mce, Ity_I64, at, va2);
2208 at = mkUifU(mce, Ity_I64, at, va3);
2209 at = mkUifU(mce, Ity_I64, at, va4);
2210 /* and PCast once again. */
2211 at = mkPCastTo(mce, Ity_I64, at);
2212 return at;
2214 /* I32 x I32 x I32 x I32 -> I32 */
2215 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2216 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2217 && finalVty == Ity_I32) {
2218 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2219 at = va1;
2220 /* Now fold in 2nd, 3rd, 4th args. */
2221 at = mkUifU(mce, Ity_I32, at, va2);
2222 at = mkUifU(mce, Ity_I32, at, va3);
2223 at = mkUifU(mce, Ity_I32, at, va4);
2224 at = mkPCastTo(mce, Ity_I32, at);
2225 return at;
2228 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2229 && finalVty == Ity_I32) {
2230 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2231 at = mkPCastTo(mce, Ity_I8, va1);
2232 /* Now fold in 2nd, 3rd, 4th args. */
2233 at = mkUifU(mce, Ity_I8, at, va2);
2234 at = mkUifU(mce, Ity_I8, at, va3);
2235 at = mkUifU(mce, Ity_I8, at, va4);
2236 at = mkPCastTo(mce, Ity_I32, at);
2237 return at;
2240 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2241 && finalVty == Ity_I64) {
2242 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2243 at = mkPCastTo(mce, Ity_I8, va1);
2244 /* Now fold in 2nd, 3rd, 4th args. */
2245 at = mkUifU(mce, Ity_I8, at, va2);
2246 at = mkUifU(mce, Ity_I8, at, va3);
2247 at = mkUifU(mce, Ity_I8, at, va4);
2248 at = mkPCastTo(mce, Ity_I64, at);
2249 return at;
2252 if (1) {
2253 VG_(printf)("mkLazy4: ");
2254 ppIRType(t1);
2255 VG_(printf)(" x ");
2256 ppIRType(t2);
2257 VG_(printf)(" x ");
2258 ppIRType(t3);
2259 VG_(printf)(" x ");
2260 ppIRType(t4);
2261 VG_(printf)(" -> ");
2262 ppIRType(finalVty);
2263 VG_(printf)("\n");
2266 tl_assert(0);
2270 /* Do the lazy propagation game from a null-terminated vector of
2271 atoms. These are presumably the arguments to a helper call, so the
2272 IRCallee info is also supplied in order that we can know which
2273 arguments should be ignored (via the .mcx_mask field).
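/* For instance (purely illustrative): with a callee whose mcx_mask
   is 0x1, argument 0 is ignored entirely, and the result for the
   remaining args amounts to

      PCastTo(finalVtype, UifU(PCast(arg1#), PCast(arg2#), ...))

   where the merge type is I64 only if every considered arg is I64,
   and I32 otherwise. */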
2275 static
2276 IRAtom* mkLazyN ( MCEnv* mce,
2277 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2279 Int i;
2280 IRAtom* here;
2281 IRAtom* curr;
2282 IRType mergeTy;
2283 Bool mergeTy64 = True;
2285 /* Decide on the type of the merge intermediary. If all relevant
2286 args are I64, then it's I64. In all other circumstances, use
2287 I32. */
2288 for (i = 0; exprvec[i]; i++) {
2289 tl_assert(i < 32);
2290 tl_assert(isOriginalAtom(mce, exprvec[i]));
2291 if (cee->mcx_mask & (1<<i))
2292 continue;
2293 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2294 mergeTy64 = False;
2297 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2298 curr = definedOfType(mergeTy);
2300 for (i = 0; exprvec[i]; i++) {
2301 tl_assert(i < 32);
2302 tl_assert(isOriginalAtom(mce, exprvec[i]));
2303 /* Only take notice of this arg if the callee's mc-exclusion
2304 mask does not say it is to be excluded. */
2305 if (cee->mcx_mask & (1<<i)) {
2306 /* the arg is to be excluded from definedness checking. Do
2307 nothing. */
2308 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2309 } else {
2310 /* calculate the arg's definedness, and pessimistically merge
2311 it in. */
2312 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2313 curr = mergeTy64
2314 ? mkUifU64(mce, here, curr)
2315 : mkUifU32(mce, here, curr);
2318 return mkPCastTo(mce, finalVtype, curr );
2322 /*------------------------------------------------------------*/
2323 /*--- Generating expensive sequences for exact carry-chain ---*/
2324 /*--- propagation in add/sub and related operations. ---*/
2325 /*------------------------------------------------------------*/
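/* Informally, the idea behind expensiveAddSub below: a_min/a_max are
   the operand with its undefined bits forced to 0 and to 1
   respectively, i.e. its smallest and largest possible unsigned
   values. The two extreme results are XORed, and a result bit is
   flagged undefined if the extremes disagree there, or if that bit
   is undefined in either input (the qaa|qbb term). This is a sketch
   of the rationale, not a formal argument. */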
2327 static
2328 IRAtom* expensiveAddSub ( MCEnv* mce,
2329 Bool add,
2330 IRType ty,
2331 IRAtom* qaa, IRAtom* qbb,
2332 IRAtom* aa, IRAtom* bb )
2334 IRAtom *a_min, *b_min, *a_max, *b_max;
2335 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2337 tl_assert(isShadowAtom(mce,qaa));
2338 tl_assert(isShadowAtom(mce,qbb));
2339 tl_assert(isOriginalAtom(mce,aa));
2340 tl_assert(isOriginalAtom(mce,bb));
2341 tl_assert(sameKindedAtoms(qaa,aa));
2342 tl_assert(sameKindedAtoms(qbb,bb));
2344 switch (ty) {
2345 case Ity_I32:
2346 opAND = Iop_And32;
2347 opOR = Iop_Or32;
2348 opXOR = Iop_Xor32;
2349 opNOT = Iop_Not32;
2350 opADD = Iop_Add32;
2351 opSUB = Iop_Sub32;
2352 break;
2353 case Ity_I64:
2354 opAND = Iop_And64;
2355 opOR = Iop_Or64;
2356 opXOR = Iop_Xor64;
2357 opNOT = Iop_Not64;
2358 opADD = Iop_Add64;
2359 opSUB = Iop_Sub64;
2360 break;
2361 default:
2362 VG_(tool_panic)("expensiveAddSub");
2365 // a_min = aa & ~qaa
2366 a_min = assignNew('V', mce,ty,
2367 binop(opAND, aa,
2368 assignNew('V', mce,ty, unop(opNOT, qaa))));
2370 // b_min = bb & ~qbb
2371 b_min = assignNew('V', mce,ty,
2372 binop(opAND, bb,
2373 assignNew('V', mce,ty, unop(opNOT, qbb))));
2375 // a_max = aa | qaa
2376 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2378 // b_max = bb | qbb
2379 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2381 if (add) {
2382 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2383 return
2384 assignNew('V', mce,ty,
2385 binop( opOR,
2386 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2387 assignNew('V', mce,ty,
2388 binop( opXOR,
2389 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2390 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2395 } else {
2396 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2397 return
2398 assignNew('V', mce,ty,
2399 binop( opOR,
2400 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2401 assignNew('V', mce,ty,
2402 binop( opXOR,
2403 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2404 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
2414 static
2415 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2416 IRAtom* atom, IRAtom* vatom )
2418 IRType ty;
2419 IROp xorOp, subOp, andOp;
2420 IRExpr *one;
2421 IRAtom *improver, *improved;
2422 tl_assert(isShadowAtom(mce,vatom));
2423 tl_assert(isOriginalAtom(mce,atom));
2424 tl_assert(sameKindedAtoms(atom,vatom));
2426 switch (czop) {
2427 case Iop_Ctz32: case Iop_CtzNat32:
2428 ty = Ity_I32;
2429 xorOp = Iop_Xor32;
2430 subOp = Iop_Sub32;
2431 andOp = Iop_And32;
2432 one = mkU32(1);
2433 break;
2434 case Iop_Ctz64: case Iop_CtzNat64:
2435 ty = Ity_I64;
2436 xorOp = Iop_Xor64;
2437 subOp = Iop_Sub64;
2438 andOp = Iop_And64;
2439 one = mkU64(1);
2440 break;
2441 default:
2442 ppIROp(czop);
2443 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2446 // improver = atom ^ (atom - 1)
2448 // That is, improver has its low ctz(atom)+1 bits equal to one;
2449 // higher bits (if any) equal to zero. So it's exactly the right
2450 // mask to use to remove the irrelevant undefined input bits.
2451 /* Here are some examples:
2452 atom = U...U 1 0...0
2453 atom-1 = U...U 0 1...1
2454 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2455 actually influence the result
2456 A boundary case
2457 atom = 0...0
2458 atom-1 = 1...1
2459 ^ed = 11111, also a correct mask for the input: all input bits
2460 are relevant
2461 Another boundary case
2462 atom = 1..1 1
2463 atom-1 = 1..1 0
2464 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2465 is relevant
2466 Now with misc U bits interspersed:
2467 atom = U...U 1 0 U...U 0 1 0...0
2468 atom-1 = U...U 1 0 U...U 0 0 1...1
2469 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2470 (Per re-check/analysis of 14 Nov 2018)
2472 improver = assignNew('V', mce,ty,
2473 binop(xorOp,
2474 atom,
2475 assignNew('V', mce, ty,
2476 binop(subOp, atom, one))));
2478 // improved = vatom & improver
2480 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2481 // bits as "defined".
2482 improved = assignNew('V', mce, ty,
2483 binop(andOp, vatom, improver));
2485 // Return pessimizing cast of improved.
2486 return mkPCastTo(mce, ty, improved);
2489 static
2490 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2491 IRAtom* atom, IRAtom* vatom )
2493 IRType ty;
2494 IROp shrOp, notOp, andOp;
2495 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2496 IRAtom *improver, *improved;
2497 tl_assert(isShadowAtom(mce,vatom));
2498 tl_assert(isOriginalAtom(mce,atom));
2499 tl_assert(sameKindedAtoms(atom,vatom));
2501 switch (czop) {
2502 case Iop_Clz32: case Iop_ClzNat32:
2503 ty = Ity_I32;
2504 shrOp = Iop_Shr32;
2505 notOp = Iop_Not32;
2506 andOp = Iop_And32;
2507 mkRight = mkRight32;
2508 break;
2509 case Iop_Clz64: case Iop_ClzNat64:
2510 ty = Ity_I64;
2511 shrOp = Iop_Shr64;
2512 notOp = Iop_Not64;
2513 andOp = Iop_And64;
2514 mkRight = mkRight64;
2515 break;
2516 default:
2517 ppIROp(czop);
2518 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2521 // This is in principle very similar to how expensiveCountTrailingZeroes
2522 // works. That function computed an "improver", which it used to mask
2523 // off all but the rightmost 1-bit and the zeroes to the right of it,
2524 // hence removing irrelevant bits from the input. Here, we play the
2525 // exact same game but with the left-vs-right roles interchanged.
2526 // Unfortunately calculation of the improver in this case is
2527 // significantly more expensive.
2529 // improver = ~(RIGHT(atom) >>u 1)
2531 // That is, improver has its upper clz(atom)+1 bits equal to one;
2532 // lower bits (if any) equal to zero. So it's exactly the right
2533 // mask to use to remove the irrelevant undefined input bits.
2534 /* Here are some examples:
2535 atom = 0...0 1 U...U
2536 R(atom) = 0...0 1 1...1
2537 R(atom) >>u 1 = 0...0 0 1...1
2538 ~(R(atom) >>u 1) = 1...1 1 0...0
2539 which correctly describes which bits of |atom|
2540 actually influence the result
2541 A boundary case
2542 atom = 0...0
2543 R(atom) = 0...0
2544 R(atom) >>u 1 = 0...0
2545 ~(R(atom) >>u 1) = 1...1
2546 also a correct mask for the input: all input bits
2547 are relevant
2548 Another boundary case
2549 atom = 1 1..1
2550 R(atom) = 1 1..1
2551 R(atom) >>u 1 = 0 1..1
2552 ~(R(atom) >>u 1) = 1 0..0
2553 also a correct mask: only the leftmost input bit
2554 is relevant
2555 Now with misc U bits interspersed:
2556 atom = 0...0 1 U...U 0 1 U...U
2557 R(atom) = 0...0 1 1...1 1 1 1...1
2558 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2559 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2560 (Per initial implementation of 15 Nov 2018)
2562 improver = mkRight(mce, atom);
2563 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2564 improver = assignNew('V', mce, ty, unop(notOp, improver));
2566 // improved = vatom & improver
2568 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2569 // bits as "defined".
2570 improved = assignNew('V', mce, ty,
2571 binop(andOp, vatom, improver));
2573 // Return pessimizing cast of improved.
2574 return mkPCastTo(mce, ty, improved);
2578 /*------------------------------------------------------------*/
2579 /*--- Scalar shifts. ---*/
2580 /*------------------------------------------------------------*/
2582 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2583 idea is to shift the definedness bits by the original shift amount.
2584 This introduces 0s ("defined") in new positions for left shifts and
2585 unsigned right shifts, and copies the top definedness bit for
2586 signed right shifts. So, conveniently, applying the original shift
2587 operator to the definedness bits for the left arg is exactly the
2588 right thing to do:
2590 (qaa << bb)
2592 However if the shift amount is undefined then the whole result
2593 is undefined. Hence need:
2595 (qaa << bb) `UifU` PCast(qbb)
2597 If the shift amount bb is a literal then qbb will say 'all defined'
2598 and the UifU and PCast will get folded out by post-instrumentation
2599 optimisation.
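/* Concretely (a sketch): for Iop_Shl32 the shadow result is

      UifU32( Shl32(qaa, bb), PCastTo-Ity_I32(qbb) )

   -- shift the V bits by the actual shift amount, then smear in
   "all undefined" if the shift amount itself is undefined. */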
2601 static IRAtom* scalarShift ( MCEnv* mce,
2602 IRType ty,
2603 IROp original_op,
2604 IRAtom* qaa, IRAtom* qbb,
2605 IRAtom* aa, IRAtom* bb )
2607 tl_assert(isShadowAtom(mce,qaa));
2608 tl_assert(isShadowAtom(mce,qbb));
2609 tl_assert(isOriginalAtom(mce,aa));
2610 tl_assert(isOriginalAtom(mce,bb));
2611 tl_assert(sameKindedAtoms(qaa,aa));
2612 tl_assert(sameKindedAtoms(qbb,bb));
2613 return
2614 assignNew(
2615 'V', mce, ty,
2616 mkUifU( mce, ty,
2617 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2618 mkPCastTo(mce, ty, qbb)
2624 /*------------------------------------------------------------*/
2625 /*--- Helpers for dealing with vector primops. ---*/
2626 /*------------------------------------------------------------*/
2628 /* Vector pessimisation -- pessimise within each lane individually. */
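/* For example, mkPCast8x16 maps each 8-bit lane of a V128 shadow
   value to 0x00 if the lane is fully defined and to 0xFF if any bit
   in it is undefined (via Iop_CmpNEZ8x16); the other helpers do the
   same at their respective lane widths and vector sizes. */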
2630 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2632 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2635 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2637 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2640 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2642 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2645 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2647 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2650 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2652 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2655 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2657 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2660 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2662 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2665 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2667 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2670 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2672 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2675 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2677 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2680 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2682 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2685 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2687 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2690 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2692 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2695 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2697 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2701 /* Here's a simple scheme capable of handling ops derived from SSE1
2702 code and while only generating ops that can be efficiently
2703 implemented in SSE1. */
2705 /* All-lanes versions are straightforward:
2707 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2709 unary32Fx4(x) ==> PCast32x4(x#)
2711 Lowest-lane-only versions are more complex:
2713 binary32F0x4(x,y) ==> SetV128lo32(
2714 x#,
2715 PCast32(V128to32(UifUV128(x#,y#)))
2718 This is perhaps not so obvious. In particular, it's faster to
2719 do a V128-bit UifU and then take the bottom 32 bits than the more
2720 obvious scheme of taking the bottom 32 bits of each operand
2721 and doing a 32-bit UifU. Basically since UifU is fast and
2722 chopping lanes off vector values is slow.
2724 Finally:
2726 unary32F0x4(x) ==> SetV128lo32(
2727 x#,
2728 PCast32(V128to32(x#))
2731 Where:
2733 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2734 PCast32x4(v#) = CmpNEZ32x4(v#)
2737 static
2738 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2740 IRAtom* at;
2741 tl_assert(isShadowAtom(mce, vatomX));
2742 tl_assert(isShadowAtom(mce, vatomY));
2743 at = mkUifUV128(mce, vatomX, vatomY);
2744 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2745 return at;
2748 static
2749 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2751 IRAtom* at;
2752 tl_assert(isShadowAtom(mce, vatomX));
2753 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2754 return at;
2757 static
2758 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV128(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2765 at = mkPCastTo(mce, Ity_I32, at);
2766 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2767 return at;
2770 static
2771 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2773 IRAtom* at;
2774 tl_assert(isShadowAtom(mce, vatomX));
2775 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2776 at = mkPCastTo(mce, Ity_I32, at);
2777 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2778 return at;
2781 /* --- ... and ... 64Fx2 versions of the same ... --- */
2783 static
2784 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2786 IRAtom* at;
2787 tl_assert(isShadowAtom(mce, vatomX));
2788 tl_assert(isShadowAtom(mce, vatomY));
2789 at = mkUifUV128(mce, vatomX, vatomY);
2790 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2791 return at;
2794 static
2795 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2797 IRAtom* at;
2798 tl_assert(isShadowAtom(mce, vatomX));
2799 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2800 return at;
2803 static
2804 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2806 IRAtom* at;
2807 tl_assert(isShadowAtom(mce, vatomX));
2808 tl_assert(isShadowAtom(mce, vatomY));
2809 at = mkUifUV128(mce, vatomX, vatomY);
2810 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2811 at = mkPCastTo(mce, Ity_I64, at);
2812 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2813 return at;
2816 static
2817 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2819 IRAtom* at;
2820 tl_assert(isShadowAtom(mce, vatomX));
2821 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2822 at = mkPCastTo(mce, Ity_I64, at);
2823 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2824 return at;
2827 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2829 static
2830 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2832 IRAtom* at;
2833 tl_assert(isShadowAtom(mce, vatomX));
2834 tl_assert(isShadowAtom(mce, vatomY));
2835 at = mkUifUV128(mce, vatomX, vatomY);
2836 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2837 return at;
2840 static
2841 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2843 IRAtom* at;
2844 tl_assert(isShadowAtom(mce, vatomX));
2845 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2846 return at;
2849 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2850 implemented.
2853 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2855 static
2856 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2858 IRAtom* at;
2859 tl_assert(isShadowAtom(mce, vatomX));
2860 tl_assert(isShadowAtom(mce, vatomY));
2861 at = mkUifU64(mce, vatomX, vatomY);
2862 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2863 return at;
2866 static
2867 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2869 IRAtom* at;
2870 tl_assert(isShadowAtom(mce, vatomX));
2871 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2872 return at;
2875 /* --- ... and ... 64Fx4 versions of the same ... --- */
2877 static
2878 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2880 IRAtom* at;
2881 tl_assert(isShadowAtom(mce, vatomX));
2882 tl_assert(isShadowAtom(mce, vatomY));
2883 at = mkUifUV256(mce, vatomX, vatomY);
2884 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2885 return at;
2888 static
2889 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2891 IRAtom* at;
2892 tl_assert(isShadowAtom(mce, vatomX));
2893 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2894 return at;
2897 /* --- ... and ... 32Fx8 versions of the same ... --- */
2899 static
2900 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2902 IRAtom* at;
2903 tl_assert(isShadowAtom(mce, vatomX));
2904 tl_assert(isShadowAtom(mce, vatomY));
2905 at = mkUifUV256(mce, vatomX, vatomY);
2906 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2907 return at;
2910 static
2911 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2913 IRAtom* at;
2914 tl_assert(isShadowAtom(mce, vatomX));
2915 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2916 return at;
2919 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2921 static
2922 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2923 IRAtom* vatomX, IRAtom* vatomY )
2925 /* This is the same as binary64Fx2, except that we subsequently
2926 pessimise vRM (definedness of the rounding mode), widen to 128
2927 bits and UifU it into the result. As with the scalar cases, if
2928 the RM is a constant then it is defined and so this extra bit
2929 will get constant-folded out later. */
2930 // "do" the vector args
2931 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2932 // PCast the RM, and widen it to 128 bits
2933 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2934 // Roll it into the result
2935 t1 = mkUifUV128(mce, t1, t2);
2936 return t1;
2939 /* --- ... and ... 32Fx4 versions of the same --- */
2941 static
2942 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2943 IRAtom* vatomX, IRAtom* vatomY )
2945 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2946 // PCast the RM, and widen it to 128 bits
2947 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2948 // Roll it into the result
2949 t1 = mkUifUV128(mce, t1, t2);
2950 return t1;
2953 /* --- ... and ... 64Fx4 versions of the same --- */
2955 static
2956 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2957 IRAtom* vatomX, IRAtom* vatomY )
2959 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2960 // PCast the RM, and widen it to 256 bits
2961 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2962 // Roll it into the result
2963 t1 = mkUifUV256(mce, t1, t2);
2964 return t1;
2967 /* --- ... and ... 16Fx8 versions of the same --- */
2969 static
2970 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2971 IRAtom* vatomX, IRAtom* vatomY )
2973 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2974 // PCast the RM, and widen it to 128 bits
2975 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2976 // Roll it into the result
2977 t1 = mkUifUV128(mce, t1, t2);
2978 return t1;
2981 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2982 implemented.
2985 /* --- ... and ... 32Fx8 versions of the same --- */
2987 static
2988 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2989 IRAtom* vatomX, IRAtom* vatomY )
2991 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2992 // PCast the RM, and widen it to 256 bits
2993 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2994 // Roll it into the result
2995 t1 = mkUifUV256(mce, t1, t2);
2996 return t1;
2999 /* --- 64Fx2 unary FP ops, with rounding mode --- */
3001 static
3002 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3004 /* Same scheme as binary64Fx2_w_rm. */
3005 // "do" the vector arg
3006 IRAtom* t1 = unary64Fx2(mce, vatomX);
3007 // PCast the RM, and widen it to 128 bits
3008 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3009 // Roll it into the result
3010 t1 = mkUifUV128(mce, t1, t2);
3011 return t1;
3014 /* --- ... and ... 32Fx4 versions of the same --- */
3016 static
3017 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3019 /* Same scheme as unary64Fx2_w_rm. */
3020 IRAtom* t1 = unary32Fx4(mce, vatomX);
3021 // PCast the RM, and widen it to 128 bits
3022 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3023 // Roll it into the result
3024 t1 = mkUifUV128(mce, t1, t2);
3025 return t1;
3028 /* --- ... and ... 16Fx8 versions of the same --- */
3030 static
3031 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3033 /* Same scheme as unary64Fx2_w_rm. */
3034 IRAtom* t1 = unary16Fx8(mce, vatomX);
3035 // PCast the RM, and widen it to 128 bits
3036 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3037 // Roll it into the result
3038 t1 = mkUifUV128(mce, t1, t2);
3039 return t1;
3042 /* --- ... and ... 32Fx8 versions of the same --- */
3044 static
3045 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3047 /* Same scheme as unary32Fx4_w_rm, but widened to 256 bits. */
3048 IRAtom* t1 = unary32Fx8(mce, vatomX);
3049 // PCast the RM, and widen it to 256 bits
3050 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
3051 // Roll it into the result
3052 t1 = mkUifUV256(mce, t1, t2);
3053 return t1;
3057 /* --- --- Vector saturated narrowing --- --- */
3059 /* We used to do something very clever here, but on closer inspection
3060 (2011-Jun-15), and in particular bug #279698, it turns out to be
3061 wrong. Part of the problem came from the fact that for a long
3062 time, the IR primops to do with saturated narrowing were
3063 underspecified and managed to confuse multiple cases which needed
3064 to be separate: the op names had a signedness qualifier, but in
3065 fact the source and destination signednesses needed to be specified
3066 independently, so the op names really need two independent
3067 signedness specifiers.
3069 As of 2011-Jun-15 (ish) the underspecification was sorted out
3070 properly. The incorrect instrumentation remained, though. That
3071 has now (2011-Oct-22) been fixed.
3073 What we now do is simple:
3075 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
3076 number of lanes, X is the source lane width and signedness, and Y
3077 is the destination lane width and signedness. In all cases the
3078 destination lane width is half the source lane width, so the names
3079 have a bit of redundancy, but are at least easy to read.
3081 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
3082 to unsigned 16s.
3084 Let Vanilla(OP) be a function that takes OP, one of these
3085 saturating narrowing ops, and produces the same "shaped" narrowing
3086 op which is not saturating, but merely dumps the most significant
3087 bits. "same shape" means that the lane numbers and widths are the
3088 same as with OP.
3090 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
3091 = Iop_NarrowBin32to16x8,
3092 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
3093 dumping the top half of each lane.
3095 So, with that in place, the scheme is simple, and it is simple to
3096 pessimise each lane individually and then apply Vanilla(OP) so as
3097 to get the result in the right "shape". If the original OP is
3098 QNarrowBinXtoYxZ then we produce
3100 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
3102 or for the case when OP is unary (Iop_QNarrowUn*)
3104 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
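/* A concrete instance (illustrative): for Iop_QNarrowBin32Sto16Ux8
   the code below computes

      NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

   i.e. each 32-bit source lane is pessimised to all-0s or all-1s and
   the vanilla narrowing then delivers correctly shaped 16-bit shadow
   lanes. */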
3106 static
3107 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
3109 switch (qnarrowOp) {
3110 /* Binary: (128, 128) -> 128 */
3111 case Iop_QNarrowBin16Sto8Ux16:
3112 case Iop_QNarrowBin16Sto8Sx16:
3113 case Iop_QNarrowBin16Uto8Ux16:
3114 case Iop_QNarrowBin64Sto32Sx4:
3115 case Iop_QNarrowBin64Uto32Ux4:
3116 return Iop_NarrowBin16to8x16;
3117 case Iop_QNarrowBin32Sto16Ux8:
3118 case Iop_QNarrowBin32Sto16Sx8:
3119 case Iop_QNarrowBin32Uto16Ux8:
3120 return Iop_NarrowBin32to16x8;
3121 /* Binary: (64, 64) -> 64 */
3122 case Iop_QNarrowBin32Sto16Sx4:
3123 return Iop_NarrowBin32to16x4;
3124 case Iop_QNarrowBin16Sto8Ux8:
3125 case Iop_QNarrowBin16Sto8Sx8:
3126 return Iop_NarrowBin16to8x8;
3127 /* Unary: 128 -> 64 */
3128 case Iop_QNarrowUn64Uto32Ux2:
3129 case Iop_QNarrowUn64Sto32Sx2:
3130 case Iop_QNarrowUn64Sto32Ux2:
3131 return Iop_NarrowUn64to32x2;
3132 case Iop_QNarrowUn32Uto16Ux4:
3133 case Iop_QNarrowUn32Sto16Sx4:
3134 case Iop_QNarrowUn32Sto16Ux4:
3135 case Iop_F32toF16x4_DEP:
3136 return Iop_NarrowUn32to16x4;
3137 case Iop_QNarrowUn16Uto8Ux8:
3138 case Iop_QNarrowUn16Sto8Sx8:
3139 case Iop_QNarrowUn16Sto8Ux8:
3140 return Iop_NarrowUn16to8x8;
3141 default:
3142 ppIROp(qnarrowOp);
3143 VG_(tool_panic)("vanillaNarrowOpOfShape");
3147 static
3148 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3149 IRAtom* vatom1, IRAtom* vatom2)
3151 IRAtom *at1, *at2, *at3;
3152 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3153 switch (narrow_op) {
3154 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
3155 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
3156 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3157 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3158 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3159 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3160 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3161 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3162 default: VG_(tool_panic)("vectorNarrowBinV128");
3164 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3165 tl_assert(isShadowAtom(mce,vatom1));
3166 tl_assert(isShadowAtom(mce,vatom2));
3167 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3168 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3169 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3170 return at3;
3173 static
3174 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3175 IRAtom* vatom1, IRAtom* vatom2)
3177 IRAtom *at1, *at2, *at3;
3178 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3179 switch (narrow_op) {
3180 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3181 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3182 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3183 default: VG_(tool_panic)("vectorNarrowBin64");
3185 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3186 tl_assert(isShadowAtom(mce,vatom1));
3187 tl_assert(isShadowAtom(mce,vatom2));
3188 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3189 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3190 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3191 return at3;
3194 static
3195 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3196 IRAtom* vatom1)
3198 IRAtom *at1, *at2;
3199 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3200 tl_assert(isShadowAtom(mce,vatom1));
3201 /* For vanilla narrowing (non-saturating), we can just apply
3202 the op directly to the V bits. */
3203 switch (narrow_op) {
3204 case Iop_NarrowUn16to8x8:
3205 case Iop_NarrowUn32to16x4:
3206 case Iop_NarrowUn64to32x2:
3207 case Iop_F32toF16x4_DEP:
3208 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3209 return at1;
3210 default:
3211 break; /* Do Plan B */
3213 /* Plan B: for ops that involve a saturation operation on the args,
3214 we must PCast before the vanilla narrow. */
3215 switch (narrow_op) {
3216 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3217 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3218 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3219 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3220 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3221 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3222 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3223 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3224 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3225 default: VG_(tool_panic)("vectorNarrowUnV128");
3227 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3228 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3229 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3230 return at2;
3233 static
3234 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3235 IRAtom* vatom1)
3237 IRAtom *at1, *at2;
3238 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3239 switch (longen_op) {
3240 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3241 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3242 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3243 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3244 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3245 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3246 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3247 default: VG_(tool_panic)("vectorWidenI64");
3249 tl_assert(isShadowAtom(mce,vatom1));
3250 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3251 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3252 return at2;
3256 /* --- --- Vector integer arithmetic --- --- */
3258 /* Simple ... UifU the args and per-lane pessimise the results. */
3260 /* --- V256-bit versions --- */
3262 static
3263 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3265 IRAtom* at;
3266 at = mkUifUV256(mce, vatom1, vatom2);
3267 at = mkPCast8x32(mce, at);
3268 return at;
3271 static
3272 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3274 IRAtom* at;
3275 at = mkUifUV256(mce, vatom1, vatom2);
3276 at = mkPCast16x16(mce, at);
3277 return at;
3280 static
3281 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3283 IRAtom* at;
3284 at = mkUifUV256(mce, vatom1, vatom2);
3285 at = mkPCast32x8(mce, at);
3286 return at;
3289 static
3290 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3292 IRAtom* at;
3293 at = mkUifUV256(mce, vatom1, vatom2);
3294 at = mkPCast64x4(mce, at);
3295 return at;
3298 /* --- V128-bit versions --- */
3300 static
3301 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3303 IRAtom* at;
3304 at = mkUifUV128(mce, vatom1, vatom2);
3305 at = mkPCast8x16(mce, at);
3306 return at;
3309 static
3310 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3312 IRAtom* at;
3313 at = mkUifUV128(mce, vatom1, vatom2);
3314 at = mkPCast16x8(mce, at);
3315 return at;
3318 static
3319 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3321 IRAtom* at;
3322 at = mkUifUV128(mce, vatom1, vatom2);
3323 at = mkPCast32x4(mce, at);
3324 return at;
3327 static
3328 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3330 IRAtom* at;
3331 at = mkUifUV128(mce, vatom1, vatom2);
3332 at = mkPCast64x2(mce, at);
3333 return at;
3336 static
3337 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3339 IRAtom* at;
3340 at = mkUifUV128(mce, vatom1, vatom2);
3341 at = mkPCast128x1(mce, at);
3342 return at;
3345 /* --- 64-bit versions --- */
3347 static
3348 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3350 IRAtom* at;
3351 at = mkUifU64(mce, vatom1, vatom2);
3352 at = mkPCast8x8(mce, at);
3353 return at;
3356 static
3357 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3359 IRAtom* at;
3360 at = mkUifU64(mce, vatom1, vatom2);
3361 at = mkPCast16x4(mce, at);
3362 return at;
3365 static
3366 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3368 IRAtom* at;
3369 at = mkUifU64(mce, vatom1, vatom2);
3370 at = mkPCast32x2(mce, at);
3371 return at;
3374 static
3375 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3377 IRAtom* at;
3378 at = mkUifU64(mce, vatom1, vatom2);
3379 at = mkPCastTo(mce, Ity_I64, at);
3380 return at;
3383 /* --- 32-bit versions --- */
3385 static
3386 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3388 IRAtom* at;
3389 at = mkUifU32(mce, vatom1, vatom2);
3390 at = mkPCast8x4(mce, at);
3391 return at;
3394 static
3395 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3397 IRAtom* at;
3398 at = mkUifU32(mce, vatom1, vatom2);
3399 at = mkPCast16x2(mce, at);
3400 return at;
3404 /*------------------------------------------------------------*/
3405 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3406 /*------------------------------------------------------------*/
3408 static
3409 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3410 IROp op,
3411 IRAtom* atom1, IRAtom* atom2,
3412 IRAtom* atom3, IRAtom* atom4 )
3414 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3415 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3416 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3417 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3419 tl_assert(isOriginalAtom(mce,atom1));
3420 tl_assert(isOriginalAtom(mce,atom2));
3421 tl_assert(isOriginalAtom(mce,atom3));
3422 tl_assert(isOriginalAtom(mce,atom4));
3423 tl_assert(isShadowAtom(mce,vatom1));
3424 tl_assert(isShadowAtom(mce,vatom2));
3425 tl_assert(isShadowAtom(mce,vatom3));
3426 tl_assert(isShadowAtom(mce,vatom4));
3427 tl_assert(sameKindedAtoms(atom1,vatom1));
3428 tl_assert(sameKindedAtoms(atom2,vatom2));
3429 tl_assert(sameKindedAtoms(atom3,vatom3));
3430 tl_assert(sameKindedAtoms(atom4,vatom4));
3431 switch (op) {
3432 case Iop_MAddF64:
3433 case Iop_MAddF64r32:
3434 case Iop_MSubF64:
3435 case Iop_MSubF64r32:
3436 /* I32(rm) x F64 x F64 x F64 -> F64 */
3437 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3439 case Iop_MAddF32:
3440 case Iop_MSubF32:
3441 /* I32(rm) x F32 x F32 x F32 -> F32 */
3442 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3444 case Iop_MAddF128:
3445 case Iop_MSubF128:
3446 case Iop_NegMAddF128:
3447 case Iop_NegMSubF128:
3448 /* I32(rm) x F128 x F128 x F128 -> F128 */
3449 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3451 /* V256-bit data-steering */
3452 case Iop_64x4toV256:
3453 return assignNew('V', mce, Ity_V256,
3454 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3456 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3457 case Iop_Rotx32:
3458 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3459 case Iop_Rotx64:
3460 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3461 default:
3462 ppIROp(op);
3463 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3468 static
3469 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3470 IROp op,
3471 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3473 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3474 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3475 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3477 tl_assert(isOriginalAtom(mce,atom1));
3478 tl_assert(isOriginalAtom(mce,atom2));
3479 tl_assert(isOriginalAtom(mce,atom3));
3480 tl_assert(isShadowAtom(mce,vatom1));
3481 tl_assert(isShadowAtom(mce,vatom2));
3482 tl_assert(isShadowAtom(mce,vatom3));
3483 tl_assert(sameKindedAtoms(atom1,vatom1));
3484 tl_assert(sameKindedAtoms(atom2,vatom2));
3485 tl_assert(sameKindedAtoms(atom3,vatom3));
3486 switch (op) {
3487 case Iop_AddF128:
3488 case Iop_SubF128:
3489 case Iop_MulF128:
3490 case Iop_DivF128:
3491 case Iop_AddD128:
3492 case Iop_SubD128:
3493 case Iop_MulD128:
3494 case Iop_DivD128:
3495 case Iop_QuantizeD128:
3496 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3497 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3498 case Iop_AddF64:
3499 case Iop_AddD64:
3500 case Iop_AddF64r32:
3501 case Iop_SubF64:
3502 case Iop_SubD64:
3503 case Iop_SubF64r32:
3504 case Iop_MulF64:
3505 case Iop_MulD64:
3506 case Iop_MulF64r32:
3507 case Iop_DivF64:
3508 case Iop_DivD64:
3509 case Iop_DivF64r32:
3510 case Iop_ScaleF64:
3511 case Iop_Yl2xF64:
3512 case Iop_Yl2xp1F64:
3513 case Iop_AtanF64:
3514 case Iop_PRemF64:
3515 case Iop_PRem1F64:
3516 case Iop_QuantizeD64:
3517 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3518 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3519 case Iop_PRemC3210F64:
3520 case Iop_PRem1C3210F64:
3521 /* I32(rm) x F64 x F64 -> I32 */
3522 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3523 case Iop_AddF32:
3524 case Iop_SubF32:
3525 case Iop_MulF32:
3526 case Iop_DivF32:
3527 /* I32(rm) x F32 x F32 -> F32 */
3528 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3529 case Iop_AddF16:
3530 case Iop_SubF16:
3531 /* I32(rm) x F16 x F16 -> F16 */
3532 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3533 case Iop_SignificanceRoundD64:
3534 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3535 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3536 case Iop_SignificanceRoundD128:
3537 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3538 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3539 case Iop_SliceV128:
3540 /* (V128, V128, I8) -> V128 */
3541 complainIfUndefined(mce, atom3, NULL);
3542 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3543 case Iop_Slice64:
3544 /* (I64, I64, I8) -> I64 */
3545 complainIfUndefined(mce, atom3, NULL);
3546 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3547 case Iop_SetElem8x8:
3548 case Iop_SetElem16x4:
3549 case Iop_SetElem32x2:
3550 complainIfUndefined(mce, atom2, NULL);
3551 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3553 case Iop_SetElem8x16:
3554 case Iop_SetElem16x8:
3555 case Iop_SetElem32x4:
3556 case Iop_SetElem64x2:
3557 complainIfUndefined(mce, atom2, NULL);
3558 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
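/* Note (descriptive comment only): for the Slice/SetElem cases above,
   and likewise for the GetElem and immediate-shift cases elsewhere in
   this function, the index/amount operand is checked eagerly with
   complainIfUndefined and then the *original* atom, not its shadow, is
   passed to the rebuilt operation.  Having already reported any
   undefinedness in that operand, the instrumentation treats it as
   defined from then on. */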
3560 /* Int 128-bit Integer three arg */
3561 case Iop_2xMultU64Add128CarryOut:
3562 case Iop_Perm8x16x2:
3563 /* (V128, V128, V128) -> V128 */
3564 complainIfUndefined(mce, atom3, NULL);
3565 return mkUifUV128(
3566 mce,
3567 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3568 mkPCast8x16(mce, vatom3)
3571 /* Vector FP with rounding mode as the first arg */
3572 case Iop_Add64Fx2:
3573 case Iop_Sub64Fx2:
3574 case Iop_Mul64Fx2:
3575 case Iop_Div64Fx2:
3576 case Iop_Scale2_64Fx2:
3577 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3579 case Iop_Add32Fx4:
3580 case Iop_Sub32Fx4:
3581 case Iop_Mul32Fx4:
3582 case Iop_Div32Fx4:
3583 case Iop_Scale2_32Fx4:
3584 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3586 case Iop_Add64Fx4:
3587 case Iop_Sub64Fx4:
3588 case Iop_Mul64Fx4:
3589 case Iop_Div64Fx4:
3590 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3592 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3593 IR is implemented.
3595 case Iop_Add16Fx8:
3596 case Iop_Sub16Fx8:
3597 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3599 case Iop_Add32Fx8:
3600 case Iop_Sub32Fx8:
3601 case Iop_Mul32Fx8:
3602 case Iop_Div32Fx8:
3603 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3605 case Iop_F32x4_2toQ16x8:
3606 return assignNew('V', mce, Ity_V128,
3607 binop(Iop_PackEvenLanes16x8,
3608 unary32Fx4_w_rm(mce, vatom1, vatom2),
3609 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3610 case Iop_F64x2_2toQ32x4:
3611 return assignNew('V', mce, Ity_V128,
3612 binop(Iop_PackEvenLanes32x4,
3613 unary64Fx2_w_rm(mce, vatom1, vatom2),
3614 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3616 default:
3617 ppIROp(op);
3618 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3623 static
3624 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3625 IROp op,
3626 IRAtom* atom1, IRAtom* atom2,
3627 HowUsed hu/*use HuOth if unknown*/ )
3629 IRType and_or_ty = Ity_INVALID;
3630 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3631 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3632 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3634 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3635 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3637 tl_assert(isOriginalAtom(mce,atom1));
3638 tl_assert(isOriginalAtom(mce,atom2));
3639 tl_assert(isShadowAtom(mce,vatom1));
3640 tl_assert(isShadowAtom(mce,vatom2));
3641 tl_assert(sameKindedAtoms(atom1,vatom1));
3642 tl_assert(sameKindedAtoms(atom2,vatom2));
3643 switch (op) {
3645 /* 32-bit SIMD */
3647 case Iop_Add16x2:
3648 case Iop_HAdd16Ux2:
3649 case Iop_HAdd16Sx2:
3650 case Iop_Sub16x2:
3651 case Iop_HSub16Ux2:
3652 case Iop_HSub16Sx2:
3653 case Iop_QAdd16Sx2:
3654 case Iop_QSub16Sx2:
3655 case Iop_QSub16Ux2:
3656 case Iop_QAdd16Ux2:
3657 return binary16Ix2(mce, vatom1, vatom2);
3659 case Iop_Add8x4:
3660 case Iop_HAdd8Ux4:
3661 case Iop_HAdd8Sx4:
3662 case Iop_Sub8x4:
3663 case Iop_HSub8Ux4:
3664 case Iop_HSub8Sx4:
3665 case Iop_QSub8Ux4:
3666 case Iop_QAdd8Ux4:
3667 case Iop_QSub8Sx4:
3668 case Iop_QAdd8Sx4:
3669 return binary8Ix4(mce, vatom1, vatom2);
3671 /* 64-bit SIMD */
3673 case Iop_ShrN8x8:
3674 case Iop_ShrN16x4:
3675 case Iop_ShrN32x2:
3676 case Iop_SarN8x8:
3677 case Iop_SarN16x4:
3678 case Iop_SarN32x2:
3679 case Iop_ShlN16x4:
3680 case Iop_ShlN32x2:
3681 case Iop_ShlN8x8:
3682 /* Same scheme as with all other shifts. */
3683 complainIfUndefined(mce, atom2, NULL);
3684 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3686 case Iop_QNarrowBin32Sto16Sx4:
3687 case Iop_QNarrowBin16Sto8Sx8:
3688 case Iop_QNarrowBin16Sto8Ux8:
3689 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3691 case Iop_Min8Ux8:
3692 case Iop_Min8Sx8:
3693 case Iop_Max8Ux8:
3694 case Iop_Max8Sx8:
3695 case Iop_Avg8Ux8:
3696 case Iop_QSub8Sx8:
3697 case Iop_QSub8Ux8:
3698 case Iop_Sub8x8:
3699 case Iop_CmpGT8Sx8:
3700 case Iop_CmpGT8Ux8:
3701 case Iop_CmpEQ8x8:
3702 case Iop_QAdd8Sx8:
3703 case Iop_QAdd8Ux8:
3704 case Iop_QSal8x8:
3705 case Iop_QShl8x8:
3706 case Iop_Add8x8:
3707 case Iop_Mul8x8:
3708 case Iop_PolynomialMul8x8:
3709 return binary8Ix8(mce, vatom1, vatom2);
3711 case Iop_Min16Sx4:
3712 case Iop_Min16Ux4:
3713 case Iop_Max16Sx4:
3714 case Iop_Max16Ux4:
3715 case Iop_Avg16Ux4:
3716 case Iop_QSub16Ux4:
3717 case Iop_QSub16Sx4:
3718 case Iop_Sub16x4:
3719 case Iop_Mul16x4:
3720 case Iop_MulHi16Sx4:
3721 case Iop_MulHi16Ux4:
3722 case Iop_CmpGT16Sx4:
3723 case Iop_CmpGT16Ux4:
3724 case Iop_CmpEQ16x4:
3725 case Iop_QAdd16Sx4:
3726 case Iop_QAdd16Ux4:
3727 case Iop_QSal16x4:
3728 case Iop_QShl16x4:
3729 case Iop_Add16x4:
3730 case Iop_QDMulHi16Sx4:
3731 case Iop_QRDMulHi16Sx4:
3732 return binary16Ix4(mce, vatom1, vatom2);
3734 case Iop_Sub32x2:
3735 case Iop_Mul32x2:
3736 case Iop_Max32Sx2:
3737 case Iop_Max32Ux2:
3738 case Iop_Min32Sx2:
3739 case Iop_Min32Ux2:
3740 case Iop_CmpGT32Sx2:
3741 case Iop_CmpGT32Ux2:
3742 case Iop_CmpEQ32x2:
3743 case Iop_Add32x2:
3744 case Iop_QAdd32Ux2:
3745 case Iop_QAdd32Sx2:
3746 case Iop_QSub32Ux2:
3747 case Iop_QSub32Sx2:
3748 case Iop_QSal32x2:
3749 case Iop_QShl32x2:
3750 case Iop_QDMulHi32Sx2:
3751 case Iop_QRDMulHi32Sx2:
3752 return binary32Ix2(mce, vatom1, vatom2);
3754 case Iop_QSub64Ux1:
3755 case Iop_QSub64Sx1:
3756 case Iop_QAdd64Ux1:
3757 case Iop_QAdd64Sx1:
3758 case Iop_QSal64x1:
3759 case Iop_QShl64x1:
3760 case Iop_Sal64x1:
3761 return binary64Ix1(mce, vatom1, vatom2);
3763 case Iop_QShlNsatSU8x8:
3764 case Iop_QShlNsatUU8x8:
3765 case Iop_QShlNsatSS8x8:
3766 complainIfUndefined(mce, atom2, NULL);
3767 return mkPCast8x8(mce, vatom1);
3769 case Iop_QShlNsatSU16x4:
3770 case Iop_QShlNsatUU16x4:
3771 case Iop_QShlNsatSS16x4:
3772 complainIfUndefined(mce, atom2, NULL);
3773 return mkPCast16x4(mce, vatom1);
3775 case Iop_QShlNsatSU32x2:
3776 case Iop_QShlNsatUU32x2:
3777 case Iop_QShlNsatSS32x2:
3778 complainIfUndefined(mce, atom2, NULL);
3779 return mkPCast32x2(mce, vatom1);
3781 case Iop_QShlNsatSU64x1:
3782 case Iop_QShlNsatUU64x1:
3783 case Iop_QShlNsatSS64x1:
3784 complainIfUndefined(mce, atom2, NULL);
3785 return mkPCast32x2(mce, vatom1);
3787 case Iop_PwMax32Sx2:
3788 case Iop_PwMax32Ux2:
3789 case Iop_PwMin32Sx2:
3790 case Iop_PwMin32Ux2:
3791 case Iop_PwMax32Fx2:
3792 case Iop_PwMin32Fx2:
3793 return assignNew('V', mce, Ity_I64,
3794 binop(Iop_PwMax32Ux2,
3795 mkPCast32x2(mce, vatom1),
3796 mkPCast32x2(mce, vatom2)));
3798 case Iop_PwMax16Sx4:
3799 case Iop_PwMax16Ux4:
3800 case Iop_PwMin16Sx4:
3801 case Iop_PwMin16Ux4:
3802 return assignNew('V', mce, Ity_I64,
3803 binop(Iop_PwMax16Ux4,
3804 mkPCast16x4(mce, vatom1),
3805 mkPCast16x4(mce, vatom2)));
3807 case Iop_PwMax8Sx8:
3808 case Iop_PwMax8Ux8:
3809 case Iop_PwMin8Sx8:
3810 case Iop_PwMin8Ux8:
3811 return assignNew('V', mce, Ity_I64,
3812 binop(Iop_PwMax8Ux8,
3813 mkPCast8x8(mce, vatom1),
3814 mkPCast8x8(mce, vatom2)));
3816 case Iop_PwAdd32x2:
3817 case Iop_PwAdd32Fx2:
3818 return mkPCast32x2(mce,
3819 assignNew('V', mce, Ity_I64,
3820 binop(Iop_PwAdd32x2,
3821 mkPCast32x2(mce, vatom1),
3822 mkPCast32x2(mce, vatom2))));
3824 case Iop_PwAdd16x4:
3825 return mkPCast16x4(mce,
3826 assignNew('V', mce, Ity_I64,
3827 binop(op, mkPCast16x4(mce, vatom1),
3828 mkPCast16x4(mce, vatom2))));
3830 case Iop_PwAdd8x8:
3831 return mkPCast8x8(mce,
3832 assignNew('V', mce, Ity_I64,
3833 binop(op, mkPCast8x8(mce, vatom1),
3834 mkPCast8x8(mce, vatom2))));
3836 case Iop_Shl8x8:
3837 case Iop_Shr8x8:
3838 case Iop_Sar8x8:
3839 case Iop_Sal8x8:
3840 return mkUifU64(mce,
3841 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3842 mkPCast8x8(mce,vatom2)
3845 case Iop_Shl16x4:
3846 case Iop_Shr16x4:
3847 case Iop_Sar16x4:
3848 case Iop_Sal16x4:
3849 return mkUifU64(mce,
3850 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3851 mkPCast16x4(mce,vatom2)
3854 case Iop_Shl32x2:
3855 case Iop_Shr32x2:
3856 case Iop_Sar32x2:
3857 case Iop_Sal32x2:
3858 return mkUifU64(mce,
3859 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3860 mkPCast32x2(mce,vatom2)
3863 /* 64-bit data-steering */
3864 case Iop_InterleaveLO32x2:
3865 case Iop_InterleaveLO16x4:
3866 case Iop_InterleaveLO8x8:
3867 case Iop_InterleaveHI32x2:
3868 case Iop_InterleaveHI16x4:
3869 case Iop_InterleaveHI8x8:
3870 case Iop_CatOddLanes8x8:
3871 case Iop_CatEvenLanes8x8:
3872 case Iop_CatOddLanes16x4:
3873 case Iop_CatEvenLanes16x4:
3874 case Iop_InterleaveOddLanes8x8:
3875 case Iop_InterleaveEvenLanes8x8:
3876 case Iop_InterleaveOddLanes16x4:
3877 case Iop_InterleaveEvenLanes16x4:
3878 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3880 case Iop_GetElem8x8:
3881 complainIfUndefined(mce, atom2, NULL);
3882 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3883 case Iop_GetElem16x4:
3884 complainIfUndefined(mce, atom2, NULL);
3885 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3886 case Iop_GetElem32x2:
3887 complainIfUndefined(mce, atom2, NULL);
3888 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3890 /* Perm8x8: rearrange values in left arg using steering values from
3891 right arg. So rearrange the vbits in the same way but pessimise wrt
3892 steering values. We assume that unused bits in the steering value
3893 are defined zeros, so we can safely PCast within each lane of the
3894 steering value without having to take precautions to avoid a
3895 dependency on those unused bits.
3897 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3898 each lane, if bit 7 of the steering value is zero, then we'll steer
3899 the shadow value exactly as per Perm8x8. If that bit is one, then
3900 the operation will set the resulting (concrete) value to zero. That
3901 means it is defined, and should have a shadow value of zero. Hence
3902 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3903 as Perm8x8) and then pessimise against the steering values. */
3904 case Iop_Perm8x8:
3905 case Iop_PermOrZero8x8:
3906 return mkUifU64(
3907 mce,
3908 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3909 mkPCast8x8(mce, vatom2)
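/* Worked example for the Perm8x8/PermOrZero8x8 scheme above
   (illustration only; the lane chosen is hypothetical): suppose lane 3
   of the steering operand has an undefined bit, and everything else is
   defined.  Then
       binop(op, vatom1, atom2)  permutes the data shadows exactly as
                                 the concrete bytes are permuted, and
       mkPCast8x8(mce, vatom2)   is 0x00000000FF000000 (lane 3 all ones),
   so the UifU marks all of result lane 3 as undefined -- we cannot know
   which source byte (if any) it selected -- while every other lane keeps
   the shadow of whichever source byte was steered into it. */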
3912 /* V128-bit SIMD */
3914 case Iop_I32StoF32x4:
3915 case Iop_F32toI32Sx4:
3916 case Iop_Sqrt16Fx8:
3917 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3918 case Iop_Sqrt32Fx4:
3919 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3920 case Iop_Sqrt64Fx2:
3921 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3923 case Iop_ShrN8x16:
3924 case Iop_ShrN16x8:
3925 case Iop_ShrN32x4:
3926 case Iop_ShrN64x2:
3927 case Iop_SarN8x16:
3928 case Iop_SarN16x8:
3929 case Iop_SarN32x4:
3930 case Iop_SarN64x2:
3931 case Iop_ShlN8x16:
3932 case Iop_ShlN16x8:
3933 case Iop_ShlN32x4:
3934 case Iop_ShlN64x2:
3935 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3936 this is wrong now, scalar shifts are done properly lazily.
3937 Vector shifts should be fixed too. */
3938 complainIfUndefined(mce, atom2, NULL);
3939 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3941 /* V x V shifts/rotates are done using the standard lazy scheme. */
3942 /* For the non-rounding variants of bi-di vector x vector
3943 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3944 But note that this is overly pessimistic, because in fact only
3945 the bottom 8 bits of each lane of the second argument are taken
3946 into account when shifting. So really we ought to ignore
3947 undefinedness in bits 8 and above of each lane in the
3948 second argument. */
3949 case Iop_Shl8x16:
3950 case Iop_Shr8x16:
3951 case Iop_Sar8x16:
3952 case Iop_Sal8x16:
3953 case Iop_Rol8x16:
3954 case Iop_Sh8Sx16:
3955 case Iop_Sh8Ux16:
3956 return mkUifUV128(mce,
3957 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3958 mkPCast8x16(mce,vatom2)
3961 case Iop_Shl16x8:
3962 case Iop_Shr16x8:
3963 case Iop_Sar16x8:
3964 case Iop_Sal16x8:
3965 case Iop_Rol16x8:
3966 case Iop_Sh16Sx8:
3967 case Iop_Sh16Ux8:
3968 return mkUifUV128(mce,
3969 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3970 mkPCast16x8(mce,vatom2)
3973 case Iop_Shl32x4:
3974 case Iop_Shr32x4:
3975 case Iop_Sar32x4:
3976 case Iop_Sal32x4:
3977 case Iop_Rol32x4:
3978 case Iop_Sh32Sx4:
3979 case Iop_Sh32Ux4:
3980 return mkUifUV128(mce,
3981 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3982 mkPCast32x4(mce,vatom2)
3985 case Iop_Shl64x2:
3986 case Iop_Shr64x2:
3987 case Iop_Sar64x2:
3988 case Iop_Sal64x2:
3989 case Iop_Rol64x2:
3990 case Iop_Sh64Sx2:
3991 case Iop_Sh64Ux2:
3992 return mkUifUV128(mce,
3993 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3994 mkPCast64x2(mce,vatom2)
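/* Illustration of the pessimism described above (example values only):
   for Iop_Sh32Sx4, suppose one lane of the shift-amount shadow vatom2
   is 0x80000000 -- an undefined sign bit but a fully defined low byte.
   mkPCast32x4 still turns that whole lane into all ones, so the
   corresponding result lane is reported as undefined even though the
   bits that actually determine the shift amount (the low 8 bits of
   that lane) were fully defined. */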
3997 /* For the rounding variants of bi-di vector x vector shifts, the
3998 rounding adjustment can cause undefinedness to propagate through
3999 the entire lane, in the worst case. Too complex to handle
4000 properly .. just UifU the arguments and then PCast them.
4001 Suboptimal but safe. */
4002 case Iop_Rsh8Sx16:
4003 case Iop_Rsh8Ux16:
4004 return binary8Ix16(mce, vatom1, vatom2);
4005 case Iop_Rsh16Sx8:
4006 case Iop_Rsh16Ux8:
4007 return binary16Ix8(mce, vatom1, vatom2);
4008 case Iop_Rsh32Sx4:
4009 case Iop_Rsh32Ux4:
4010 return binary32Ix4(mce, vatom1, vatom2);
4011 case Iop_Rsh64Sx2:
4012 case Iop_Rsh64Ux2:
4013 return binary64Ix2(mce, vatom1, vatom2);
4015 case Iop_F32ToFixed32Ux4_RZ:
4016 case Iop_F32ToFixed32Sx4_RZ:
4017 case Iop_Fixed32UToF32x4_RN:
4018 case Iop_Fixed32SToF32x4_RN:
4019 complainIfUndefined(mce, atom2, NULL);
4020 return mkPCast32x4(mce, vatom1);
4022 case Iop_F32ToFixed32Ux2_RZ:
4023 case Iop_F32ToFixed32Sx2_RZ:
4024 case Iop_Fixed32UToF32x2_RN:
4025 case Iop_Fixed32SToF32x2_RN:
4026 complainIfUndefined(mce, atom2, NULL);
4027 return mkPCast32x2(mce, vatom1);
4029 case Iop_QSub8Ux16:
4030 case Iop_QSub8Sx16:
4031 case Iop_Sub8x16:
4032 case Iop_Min8Ux16:
4033 case Iop_Min8Sx16:
4034 case Iop_Max8Ux16:
4035 case Iop_Max8Sx16:
4036 case Iop_CmpEQ8x16:
4037 case Iop_Avg8Ux16:
4038 case Iop_Avg8Sx16:
4039 case Iop_QAdd8Ux16:
4040 case Iop_QAdd8Sx16:
4041 case Iop_QAddExtUSsatSS8x16:
4042 case Iop_QAddExtSUsatUU8x16:
4043 case Iop_QSal8x16:
4044 case Iop_QShl8x16:
4045 case Iop_Add8x16:
4046 case Iop_Mul8x16:
4047 case Iop_MulHi8Sx16:
4048 case Iop_MulHi8Ux16:
4049 case Iop_PolynomialMul8x16:
4050 case Iop_PolynomialMulAdd8x16:
4051 return binary8Ix16(mce, vatom1, vatom2);
4053 case Iop_QSub16Ux8:
4054 case Iop_QSub16Sx8:
4055 case Iop_Sub16x8:
4056 case Iop_Mul16x8:
4057 case Iop_MulHi16Sx8:
4058 case Iop_MulHi16Ux8:
4059 case Iop_Min16Sx8:
4060 case Iop_Min16Ux8:
4061 case Iop_Max16Sx8:
4062 case Iop_Max16Ux8:
4063 case Iop_CmpEQ16x8:
4064 case Iop_Avg16Ux8:
4065 case Iop_Avg16Sx8:
4066 case Iop_QAdd16Ux8:
4067 case Iop_QAdd16Sx8:
4068 case Iop_QAddExtUSsatSS16x8:
4069 case Iop_QAddExtSUsatUU16x8:
4070 case Iop_QSal16x8:
4071 case Iop_QShl16x8:
4072 case Iop_Add16x8:
4073 case Iop_QDMulHi16Sx8:
4074 case Iop_QRDMulHi16Sx8:
4075 case Iop_PolynomialMulAdd16x8:
4076 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
4077 16-bit chunk of the output is formed from corresponding 16-bit chunks
4078 of the input args, so we can treat it like any other binary 16x8
4079 operation. That's despite it having '8x16' in its name. */
4080 case Iop_PwExtUSMulQAdd8x16:
4081 return binary16Ix8(mce, vatom1, vatom2);
4083 case Iop_CmpGT64Sx2:
4084 case Iop_CmpGT64Ux2:
4085 case Iop_CmpGT32Sx4:
4086 case Iop_CmpGT32Ux4:
4087 case Iop_CmpGT16Sx8:
4088 case Iop_CmpGT16Ux8:
4089 case Iop_CmpGT8Sx16:
4090 case Iop_CmpGT8Ux16:
4091 return expensiveCmpGT(mce, op,
4092 vatom1, vatom2, atom1, atom2);
4093 case Iop_Sub32x4:
4094 case Iop_CmpEQ32x4:
4095 case Iop_QAdd32Sx4:
4096 case Iop_QAdd32Ux4:
4097 case Iop_QSub32Sx4:
4098 case Iop_QSub32Ux4:
4099 case Iop_QAddExtUSsatSS32x4:
4100 case Iop_QAddExtSUsatUU32x4:
4101 case Iop_QSal32x4:
4102 case Iop_QShl32x4:
4103 case Iop_Avg32Ux4:
4104 case Iop_Avg32Sx4:
4105 case Iop_Add32x4:
4106 case Iop_Max32Ux4:
4107 case Iop_Max32Sx4:
4108 case Iop_Min32Ux4:
4109 case Iop_Min32Sx4:
4110 case Iop_Mul32x4:
4111 case Iop_MulHi32Sx4:
4112 case Iop_MulHi32Ux4:
4113 case Iop_QDMulHi32Sx4:
4114 case Iop_QRDMulHi32Sx4:
4115 case Iop_PolynomialMulAdd32x4:
4116 return binary32Ix4(mce, vatom1, vatom2);
4118 case Iop_Sub64x2:
4119 case Iop_Add64x2:
4120 case Iop_Avg64Ux2:
4121 case Iop_Avg64Sx2:
4122 case Iop_Max64Sx2:
4123 case Iop_Max64Ux2:
4124 case Iop_Min64Sx2:
4125 case Iop_Min64Ux2:
4126 case Iop_CmpEQ64x2:
4127 case Iop_QSal64x2:
4128 case Iop_QShl64x2:
4129 case Iop_QAdd64Ux2:
4130 case Iop_QAdd64Sx2:
4131 case Iop_QSub64Ux2:
4132 case Iop_QSub64Sx2:
4133 case Iop_QAddExtUSsatSS64x2:
4134 case Iop_QAddExtSUsatUU64x2:
4135 case Iop_PolynomialMulAdd64x2:
4136 case Iop_CipherV128:
4137 case Iop_CipherLV128:
4138 case Iop_NCipherV128:
4139 case Iop_NCipherLV128:
4140 case Iop_MulI128by10E:
4141 case Iop_MulI128by10ECarry:
4142 return binary64Ix2(mce, vatom1, vatom2);
4144 case Iop_Add128x1:
4145 case Iop_Sub128x1:
4146 case Iop_CmpNEZ128x1:
4147 return binary128Ix1(mce, vatom1, vatom2);
4149 case Iop_DivU128:
4150 case Iop_DivS128:
4151 case Iop_DivU128E:
4152 case Iop_DivS128E:
4153 case Iop_ModU128:
4154 case Iop_ModS128:
4155 /* I128 x I128 -> I128 */
4156 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4158 case Iop_QNarrowBin64Sto32Sx4:
4159 case Iop_QNarrowBin64Uto32Ux4:
4160 case Iop_QNarrowBin32Sto16Sx8:
4161 case Iop_QNarrowBin32Uto16Ux8:
4162 case Iop_QNarrowBin32Sto16Ux8:
4163 case Iop_QNarrowBin16Sto8Sx16:
4164 case Iop_QNarrowBin16Uto8Ux16:
4165 case Iop_QNarrowBin16Sto8Ux16:
4166 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4168 case Iop_Min64Fx2:
4169 case Iop_Max64Fx2:
4170 case Iop_CmpLT64Fx2:
4171 case Iop_CmpLE64Fx2:
4172 case Iop_CmpEQ64Fx2:
4173 case Iop_CmpUN64Fx2:
4174 case Iop_RecipStep64Fx2:
4175 case Iop_RSqrtStep64Fx2:
4176 return binary64Fx2(mce, vatom1, vatom2);
4178 case Iop_CmpLT16Fx8:
4179 case Iop_CmpLE16Fx8:
4180 case Iop_CmpEQ16Fx8:
4181 return binary16Fx8(mce, vatom1, vatom2);
4183 case Iop_Sub64F0x2:
4184 case Iop_Mul64F0x2:
4185 case Iop_Min64F0x2:
4186 case Iop_Max64F0x2:
4187 case Iop_Div64F0x2:
4188 case Iop_CmpLT64F0x2:
4189 case Iop_CmpLE64F0x2:
4190 case Iop_CmpEQ64F0x2:
4191 case Iop_CmpUN64F0x2:
4192 case Iop_Add64F0x2:
4193 return binary64F0x2(mce, vatom1, vatom2);
4195 case Iop_Min32Fx4:
4196 case Iop_Max32Fx4:
4197 case Iop_CmpLT32Fx4:
4198 case Iop_CmpLE32Fx4:
4199 case Iop_CmpEQ32Fx4:
4200 case Iop_CmpUN32Fx4:
4201 case Iop_CmpGT32Fx4:
4202 case Iop_CmpGE32Fx4:
4203 case Iop_RecipStep32Fx4:
4204 case Iop_RSqrtStep32Fx4:
4205 return binary32Fx4(mce, vatom1, vatom2);
4207 case Iop_Sub32Fx2:
4208 case Iop_Mul32Fx2:
4209 case Iop_Min32Fx2:
4210 case Iop_Max32Fx2:
4211 case Iop_CmpEQ32Fx2:
4212 case Iop_CmpGT32Fx2:
4213 case Iop_CmpGE32Fx2:
4214 case Iop_Add32Fx2:
4215 case Iop_RecipStep32Fx2:
4216 case Iop_RSqrtStep32Fx2:
4217 return binary32Fx2(mce, vatom1, vatom2);
4219 case Iop_Sub32F0x4:
4220 case Iop_Mul32F0x4:
4221 case Iop_Min32F0x4:
4222 case Iop_Max32F0x4:
4223 case Iop_Div32F0x4:
4224 case Iop_CmpLT32F0x4:
4225 case Iop_CmpLE32F0x4:
4226 case Iop_CmpEQ32F0x4:
4227 case Iop_CmpUN32F0x4:
4228 case Iop_Add32F0x4:
4229 return binary32F0x4(mce, vatom1, vatom2);
4231 case Iop_QShlNsatSU8x16:
4232 case Iop_QShlNsatUU8x16:
4233 case Iop_QShlNsatSS8x16:
4234 complainIfUndefined(mce, atom2, NULL);
4235 return mkPCast8x16(mce, vatom1);
4237 case Iop_QShlNsatSU16x8:
4238 case Iop_QShlNsatUU16x8:
4239 case Iop_QShlNsatSS16x8:
4240 complainIfUndefined(mce, atom2, NULL);
4241 return mkPCast16x8(mce, vatom1);
4243 case Iop_QShlNsatSU32x4:
4244 case Iop_QShlNsatUU32x4:
4245 case Iop_QShlNsatSS32x4:
4246 complainIfUndefined(mce, atom2, NULL);
4247 return mkPCast32x4(mce, vatom1);
4249 case Iop_QShlNsatSU64x2:
4250 case Iop_QShlNsatUU64x2:
4251 case Iop_QShlNsatSS64x2:
4252 complainIfUndefined(mce, atom2, NULL);
4253 return mkPCast32x4(mce, vatom1);
4255 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4256 To make this simpler, do the following:
4257 * complain if the shift amount (the I8) is undefined
4258 * pcast each lane at the wide width
4259 * truncate each lane to half width
4260 * pcast the resulting 64-bit value to a single bit and use
4261 that as the least significant bit of the upper half of the
4262 result. */
4263 case Iop_QandQShrNnarrow64Uto32Ux2:
4264 case Iop_QandQSarNnarrow64Sto32Sx2:
4265 case Iop_QandQSarNnarrow64Sto32Ux2:
4266 case Iop_QandQRShrNnarrow64Uto32Ux2:
4267 case Iop_QandQRSarNnarrow64Sto32Sx2:
4268 case Iop_QandQRSarNnarrow64Sto32Ux2:
4269 case Iop_QandQShrNnarrow32Uto16Ux4:
4270 case Iop_QandQSarNnarrow32Sto16Sx4:
4271 case Iop_QandQSarNnarrow32Sto16Ux4:
4272 case Iop_QandQRShrNnarrow32Uto16Ux4:
4273 case Iop_QandQRSarNnarrow32Sto16Sx4:
4274 case Iop_QandQRSarNnarrow32Sto16Ux4:
4275 case Iop_QandQShrNnarrow16Uto8Ux8:
4276 case Iop_QandQSarNnarrow16Sto8Sx8:
4277 case Iop_QandQSarNnarrow16Sto8Ux8:
4278 case Iop_QandQRShrNnarrow16Uto8Ux8:
4279 case Iop_QandQRSarNnarrow16Sto8Sx8:
4280 case Iop_QandQRSarNnarrow16Sto8Ux8:
4282 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4283 IROp opNarrow = Iop_INVALID;
4284 switch (op) {
4285 case Iop_QandQShrNnarrow64Uto32Ux2:
4286 case Iop_QandQSarNnarrow64Sto32Sx2:
4287 case Iop_QandQSarNnarrow64Sto32Ux2:
4288 case Iop_QandQRShrNnarrow64Uto32Ux2:
4289 case Iop_QandQRSarNnarrow64Sto32Sx2:
4290 case Iop_QandQRSarNnarrow64Sto32Ux2:
4291 fnPessim = mkPCast64x2;
4292 opNarrow = Iop_NarrowUn64to32x2;
4293 break;
4294 case Iop_QandQShrNnarrow32Uto16Ux4:
4295 case Iop_QandQSarNnarrow32Sto16Sx4:
4296 case Iop_QandQSarNnarrow32Sto16Ux4:
4297 case Iop_QandQRShrNnarrow32Uto16Ux4:
4298 case Iop_QandQRSarNnarrow32Sto16Sx4:
4299 case Iop_QandQRSarNnarrow32Sto16Ux4:
4300 fnPessim = mkPCast32x4;
4301 opNarrow = Iop_NarrowUn32to16x4;
4302 break;
4303 case Iop_QandQShrNnarrow16Uto8Ux8:
4304 case Iop_QandQSarNnarrow16Sto8Sx8:
4305 case Iop_QandQSarNnarrow16Sto8Ux8:
4306 case Iop_QandQRShrNnarrow16Uto8Ux8:
4307 case Iop_QandQRSarNnarrow16Sto8Sx8:
4308 case Iop_QandQRSarNnarrow16Sto8Ux8:
4309 fnPessim = mkPCast16x8;
4310 opNarrow = Iop_NarrowUn16to8x8;
4311 break;
4312 default:
4313 tl_assert(0);
4315 complainIfUndefined(mce, atom2, NULL);
4316 // Pessimised shift result
4317 IRAtom* shV
4318 = fnPessim(mce, vatom1);
4319 // Narrowed, pessimised shift result
4320 IRAtom* shVnarrowed
4321 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4322 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4323 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4324 // and assemble the result
4325 return assignNew('V', mce, Ity_V128,
4326 binop(Iop_64HLtoV128, qV, shVnarrowed));
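/* Concrete instance of the recipe above (illustration only), for
   Iop_QandQShrNnarrow32Uto16Ux4:
     fnPessim   = mkPCast32x4, so shV pessimises vatom1 per 32-bit lane;
     opNarrow   = Iop_NarrowUn32to16x4, so shVnarrowed is those lanes
                  truncated to 16 bits and packed into an I64;
     qV         = 63 defined zeroes above PCast-to-I1(shVnarrowed).
   The final V128 shadow is 64HLtoV128(qV, shVnarrowed): the "Q"
   (saturation) half in the upper 64 bits, the narrowed shift result in
   the lower 64 bits. */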
4329 case Iop_Mull32Sx2:
4330 case Iop_Mull32Ux2:
4331 case Iop_QDMull32Sx2:
4332 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4333 mkUifU64(mce, vatom1, vatom2));
4335 case Iop_Mull16Sx4:
4336 case Iop_Mull16Ux4:
4337 case Iop_QDMull16Sx4:
4338 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4339 mkUifU64(mce, vatom1, vatom2));
4341 case Iop_Mull8Sx8:
4342 case Iop_Mull8Ux8:
4343 case Iop_PolynomialMull8x8:
4344 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4345 mkUifU64(mce, vatom1, vatom2));
4347 case Iop_PwAdd32x4:
4348 return mkPCast32x4(mce,
4349 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4350 mkPCast32x4(mce, vatom2))));
4352 case Iop_PwAdd16x8:
4353 return mkPCast16x8(mce,
4354 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4355 mkPCast16x8(mce, vatom2))));
4357 case Iop_PwAdd8x16:
4358 return mkPCast8x16(mce,
4359 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4360 mkPCast8x16(mce, vatom2))));
4362 /* V128-bit data-steering */
4363 case Iop_SetV128lo32:
4364 case Iop_SetV128lo64:
4365 case Iop_64HLtoV128:
4366 case Iop_InterleaveLO64x2:
4367 case Iop_InterleaveLO32x4:
4368 case Iop_InterleaveLO16x8:
4369 case Iop_InterleaveLO8x16:
4370 case Iop_InterleaveHI64x2:
4371 case Iop_InterleaveHI32x4:
4372 case Iop_InterleaveHI16x8:
4373 case Iop_InterleaveHI8x16:
4374 case Iop_CatOddLanes8x16:
4375 case Iop_CatOddLanes16x8:
4376 case Iop_CatOddLanes32x4:
4377 case Iop_CatEvenLanes8x16:
4378 case Iop_CatEvenLanes16x8:
4379 case Iop_CatEvenLanes32x4:
4380 case Iop_InterleaveOddLanes8x16:
4381 case Iop_InterleaveOddLanes16x8:
4382 case Iop_InterleaveOddLanes32x4:
4383 case Iop_InterleaveEvenLanes8x16:
4384 case Iop_InterleaveEvenLanes16x8:
4385 case Iop_InterleaveEvenLanes32x4:
4386 case Iop_PackOddLanes8x16:
4387 case Iop_PackOddLanes16x8:
4388 case Iop_PackOddLanes32x4:
4389 case Iop_PackEvenLanes8x16:
4390 case Iop_PackEvenLanes16x8:
4391 case Iop_PackEvenLanes32x4:
4392 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4394 case Iop_GetElem8x16:
4395 complainIfUndefined(mce, atom2, NULL);
4396 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4397 case Iop_GetElem16x8:
4398 complainIfUndefined(mce, atom2, NULL);
4399 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4400 case Iop_GetElem32x4:
4401 complainIfUndefined(mce, atom2, NULL);
4402 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4403 case Iop_GetElem64x2:
4404 complainIfUndefined(mce, atom2, NULL);
4405 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4407 /* Perm8x16: rearrange values in left arg using steering values
4408 from right arg. So rearrange the vbits in the same way but
4409 pessimise wrt steering values. Perm32x4 ditto. */
4410 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4411 case Iop_Perm8x16:
4412 case Iop_PermOrZero8x16:
4413 return mkUifUV128(
4414 mce,
4415 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4416 mkPCast8x16(mce, vatom2)
4418 case Iop_Perm32x4:
4419 return mkUifUV128(
4420 mce,
4421 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4422 mkPCast32x4(mce, vatom2)
4425 /* These two take the lower half of each 16-bit lane, sign/zero
4426 extend it to 32, and multiply together, producing a 32x4
4427 result (and implicitly ignoring half the operand bits). So
4428 treat it as a bunch of independent 16x8 operations, but then
4429 do 32-bit shifts left-right to copy the lower half results
4430 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4431 into the upper half of each result lane. */
4432 case Iop_MullEven16Ux8:
4433 case Iop_MullEven16Sx8: {
4434 IRAtom* at;
4435 at = binary16Ix8(mce,vatom1,vatom2);
4436 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4437 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4438 return at;
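/* Illustration of the shift trick above (descriptive comment only):
   binary16Ix8 leaves each 16-bit lane either all zeroes (defined) or
   all ones (undefined).  ShlN32x4 by 16 discards the irrelevant
   upper-half verdict and parks the lower-half verdict in the top 16
   bits of each 32-bit lane; SarN32x4 by 16 then sign-extends it back
   down, so every 32-bit result lane carries the lower-half verdict
   replicated across all 32 bits -- matching the fact that the MullEven
   ops only read the lower halves. */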
4441 /* Same deal as Iop_MullEven16{S,U}x8 */
4442 case Iop_MullEven8Ux16:
4443 case Iop_MullEven8Sx16: {
4444 IRAtom* at;
4445 at = binary8Ix16(mce,vatom1,vatom2);
4446 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4447 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4448 return at;
4451 /* Same deal as Iop_MullEven16{S,U}x8 */
4452 case Iop_MullEven32Ux4:
4453 case Iop_MullEven32Sx4: {
4454 IRAtom* at;
4455 at = binary32Ix4(mce,vatom1,vatom2);
4456 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4457 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4458 return at;
4461 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4462 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4463 Simply apply the same op to the V bits, since this is really no more
4464 than a data steering operation. */
4465 case Iop_NarrowBin32to16x8:
4466 case Iop_NarrowBin16to8x16:
4467 case Iop_NarrowBin64to32x4:
4468 return assignNew('V', mce, Ity_V128,
4469 binop(op, vatom1, vatom2));
4471 case Iop_ShrV128:
4472 case Iop_SarV128:
4473 case Iop_ShlV128:
4474 case Iop_I128StoBCD128:
4475 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4476 this is wrong now, scalar shifts are done properly lazily.
4477 Vector shifts should be fixed too. */
4478 complainIfUndefined(mce, atom2, NULL);
4479 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4481 case Iop_I128UtoF128: /* I128 -> F128 */
4482 case Iop_I128StoF128: /* I128 -> F128 */
4483 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4485 case Iop_BCDAdd:
4486 case Iop_BCDSub:
4487 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4489 /* SHA Iops */
4490 case Iop_SHA256:
4491 case Iop_SHA512:
4492 complainIfUndefined(mce, atom2, NULL);
4493 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4495 /* I128-bit data-steering */
4496 case Iop_64HLto128:
4497 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4499 /* V256-bit SIMD */
4501 case Iop_Max64Fx4:
4502 case Iop_Min64Fx4:
4503 return binary64Fx4(mce, vatom1, vatom2);
4505 case Iop_Max32Fx8:
4506 case Iop_Min32Fx8:
4507 return binary32Fx8(mce, vatom1, vatom2);
4509 /* V256-bit data-steering */
4510 case Iop_V128HLtoV256:
4511 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4513 /* Scalar floating point */
4515 case Iop_F32toI64S:
4516 case Iop_F32toI64U:
4517 /* I32(rm) x F32 -> I64 */
4518 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4520 case Iop_I64StoF32:
4521 /* I32(rm) x I64 -> F32 */
4522 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4524 case Iop_RoundF64toInt:
4525 case Iop_RoundF64toF32:
4526 case Iop_F64toI64S:
4527 case Iop_F64toI64U:
4528 case Iop_I64StoF64:
4529 case Iop_I64UtoF64:
4530 case Iop_SinF64:
4531 case Iop_CosF64:
4532 case Iop_TanF64:
4533 case Iop_2xm1F64:
4534 case Iop_SqrtF64:
4535 case Iop_RecpExpF64:
4536 /* I32(rm) x I64/F64 -> I64/F64 */
4537 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4539 case Iop_ShlD64:
4540 case Iop_ShrD64:
4541 case Iop_RoundD64toInt:
4542 /* I32(rm) x D64 -> D64 */
4543 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4545 case Iop_ShlD128:
4546 case Iop_ShrD128:
4547 case Iop_RoundD128toInt:
4548 /* I32(rm) x D128 -> D128 */
4549 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4551 case Iop_RoundF128toInt:
4552 /* I32(rm) x F128 -> F128 */
4553 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4555 case Iop_D64toI64S:
4556 case Iop_D64toI64U:
4557 case Iop_I64StoD64:
4558 case Iop_I64UtoD64:
4559 /* I32(rm) x I64/D64 -> D64/I64 */
4560 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4562 case Iop_F32toD32:
4563 case Iop_F64toD32:
4564 case Iop_F128toD32:
4565 case Iop_D32toF32:
4566 case Iop_D64toF32:
4567 case Iop_D128toF32:
4568 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4569 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4571 case Iop_F32toD64:
4572 case Iop_F64toD64:
4573 case Iop_F128toD64:
4574 case Iop_D32toF64:
4575 case Iop_D64toF64:
4576 case Iop_D128toF64:
4577 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4578 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4580 case Iop_F32toD128:
4581 case Iop_F64toD128:
4582 case Iop_F128toD128:
4583 case Iop_D32toF128:
4584 case Iop_D64toF128:
4585 case Iop_D128toF128:
4586 case Iop_I128StoD128:
4587 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4588 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4590 case Iop_SqrtF16:
4591 /* I32(rm) x F16 -> F16 */
4592 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4594 case Iop_RoundF32toInt:
4595 case Iop_SqrtF32:
4596 case Iop_RecpExpF32:
4597 /* I32(rm) x I32/F32 -> I32/F32 */
4598 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4600 case Iop_SqrtF128:
4601 /* I32(rm) x F128 -> F128 */
4602 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4604 case Iop_I32StoF32:
4605 case Iop_I32UtoF32:
4606 case Iop_F32toI32S:
4607 case Iop_F32toI32U:
4608 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4609 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4611 case Iop_F64toF16:
4612 case Iop_F32toF16:
4613 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4614 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4616 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4617 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4618 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4619 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4620 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4621 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4623 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4624 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4625 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4626 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4628 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4629 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4630 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4631 case Iop_D128toD64: /* IRRoundingMode(I32) x D128 -> D64 */
4632 case Iop_D128toI64S: /* IRRoundingMode(I32) x D128 -> signed I64 */
4633 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4634 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4636 case Iop_F64HLtoF128:
4637 case Iop_D64HLtoD128:
4638 return assignNew('V', mce, Ity_I128,
4639 binop(Iop_64HLto128, vatom1, vatom2));
4641 case Iop_F64toI32U:
4642 case Iop_F64toI32S:
4643 case Iop_F64toF32:
4644 case Iop_I64UtoF32:
4645 case Iop_D64toI32U:
4646 case Iop_D64toI32S:
4647 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4648 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4650 case Iop_D64toD32:
4651 /* First arg is I32 (rounding mode), second is D64 (data). */
4652 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4654 case Iop_F64toI16S:
4655 /* First arg is I32 (rounding mode), second is F64 (data). */
4656 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4658 case Iop_InsertExpD64:
4659 /* I64 x I64 -> D64 */
4660 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4662 case Iop_InsertExpD128:
4663 /* I64 x D128 -> D128 */
4664 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4666 case Iop_CmpF16:
4667 case Iop_CmpF32:
4668 case Iop_CmpF64:
4669 case Iop_CmpF128:
4670 case Iop_CmpD64:
4671 case Iop_CmpD128:
4672 case Iop_CmpExpD64:
4673 case Iop_CmpExpD128:
4674 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4676 case Iop_MaxNumF32:
4677 case Iop_MinNumF32:
4678 /* F32 x F32 -> F32 */
4679 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4681 case Iop_MaxNumF64:
4682 case Iop_MinNumF64:
4683 /* F64 x F64 -> F64 */
4684 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4686 /* non-FP after here */
4688 case Iop_DivModU64to32:
4689 case Iop_DivModS64to32:
4690 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4692 case Iop_DivModU128to64:
4693 case Iop_DivModS128to64:
4694 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4696 case Iop_8HLto16:
4697 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4698 case Iop_16HLto32:
4699 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4700 case Iop_32HLto64:
4701 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4703 case Iop_DivModU64to64:
4704 case Iop_DivModS64to64: {
4705 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4706 return assignNew('V', mce, Ity_I128,
4707 binop(Iop_64HLto128, vTmp64, vTmp64));
4710 case Iop_MullS64:
4711 case Iop_MullU64: {
4712 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4713 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4714 return assignNew('V', mce, Ity_I128,
4715 binop(Iop_64HLto128, vHi64, vLo64));
4718 case Iop_DivModU32to32:
4719 case Iop_DivModS32to32: {
4720 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4721 return assignNew('V', mce, Ity_I64,
4722 binop(Iop_32HLto64, vTmp32, vTmp32));
4725 case Iop_MullS32:
4726 case Iop_MullU32: {
4727 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4728 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4729 return assignNew('V', mce, Ity_I64,
4730 binop(Iop_32HLto64, vHi32, vLo32));
4733 case Iop_MullS16:
4734 case Iop_MullU16: {
4735 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4736 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4737 return assignNew('V', mce, Ity_I32,
4738 binop(Iop_16HLto32, vHi16, vLo16));
4741 case Iop_MullS8:
4742 case Iop_MullU8: {
4743 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4744 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4745 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4748 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4749 case Iop_DivS32:
4750 case Iop_DivU32:
4751 case Iop_DivU32E:
4752 case Iop_DivS32E:
4753 case Iop_QAdd32S: /* could probably do better */
4754 case Iop_QSub32S: /* could probably do better */
4755 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4757 case Iop_DivS64:
4758 case Iop_DivU64:
4759 case Iop_DivS64E:
4760 case Iop_DivU64E:
4761 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4763 case Iop_Add32:
4764 if (mce->dlbo.dl_Add32 == DLexpensive
4765 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4766 return expensiveAddSub(mce,True,Ity_I32,
4767 vatom1,vatom2, atom1,atom2);
4768 } else {
4769 goto cheap_AddSub32;
4771 case Iop_Sub32:
4772 if (mce->dlbo.dl_Sub32 == DLexpensive
4773 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4774 return expensiveAddSub(mce,False,Ity_I32,
4775 vatom1,vatom2, atom1,atom2);
4776 } else {
4777 goto cheap_AddSub32;
4780 cheap_AddSub32:
4781 case Iop_Mul32:
4782 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4784 case Iop_CmpORD32S:
4785 case Iop_CmpORD32U:
4786 case Iop_CmpORD64S:
4787 case Iop_CmpORD64U:
4788 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4790 case Iop_Add64:
4791 if (mce->dlbo.dl_Add64 == DLexpensive
4792 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4793 return expensiveAddSub(mce,True,Ity_I64,
4794 vatom1,vatom2, atom1,atom2);
4795 } else {
4796 goto cheap_AddSub64;
4798 case Iop_Sub64:
4799 if (mce->dlbo.dl_Sub64 == DLexpensive
4800 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4801 return expensiveAddSub(mce,False,Ity_I64,
4802 vatom1,vatom2, atom1,atom2);
4803 } else {
4804 goto cheap_AddSub64;
4807 cheap_AddSub64:
4808 case Iop_Mul64:
4809 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4811 case Iop_Mul16:
4812 case Iop_Add16:
4813 case Iop_Sub16:
4814 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4816 case Iop_Mul8:
4817 case Iop_Sub8:
4818 case Iop_Add8:
4819 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4821 ////---- CmpXX64
4822 case Iop_CmpEQ64: case Iop_CmpNE64:
4823 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4824 goto expensive_cmp64;
4825 else
4826 goto cheap_cmp64;
4828 expensive_cmp64:
4829 case Iop_ExpCmpNE64:
4830 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4832 cheap_cmp64:
4833 case Iop_CmpLE64S: case Iop_CmpLE64U:
4834 case Iop_CmpLT64U: case Iop_CmpLT64S:
4835 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4837 ////---- CmpXX32
4838 case Iop_CmpEQ32: case Iop_CmpNE32:
4839 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4840 goto expensive_cmp32;
4841 else
4842 goto cheap_cmp32;
4844 expensive_cmp32:
4845 case Iop_ExpCmpNE32:
4846 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4848 cheap_cmp32:
4849 case Iop_CmpLE32S: case Iop_CmpLE32U:
4850 case Iop_CmpLT32U: case Iop_CmpLT32S:
4851 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4853 ////---- CmpXX16
4854 case Iop_CmpEQ16: case Iop_CmpNE16:
4855 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4856 goto expensive_cmp16;
4857 else
4858 goto cheap_cmp16;
4860 expensive_cmp16:
4861 case Iop_ExpCmpNE16:
4862 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4864 cheap_cmp16:
4865 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4867 ////---- CmpXX8
4868 case Iop_CmpEQ8: case Iop_CmpNE8:
4869 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4870 goto expensive_cmp8;
4871 else
4872 goto cheap_cmp8;
4874 expensive_cmp8:
4875 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4877 cheap_cmp8:
4878 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4880 ////---- end CmpXX{64,32,16,8}
4882 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4883 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4884 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4885 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4886 /* Just say these all produce a defined result, regardless
4887 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4888 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4890 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4891 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4893 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4894 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4896 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4897 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4899 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4900 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4902 case Iop_AndV256:
4903 uifu = mkUifUV256; difd = mkDifDV256;
4904 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4905 case Iop_AndV128:
4906 uifu = mkUifUV128; difd = mkDifDV128;
4907 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4908 case Iop_And64:
4909 uifu = mkUifU64; difd = mkDifD64;
4910 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4911 case Iop_And32:
4912 uifu = mkUifU32; difd = mkDifD32;
4913 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4914 case Iop_And16:
4915 uifu = mkUifU16; difd = mkDifD16;
4916 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4917 case Iop_And8:
4918 uifu = mkUifU8; difd = mkDifD8;
4919 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4920 case Iop_And1:
4921 uifu = mkUifU1; difd = mkDifD1;
4922 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4924 case Iop_OrV256:
4925 uifu = mkUifUV256; difd = mkDifDV256;
4926 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4927 case Iop_OrV128:
4928 uifu = mkUifUV128; difd = mkDifDV128;
4929 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4930 case Iop_Or64:
4931 uifu = mkUifU64; difd = mkDifD64;
4932 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4933 case Iop_Or32:
4934 uifu = mkUifU32; difd = mkDifD32;
4935 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4936 case Iop_Or16:
4937 uifu = mkUifU16; difd = mkDifD16;
4938 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4939 case Iop_Or8:
4940 uifu = mkUifU8; difd = mkDifD8;
4941 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4942 case Iop_Or1:
4943 uifu = mkUifU1; difd = mkDifD1;
4944 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4946 do_And_Or:
4947 return assignNew('V', mce, and_or_ty,
4948 difd(mce, uifu(mce, vatom1, vatom2),
4949 difd(mce, improve(mce, atom1, vatom1),
4950 improve(mce, atom2, vatom2) ) ) );
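/* Worked example for the And/Or scheme above (illustration only; the
   values are hypothetical), for Iop_And8 with 0 = defined, 1 = undefined
   in the shadows:
       atom1 = 0x0F, vatom1 = 0xF0   (low nibble defined ones)
       atom2 = 0x00, vatom2 = 0x0F   (high nibble defined zeroes)
   then
       uifu(v1,v2)              = 0xFF
       improve(a1,v1) = a1 | v1 = 0xFF
       improve(a2,v2) = a2 | v2 = 0x0F
       difd of the three        = 0x0F
   i.e. the high nibble of the result is defined, because atom2 supplies
   defined zeroes there and (x & 0) is 0 whatever x is; only the low
   nibble, where atom2's undefined bits actually matter, stays undefined.
   For the Or cases the improvement term is (~a | v) instead, so a
   defined one forces a defined one in the result. */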
4952 case Iop_Xor8:
4953 return mkUifU8(mce, vatom1, vatom2);
4954 case Iop_Xor16:
4955 return mkUifU16(mce, vatom1, vatom2);
4956 case Iop_Xor32:
4957 return mkUifU32(mce, vatom1, vatom2);
4958 case Iop_Xor64:
4959 return mkUifU64(mce, vatom1, vatom2);
4960 case Iop_XorV128:
4961 return mkUifUV128(mce, vatom1, vatom2);
4962 case Iop_XorV256:
4963 return mkUifUV256(mce, vatom1, vatom2);
4965 /* V256-bit SIMD */
4967 case Iop_ShrN16x16:
4968 case Iop_ShrN32x8:
4969 case Iop_ShrN64x4:
4970 case Iop_SarN16x16:
4971 case Iop_SarN32x8:
4972 case Iop_ShlN16x16:
4973 case Iop_ShlN32x8:
4974 case Iop_ShlN64x4:
4975 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4976 this is wrong now, scalar shifts are done properly lazily.
4977 Vector shifts should be fixed too. */
4978 complainIfUndefined(mce, atom2, NULL);
4979 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4981 case Iop_QSub8Ux32:
4982 case Iop_QSub8Sx32:
4983 case Iop_Sub8x32:
4984 case Iop_Min8Ux32:
4985 case Iop_Min8Sx32:
4986 case Iop_Max8Ux32:
4987 case Iop_Max8Sx32:
4988 case Iop_CmpGT8Sx32:
4989 case Iop_CmpEQ8x32:
4990 case Iop_Avg8Ux32:
4991 case Iop_QAdd8Ux32:
4992 case Iop_QAdd8Sx32:
4993 case Iop_Add8x32:
4994 return binary8Ix32(mce, vatom1, vatom2);
4996 case Iop_QSub16Ux16:
4997 case Iop_QSub16Sx16:
4998 case Iop_Sub16x16:
4999 case Iop_Mul16x16:
5000 case Iop_MulHi16Sx16:
5001 case Iop_MulHi16Ux16:
5002 case Iop_Min16Sx16:
5003 case Iop_Min16Ux16:
5004 case Iop_Max16Sx16:
5005 case Iop_Max16Ux16:
5006 case Iop_CmpGT16Sx16:
5007 case Iop_CmpEQ16x16:
5008 case Iop_Avg16Ux16:
5009 case Iop_QAdd16Ux16:
5010 case Iop_QAdd16Sx16:
5011 case Iop_Add16x16:
5012 return binary16Ix16(mce, vatom1, vatom2);
5014 case Iop_Sub32x8:
5015 case Iop_CmpGT32Sx8:
5016 case Iop_CmpEQ32x8:
5017 case Iop_Add32x8:
5018 case Iop_Max32Ux8:
5019 case Iop_Max32Sx8:
5020 case Iop_Min32Ux8:
5021 case Iop_Min32Sx8:
5022 case Iop_Mul32x8:
5023 return binary32Ix8(mce, vatom1, vatom2);
5025 case Iop_Sub64x4:
5026 case Iop_Add64x4:
5027 case Iop_CmpEQ64x4:
5028 case Iop_CmpGT64Sx4:
5029 return binary64Ix4(mce, vatom1, vatom2);
5031 case Iop_I32StoF32x8:
5032 case Iop_F32toI32Sx8:
5033 return unary32Fx8_w_rm(mce, vatom1, vatom2);
5035 /* Perm32x8: rearrange values in left arg using steering values
5036 from right arg. So rearrange the vbits in the same way but
5037 pessimise wrt steering values. */
5038 case Iop_Perm32x8:
5039 return mkUifUV256(
5040 mce,
5041 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
5042 mkPCast32x8(mce, vatom2)
5045 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
5046 Handle the shifted results in the same way that other
5047 binary Q ops are handled, eg QSub: UifU the two args,
5048 then pessimise -- which is binaryNIxM. But for the upper
5049 V128, we need to generate just 1 bit, which is the
5050 pessimised shift result, with 127 defined zeroes above it.
5052 Note that this is overly pessimistic in that in fact only the
5053 bottom 8 bits of each lane of the second arg determine the shift
5054 amount. Really we ought to ignore any undefinedness in the
5055 rest of the lanes of the second arg. */
5056 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
5057 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
5058 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
5059 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
5060 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
5061 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
5062 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
5063 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
5065 // The function to generate the pessimised shift result
5066 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
5067 switch (op) {
5068 case Iop_QandSQsh64x2:
5069 case Iop_QandUQsh64x2:
5070 case Iop_QandSQRsh64x2:
5071 case Iop_QandUQRsh64x2:
5072 binaryNIxM = binary64Ix2;
5073 break;
5074 case Iop_QandSQsh32x4:
5075 case Iop_QandUQsh32x4:
5076 case Iop_QandSQRsh32x4:
5077 case Iop_QandUQRsh32x4:
5078 binaryNIxM = binary32Ix4;
5079 break;
5080 case Iop_QandSQsh16x8:
5081 case Iop_QandUQsh16x8:
5082 case Iop_QandSQRsh16x8:
5083 case Iop_QandUQRsh16x8:
5084 binaryNIxM = binary16Ix8;
5085 break;
5086 case Iop_QandSQsh8x16:
5087 case Iop_QandUQsh8x16:
5088 case Iop_QandSQRsh8x16:
5089 case Iop_QandUQRsh8x16:
5090 binaryNIxM = binary8Ix16;
5091 break;
5092 default:
5093 tl_assert(0);
5095 tl_assert(binaryNIxM);
5096 // Pessimised shift result, shV[127:0]
5097 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
5098 // Generates: Def--(127)--Def PCast-to-I1(shV)
5099 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
5100 // and assemble the result
5101 return assignNew('V', mce, Ity_V256,
5102 binop(Iop_V128HLtoV256, qV, shV));
5105 case Iop_F32toF16x4: {
5106 // First, PCast the input vector, retaining the 32x4 format.
5107 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
5108 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
5109 // the input, we're not going to lose any information.
5110 IRAtom* pcHI64
5111 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
5112 IRAtom* pcLO64
5113 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
5114 IRAtom* narrowed
5115 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5116 pcHI64, pcLO64)); // 16x4
5117 // Finally, roll in any badness from the rounding mode.
5118 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5119 return mkUifU64(mce, narrowed, rmPCasted);
5122 case Iop_F32toF16x8: {
5123 // Same scheme as for Iop_F32toF16x4.
5124 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5125 IRAtom* pcHI128
5126 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5127 pcasted)); // 32x4
5128 IRAtom* pcLO128
5129 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5130 pcasted)); // 32x4
5131 IRAtom* narrowed
5132 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5133 pcHI128, pcLO128)); // 16x8
5134 // Finally, roll in any badness from the rounding mode.
5135 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5136 return mkUifUV128(mce, narrowed, rmPCasted);
5139 default:
5140 ppIROp(op);
5141 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5146 static
5147 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5149 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5150 selection of shadow operation implicitly duplicates the logic in
5151 do_shadow_LoadG and should be kept in sync (in the very unlikely
5152 event that the interpretation of such widening ops changes in
5153 future). See comment in do_shadow_LoadG. */
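/* Example of the self-shadowing treatment of widening ops (descriptive
   comment only): for Iop_8Uto32 the shadow is just Iop_8Uto32 applied
   to the operand's shadow, so the 24 freshly created high bits get
   shadow 0, i.e. they are reported as defined -- which matches the
   concrete semantics, since zero-extension really does produce known
   zero bits.  For the signed widenings (e.g. Iop_8Sto32) the replicated
   sign bits inherit the shadow of the original sign bit, again matching
   how the concrete value was produced. */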
5154 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5155 tl_assert(isOriginalAtom(mce,atom));
5156 switch (op) {
5158 case Iop_Abs64Fx2:
5159 case Iop_Neg64Fx2:
5160 case Iop_RSqrtEst64Fx2:
5161 case Iop_RecipEst64Fx2:
5162 case Iop_Log2_64Fx2:
5163 return unary64Fx2(mce, vatom);
5165 case Iop_Sqrt64F0x2:
5166 return unary64F0x2(mce, vatom);
5168 case Iop_Sqrt32Fx8:
5169 case Iop_RSqrtEst32Fx8:
5170 case Iop_RecipEst32Fx8:
5171 return unary32Fx8(mce, vatom);
5173 case Iop_Sqrt64Fx4:
5174 return unary64Fx4(mce, vatom);
5176 case Iop_RecipEst32Fx4:
5177 case Iop_I32UtoF32x4_DEP:
5178 case Iop_I32StoF32x4_DEP:
5179 case Iop_QF32toI32Ux4_RZ:
5180 case Iop_QF32toI32Sx4_RZ:
5181 case Iop_RoundF32x4_RM:
5182 case Iop_RoundF32x4_RP:
5183 case Iop_RoundF32x4_RN:
5184 case Iop_RoundF32x4_RZ:
5185 case Iop_RecipEst32Ux4:
5186 case Iop_Abs32Fx4:
5187 case Iop_Neg32Fx4:
5188 case Iop_RSqrtEst32Fx4:
5189 case Iop_Log2_32Fx4:
5190 case Iop_Exp2_32Fx4:
5191 return unary32Fx4(mce, vatom);
5193 case Iop_I32UtoF32x2_DEP:
5194 case Iop_I32StoF32x2_DEP:
5195 case Iop_RecipEst32Fx2:
5196 case Iop_RecipEst32Ux2:
5197 case Iop_Abs32Fx2:
5198 case Iop_Neg32Fx2:
5199 case Iop_RSqrtEst32Fx2:
5200 return unary32Fx2(mce, vatom);
5202 case Iop_Sqrt32F0x4:
5203 case Iop_RSqrtEst32F0x4:
5204 case Iop_RecipEst32F0x4:
5205 return unary32F0x4(mce, vatom);
5207 case Iop_Abs16Fx8:
5208 case Iop_Neg16Fx8:
5209 return unary16Fx8(mce, vatom);
5211 // These are self-shadowing.
5212 case Iop_32UtoV128:
5213 case Iop_64UtoV128:
5214 case Iop_Dup8x16:
5215 case Iop_Dup16x8:
5216 case Iop_Dup32x4:
5217 case Iop_Reverse1sIn8_x16:
5218 case Iop_Reverse8sIn16_x8:
5219 case Iop_Reverse8sIn32_x4:
5220 case Iop_Reverse16sIn32_x4:
5221 case Iop_Reverse8sIn64_x2:
5222 case Iop_Reverse16sIn64_x2:
5223 case Iop_Reverse32sIn64_x2:
5224 case Iop_V256toV128_1: case Iop_V256toV128_0:
5225 case Iop_ZeroHI64ofV128:
5226 case Iop_ZeroHI96ofV128:
5227 case Iop_ZeroHI112ofV128:
5228 case Iop_ZeroHI120ofV128:
5229 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5230 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5232 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5233 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5234 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5236 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5237 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5238 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5240 case Iop_NegF128:
5241 case Iop_AbsF128:
5242 case Iop_RndF128:
5243 case Iop_TruncF128toI128S: /* F128 -> I128S */
5244 case Iop_TruncF128toI128U: /* F128 -> I128U */
5245 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5246 case Iop_ReinterpI128asF128:
5247 case Iop_ReinterpF128asI128:
5248 return mkPCastTo(mce, Ity_I128, vatom);
5250 case Iop_BCD128toI128S:
5251 case Iop_MulI128by10:
5252 case Iop_MulI128by10Carry:
5253 case Iop_F16toF64x2:
5254 case Iop_F64toF16x2_DEP:
5255 // FIXME JRS 2018-Nov-15. This is surely not correct!
5256 return vatom;
5258 case Iop_ReinterpI32asF32:
5259 case Iop_ReinterpF32asI32:
5260 return assignNew('V', mce, Ity_I32, vatom);
5262 case Iop_ReinterpF64asI64:
5263 case Iop_ReinterpI64asF64:
5264 case Iop_ReinterpI64asD64:
5265 case Iop_ReinterpD64asI64:
5266 return assignNew('V', mce, Ity_I64, vatom);
5268 case Iop_I32StoF128: /* signed I32 -> F128 */
5269 case Iop_I64StoF128: /* signed I64 -> F128 */
5270 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5271 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5272 case Iop_F32toF128: /* F32 -> F128 */
5273 case Iop_F64toF128: /* F64 -> F128 */
5274 case Iop_I32StoD128: /* signed I32 -> D128 */
5275 case Iop_I64StoD128: /* signed I64 -> D128 */
5276 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5277 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5278 return mkPCastTo(mce, Ity_I128, vatom);
5280 case Iop_F16toF64:
5281 case Iop_F32toF64:
5282 case Iop_I32StoF64:
5283 case Iop_I32UtoF64:
5284 case Iop_NegF64:
5285 case Iop_AbsF64:
5286 case Iop_RSqrtEst5GoodF64:
5287 case Iop_RoundF64toF64_NEAREST:
5288 case Iop_RoundF64toF64_NegINF:
5289 case Iop_RoundF64toF64_PosINF:
5290 case Iop_RoundF64toF64_ZERO:
5291 case Iop_D32toD64:
5292 case Iop_I32StoD64:
5293 case Iop_I32UtoD64:
5294 case Iop_ExtractExpD64: /* D64 -> I64 */
5295 case Iop_ExtractExpD128: /* D128 -> I64 */
5296 case Iop_ExtractSigD64: /* D64 -> I64 */
5297 case Iop_ExtractSigD128: /* D128 -> I64 */
5298 case Iop_DPBtoBCD:
5299 case Iop_BCDtoDPB:
5300 return mkPCastTo(mce, Ity_I64, vatom);
5302 case Iop_D64toD128:
5303 return mkPCastTo(mce, Ity_I128, vatom);
5305 case Iop_TruncF64asF32:
5306 case Iop_NegF32:
5307 case Iop_AbsF32:
5308 case Iop_F16toF32:
5309 return mkPCastTo(mce, Ity_I32, vatom);
5311 case Iop_AbsF16:
5312 case Iop_NegF16:
5313 return mkPCastTo(mce, Ity_I16, vatom);
5315 case Iop_Ctz32: case Iop_CtzNat32:
5316 case Iop_Ctz64: case Iop_CtzNat64:
5317 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5319 case Iop_Clz32: case Iop_ClzNat32:
5320 case Iop_Clz64: case Iop_ClzNat64:
5321 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5323 // PopCount32: this is slightly pessimistic. It is true that the
5324 // result depends on all input bits, so that aspect of the PCast is
5325 // correct. However, regardless of the input, only the lowest 5 bits
5326 // out of the output can ever be undefined. So we could actually
5327 // "improve" the results here by marking the top 27 bits of output as
5328 // defined. A similar comment applies for PopCount64.
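      // A minimal sketch of the improvement suggested above (not enabled
      // here; the 0x1F mask merely mirrors the "lowest 5 bits" claim in
      // this comment) would AND the PCast result with a low-bits mask,
      // forcing the upper V bits to "defined":
      //    assignNew('V', mce, Ity_I32,
      //              binop(Iop_And32,
      //                    mkPCastTo(mce, Ity_I32, vatom), mkU32(0x1F)));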
5329 case Iop_PopCount32:
5330 return mkPCastTo(mce, Ity_I32, vatom);
5331 case Iop_PopCount64:
5332 return mkPCastTo(mce, Ity_I64, vatom);
5334 // These are self-shadowing.
5335 case Iop_1Uto64:
5336 case Iop_1Sto64:
5337 case Iop_8Uto64:
5338 case Iop_8Sto64:
5339 case Iop_16Uto64:
5340 case Iop_16Sto64:
5341 case Iop_32Sto64:
5342 case Iop_32Uto64:
5343 case Iop_V128to64:
5344 case Iop_V128HIto64:
5345 case Iop_128HIto64:
5346 case Iop_128to64:
5347 case Iop_Dup8x8:
5348 case Iop_Dup16x4:
5349 case Iop_Dup32x2:
5350 case Iop_Reverse8sIn16_x4:
5351 case Iop_Reverse8sIn32_x2:
5352 case Iop_Reverse16sIn32_x2:
5353 case Iop_Reverse8sIn64_x1:
5354 case Iop_Reverse16sIn64_x1:
5355 case Iop_Reverse32sIn64_x1:
5356 case Iop_V256to64_0: case Iop_V256to64_1:
5357 case Iop_V256to64_2: case Iop_V256to64_3:
5358 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5360 // These are self-shadowing.
5361 case Iop_64to32:
5362 case Iop_64HIto32:
5363 case Iop_1Uto32:
5364 case Iop_1Sto32:
5365 case Iop_8Uto32:
5366 case Iop_16Uto32:
5367 case Iop_16Sto32:
5368 case Iop_8Sto32:
5369 case Iop_V128to32:
5370 case Iop_Reverse8sIn32_x1:
5371 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5373 // These are self-shadowing.
5374 case Iop_1Sto16:
5375 case Iop_8Sto16:
5376 case Iop_8Uto16:
5377 case Iop_32to16:
5378 case Iop_32HIto16:
5379 case Iop_64to16:
5380 case Iop_GetMSBs8x16:
5381 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5383 // These are self-shadowing.
5384 case Iop_1Uto8:
5385 case Iop_1Sto8:
5386 case Iop_16to8:
5387 case Iop_16HIto8:
5388 case Iop_32to8:
5389 case Iop_64to8:
5390 case Iop_GetMSBs8x8:
5391 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5393 case Iop_32to1:
5394 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5396 case Iop_64to1:
5397 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5399 case Iop_NotV256:
5400 case Iop_NotV128:
5401 case Iop_Not64:
5402 case Iop_Not32:
5403 case Iop_Not16:
5404 case Iop_Not8:
5405 case Iop_Not1:
5406 // FIXME JRS 2018-Nov-15. This is surely not correct!
5407 return vatom;
5409 case Iop_CmpNEZ8x8:
5410 case Iop_Cnt8x8:
5411 case Iop_Clz8x8:
5412 case Iop_Cls8x8:
5413 case Iop_Abs8x8:
5414 return mkPCast8x8(mce, vatom);
5416 case Iop_CmpNEZ8x16:
5417 case Iop_Cnt8x16:
5418 case Iop_Clz8x16:
5419 case Iop_Cls8x16:
5420 case Iop_Abs8x16:
5421 case Iop_Ctz8x16:
5422 return mkPCast8x16(mce, vatom);
5424 case Iop_CmpNEZ16x4:
5425 case Iop_Clz16x4:
5426 case Iop_Cls16x4:
5427 case Iop_Abs16x4:
5428 return mkPCast16x4(mce, vatom);
5430 case Iop_CmpNEZ16x8:
5431 case Iop_Clz16x8:
5432 case Iop_Cls16x8:
5433 case Iop_Abs16x8:
5434 case Iop_Ctz16x8:
5435 return mkPCast16x8(mce, vatom);
5437 case Iop_CmpNEZ32x2:
5438 case Iop_Clz32x2:
5439 case Iop_Cls32x2:
5440 case Iop_F32toI32Ux2_RZ:
5441 case Iop_F32toI32Sx2_RZ:
5442 case Iop_Abs32x2:
5443 return mkPCast32x2(mce, vatom);
5445 case Iop_CmpNEZ32x4:
5446 case Iop_Clz32x4:
5447 case Iop_Cls32x4:
5448 case Iop_F32toI32Ux4_RZ:
5449 case Iop_F32toI32Sx4_RZ:
5450 case Iop_Abs32x4:
5451 case Iop_RSqrtEst32Ux4:
5452 case Iop_Ctz32x4:
5453 return mkPCast32x4(mce, vatom);
5455 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5456 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5457 case Iop_CmpwNEZ32:
5458 return mkPCastTo(mce, Ity_I32, vatom);
5460 case Iop_TruncF128toI64S: /* F128 -> I64S */
5461 case Iop_TruncF128toI64U: /* F128 -> I64U */
5462 case Iop_CmpwNEZ64:
5463 return mkPCastTo(mce, Ity_I64, vatom);
5465 case Iop_CmpNEZ64x2:
5466 case Iop_CipherSV128:
5467 case Iop_Clz64x2:
5468 case Iop_Abs64x2:
5469 case Iop_Ctz64x2:
5470 return mkPCast64x2(mce, vatom);
5472 // This is self-shadowing.
5473 case Iop_PwBitMtxXpose64x2:
5474 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5476 case Iop_NarrowUn16to8x8:
5477 case Iop_NarrowUn32to16x4:
5478 case Iop_NarrowUn64to32x2:
5479 case Iop_QNarrowUn16Sto8Sx8:
5480 case Iop_QNarrowUn16Sto8Ux8:
5481 case Iop_QNarrowUn16Uto8Ux8:
5482 case Iop_QNarrowUn32Sto16Sx4:
5483 case Iop_QNarrowUn32Sto16Ux4:
5484 case Iop_QNarrowUn32Uto16Ux4:
5485 case Iop_QNarrowUn64Sto32Sx2:
5486 case Iop_QNarrowUn64Sto32Ux2:
5487 case Iop_QNarrowUn64Uto32Ux2:
5488 return vectorNarrowUnV128(mce, op, vatom);
5490 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5491 // right.
5492 case Iop_F32toF16x4_DEP:
5493 return vectorNarrowUnV128(mce, op, vatom);
5495 case Iop_Widen8Sto16x8:
5496 case Iop_Widen8Uto16x8:
5497 case Iop_Widen16Sto32x4:
5498 case Iop_Widen16Uto32x4:
5499 case Iop_Widen32Sto64x2:
5500 case Iop_Widen32Uto64x2:
5501 return vectorWidenI64(mce, op, vatom);
5503 case Iop_F16toF32x4:
5504 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5505 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5506 // will generate an output 32 bits with at least one 1 bit
5507 // set if there's one or more 1 bits set in the input 16 bits. More
5508 // correct code for this is just below, but commented out, so as to
5509 // avoid short-term backend failures on targets that can't do
5510 // Iop_Interleave{LO,HI}16x4.
5511 return vectorWidenI64(mce, op, vatom);
5513 case Iop_F16toF32x8: {
5514 // PCast the input at 16x8. This makes each lane hold either all
5515 // zeroes or all ones.
5516 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5517 // Now double the width of each lane to 32 bits. Because the lanes are
5518 // all zeroes or all ones, we can just copy each lane twice into
5519 // the result. Here's the low half:
5520 IRAtom* widenedLO // :: I32x4
5521 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5522 pcasted, pcasted));
5523 // And the high half:
5524 IRAtom* widenedHI // :: I32x4
5525 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5526 pcasted, pcasted));
5527 // Glue them back together:
5528 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5529 widenedHI, widenedLO));
5532 // See comment just above, for Iop_F16toF32x4
5533 //case Iop_F16toF32x4: {
5534 // // Same scheme as F16toF32x4
5535 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5536 // IRAtom* widenedLO // :: I32x2
5537 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5538 // pcasted, pcasted));
5539 // IRAtom* widenedHI // :: I32x4
5540 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5541 // pcasted, pcasted));
5542 // // Glue them back together:
5543 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5544 // widenedHI, widenedLO));
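      // For the pairwise add-long (PwAddL) cases below, the scheme appears
      // to be: pessimise each input lane (PCast at the input lane width),
      // run the real op over that, then pessimise again at the output lane
      // width, so any undefined bit in either input lane taints the whole
      // corresponding output lane.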
5547 case Iop_PwAddL32Ux2:
5548 case Iop_PwAddL32Sx2:
5549 return mkPCastTo(mce, Ity_I64,
5550 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5552 case Iop_PwAddL16Ux4:
5553 case Iop_PwAddL16Sx4:
5554 return mkPCast32x2(mce,
5555 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5557 case Iop_PwAddL8Ux8:
5558 case Iop_PwAddL8Sx8:
5559 return mkPCast16x4(mce,
5560 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5562 case Iop_PwAddL32Ux4:
5563 case Iop_PwAddL32Sx4:
5564 return mkPCast64x2(mce,
5565 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5567 case Iop_PwAddL64Ux2:
5568 return mkPCast128x1(mce,
5569 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5571 case Iop_PwAddL16Ux8:
5572 case Iop_PwAddL16Sx8:
5573 return mkPCast32x4(mce,
5574 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5576 case Iop_PwAddL8Ux16:
5577 case Iop_PwAddL8Sx16:
5578 return mkPCast16x8(mce,
5579 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
5581 case Iop_I64UtoF32:
5582 default:
5583 ppIROp(op);
5584 VG_(tool_panic)("memcheck:expr2vbits_Unop");
5589 /* Worker function -- do not call directly. See comments on
5590 expr2vbits_Load for the meaning of |guard|.
5592 Generates IR to (1) perform a definedness test of |addr|, (2)
5593 perform a validity test of |addr|, and (3) return the Vbits for the
5594 location indicated by |addr|. All of this only happens when
5595 |guard| is NULL or |guard| evaluates to True at run time.
5597 If |guard| evaluates to False at run time, the returned value is
5598 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5599 performed.
5601 The definedness of |guard| itself is not checked. That is assumed
5602 to have been done before this point, by the caller. */
5603 static
5604 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5605 IREndness end, IRType ty,
5606 IRAtom* addr, UInt bias, IRAtom* guard )
5608 tl_assert(isOriginalAtom(mce,addr));
5609 tl_assert(end == Iend_LE || end == Iend_BE);
5611 /* First, emit a definedness test for the address. This also sets
5612 the address (shadow) to 'defined' following the test. */
5613 complainIfUndefined( mce, addr, guard );
5615 /* Now cook up a call to the relevant helper function, to read the data V
5616 bits from shadow memory. Note that I128 loads are done by pretending
5617 we're doing a V128 load, and then converting the resulting V128 vbits
5618 word to an I128, right at the end of this function -- see `castedToI128`
5619 below. (It's only a minor hack :-) This pertains to bug 444399. */
5620 ty = shadowTypeV(ty);
5622 void* helper = NULL;
5623 const HChar* hname = NULL;
5624 Bool ret_via_outparam = False;
5626 if (end == Iend_LE) {
5627 switch (ty) {
5628 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5629 hname = "MC_(helperc_LOADV256le)";
5630 ret_via_outparam = True;
5631 break;
5632 case Ity_I128: // fallthrough. See comment above.
5633 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5634 hname = "MC_(helperc_LOADV128le)";
5635 ret_via_outparam = True;
5636 break;
5637 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5638 hname = "MC_(helperc_LOADV64le)";
5639 break;
5640 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5641 hname = "MC_(helperc_LOADV32le)";
5642 break;
5643 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5644 hname = "MC_(helperc_LOADV16le)";
5645 break;
5646 case Ity_I8: helper = &MC_(helperc_LOADV8);
5647 hname = "MC_(helperc_LOADV8)";
5648 break;
5649 default: ppIRType(ty);
5650 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5652 } else {
5653 switch (ty) {
5654 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5655 hname = "MC_(helperc_LOADV256be)";
5656 ret_via_outparam = True;
5657 break;
5658 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5659 hname = "MC_(helperc_LOADV128be)";
5660 ret_via_outparam = True;
5661 break;
5662 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5663 hname = "MC_(helperc_LOADV64be)";
5664 break;
5665 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5666 hname = "MC_(helperc_LOADV32be)";
5667 break;
5668 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5669 hname = "MC_(helperc_LOADV16be)";
5670 break;
5671 case Ity_I8: helper = &MC_(helperc_LOADV8);
5672 hname = "MC_(helperc_LOADV8)";
5673 break;
5674 default: ppIRType(ty);
5675 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5679 tl_assert(helper);
5680 tl_assert(hname);
5682 /* Generate the actual address into addrAct. */
5683 IRAtom* addrAct;
5684 if (bias == 0) {
5685 addrAct = addr;
5686 } else {
5687 IROp mkAdd;
5688 IRAtom* eBias;
5689 IRType tyAddr = mce->hWordTy;
5690 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5691 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5692 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5693 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5696 /* We need to have a place to park the V bits we're just about to
5697 read. */
5698 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5700 /* Here's the call. */
5701 IRDirty* di;
5702 if (ret_via_outparam) {
5703 di = unsafeIRDirty_1_N( datavbits,
5704 2/*regparms*/,
5705 hname, VG_(fnptr_to_fnentry)( helper ),
5706 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5707 } else {
5708 di = unsafeIRDirty_1_N( datavbits,
5709 1/*regparms*/,
5710 hname, VG_(fnptr_to_fnentry)( helper ),
5711 mkIRExprVec_1( addrAct ) );
5714 setHelperAnns( mce, di );
5715 if (guard) {
5716 di->guard = guard;
5717 /* Ideally the didn't-happen return value here would be all-ones
5718 (all-undefined), so it'd be obvious if it got used
5719 inadvertently. We can get by with the IR-mandated default
5720 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5721 undefined if it ever leaks out. */
5723 stmt( 'V', mce, IRStmt_Dirty(di) );
5725 if (ty == Ity_I128) {
5726 IRAtom* castedToI128
5727 = assignNew('V', mce, Ity_I128,
5728 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5729 return castedToI128;
5730 } else {
5731 return mkexpr(datavbits);
5736 /* Generate IR to do a shadow load. The helper is expected to check
5737 the validity of the address and return the V bits for that address.
5738 This can optionally be controlled by a guard, which is assumed to
5739 be True if NULL. In the case where the guard is False at runtime,
5740 the helper will return the didn't-do-the-call value of 0x55..55.
5741 Since that means "completely undefined result", the caller of
5742 this function will need to fix up the result somehow in that
5743 case.
5745 Caller of this function is also expected to have checked the
5746 definedness of |guard| before this point.
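      (expr2vbits_Load_guarded_General, below, is one such caller: when the
      guard is False it replaces the loaded V bits with |valt| via an ITE.)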
5748 static
5749 IRAtom* expr2vbits_Load ( MCEnv* mce,
5750 IREndness end, IRType ty,
5751 IRAtom* addr, UInt bias,
5752 IRAtom* guard )
5754 tl_assert(end == Iend_LE || end == Iend_BE);
5755 switch (shadowTypeV(ty)) {
5756 case Ity_I8:
5757 case Ity_I16:
5758 case Ity_I32:
5759 case Ity_I64:
5760 case Ity_I128:
5761 case Ity_V128:
5762 case Ity_V256:
5763 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5764 default:
5765 VG_(tool_panic)("expr2vbits_Load");
5770 /* The most general handler for guarded loads. Assumes the
5771 definedness of GUARD has already been checked by the caller. A
5772 GUARD of NULL is assumed to mean "always True". Generates code to
5773 check the definedness and validity of ADDR.
5775 Generate IR to do a shadow load from ADDR and return the V bits.
5776 The loaded type is TY. The loaded data is then (shadow) widened by
5777 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5778 evaluates to False at run time then the returned Vbits are simply
5779 VALT instead. Note therefore that the argument type of VWIDEN must
5780 be TY and the result type of VWIDEN must equal the type of VALT.
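      For example, an 8-bit guarded load widened with Iop_8Uto32 has
      TY = Ity_I8, a widened type of Ity_I32, and therefore VALT must be an
      I32-typed shadow value.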
5782 static
5783 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5784 IREndness end, IRType ty,
5785 IRAtom* addr, UInt bias,
5786 IRAtom* guard,
5787 IROp vwiden, IRAtom* valt )
5789 /* Sanity check the conversion operation, and also set TYWIDE. */
5790 IRType tyWide = Ity_INVALID;
5791 switch (vwiden) {
5792 case Iop_INVALID:
5793 tyWide = ty;
5794 break;
5795 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5796 tyWide = Ity_I32;
5797 break;
5798 default:
5799 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5802 /* If the guard evaluates to True, this will hold the loaded V bits
5803 at TY. If the guard evaluates to False, this will be all
5804 ones, meaning "all undefined", in which case we will have to
5805 replace it using an ITE below. */
5806 IRAtom* iftrue1
5807 = assignNew('V', mce, ty,
5808 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5809 /* Now (shadow-) widen the loaded V bits to the desired width. In
5810 the guard-is-False case, the allowable widening operators will
5811 in the worst case (unsigned widening) at least leave the
5812 pre-widened part as being marked all-undefined, and in the best
5813 case (signed widening) mark the whole widened result as
5814 undefined. Anyway, it doesn't matter really, since in this case
5815 we will replace said value with the default value |valt| using an
5816 ITE. */
5817 IRAtom* iftrue2
5818 = vwiden == Iop_INVALID
5819 ? iftrue1
5820 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5821 /* These are the V bits we will return if the load doesn't take
5822 place. */
5823 IRAtom* iffalse
5824 = valt;
5825 /* Prepare the cond for the ITE. Convert a NULL cond into
5826 something that iropt knows how to fold out later. */
5827 IRAtom* cond
5828 = guard == NULL ? mkU1(1) : guard;
5829 /* And assemble the final result. */
5830 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
5834 /* A simpler handler for guarded loads, in which there is no
5835 conversion operation, and the default V bit return (when the guard
5836 evaluates to False at runtime) is "all defined". If there is no
5837 guard expression or the guard is always TRUE this function behaves
5838 like expr2vbits_Load. It is assumed that definedness of GUARD has
5839 already been checked at the call site. */
5840 static
5841 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5842 IREndness end, IRType ty,
5843 IRAtom* addr, UInt bias,
5844 IRAtom *guard )
5846 return expr2vbits_Load_guarded_General(
5847 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5852 static
5853 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5854 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5856 IRAtom *vbitsC, *vbits0, *vbits1;
5857 IRType ty;
5858 /* Given ITE(cond, iftrue, iffalse), generate
5859 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5860 That is, steer the V bits like the originals, but trash the
5861 result if the steering value is undefined. This gives
5862 lazy propagation. */
5863 tl_assert(isOriginalAtom(mce, cond));
5864 tl_assert(isOriginalAtom(mce, iftrue));
5865 tl_assert(isOriginalAtom(mce, iffalse));
5867 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5868 vbits1 = expr2vbits(mce, iftrue, HuOth);
5869 vbits0 = expr2vbits(mce, iffalse, HuOth);
5870 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5872 return
5873 mkUifU(mce, ty, assignNew('V', mce, ty,
5874 IRExpr_ITE(cond, vbits1, vbits0)),
5875 mkPCastTo(mce, ty, vbitsC) );
5878 /* --------- This is the main expression-handling function. --------- */
5880 static
5881 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5882 HowUsed hu/*use HuOth if unknown*/ )
5884 switch (e->tag) {
5886 case Iex_Get:
5887 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5889 case Iex_GetI:
5890 return shadow_GETI( mce, e->Iex.GetI.descr,
5891 e->Iex.GetI.ix, e->Iex.GetI.bias );
5893 case Iex_RdTmp:
5894 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5896 case Iex_Const:
5897 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5899 case Iex_Qop:
5900 return expr2vbits_Qop(
5901 mce,
5902 e->Iex.Qop.details->op,
5903 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5904 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5907 case Iex_Triop:
5908 return expr2vbits_Triop(
5909 mce,
5910 e->Iex.Triop.details->op,
5911 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5912 e->Iex.Triop.details->arg3
5915 case Iex_Binop:
5916 return expr2vbits_Binop(
5917 mce,
5918 e->Iex.Binop.op,
5919 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5923 case Iex_Unop:
5924 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5926 case Iex_Load:
5927 return expr2vbits_Load( mce, e->Iex.Load.end,
5928 e->Iex.Load.ty,
5929 e->Iex.Load.addr, 0/*addr bias*/,
5930 NULL/* guard == "always True"*/ );
5932 case Iex_CCall:
5933 return mkLazyN( mce, e->Iex.CCall.args,
5934 e->Iex.CCall.retty,
5935 e->Iex.CCall.cee );
5937 case Iex_ITE:
5938 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5939 e->Iex.ITE.iffalse);
5941 default:
5942 VG_(printf)("\n");
5943 ppIRExpr(e);
5944 VG_(printf)("\n");
5945 VG_(tool_panic)("memcheck: expr2vbits");
5950 /*------------------------------------------------------------*/
5951 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5952 /*------------------------------------------------------------*/
5954 /* Widen a value to the host word size. */
5956 static
5957 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5959 IRType ty, tyH;
5961 /* vatom is vbits-value and as such can only have a shadow type. */
5962 tl_assert(isShadowAtom(mce,vatom));
5964 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5965 tyH = mce->hWordTy;
5967 if (tyH == Ity_I32) {
5968 switch (ty) {
5969 case Ity_I32:
5970 return vatom;
5971 case Ity_I16:
5972 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5973 case Ity_I8:
5974 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5975 default:
5976 goto unhandled;
5978 } else
5979 if (tyH == Ity_I64) {
5980 switch (ty) {
5981 case Ity_I32:
5982 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5983 case Ity_I16:
5984 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5985 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5986 case Ity_I8:
5987 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5988 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5989 default:
5990 goto unhandled;
5992 } else {
5993 goto unhandled;
5995 unhandled:
5996 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5997 VG_(tool_panic)("zwidenToHostWord");
6001 /* Generate a shadow store. |addr| is always the original address
6002 atom. You can pass in either originals or V-bits for the data
6003 atom, but obviously not both. This function generates a check for
6004 the definedness and (indirectly) the validity of |addr|, but only
6005 when |guard| evaluates to True at run time (or is NULL).
6007 |guard| :: Ity_I1 controls whether the store really happens; NULL
6008 means it unconditionally does. Note that |guard| itself is not
6009 checked for definedness; the caller of this function must do that
6010 if necessary.
6012 static
6013 void do_shadow_Store ( MCEnv* mce,
6014 IREndness end,
6015 IRAtom* addr, UInt bias,
6016 IRAtom* data, IRAtom* vdata,
6017 IRAtom* guard )
6019 IROp mkAdd;
6020 IRType ty, tyAddr;
6021 void* helper = NULL;
6022 const HChar* hname = NULL;
6023 IRConst* c;
6025 tyAddr = mce->hWordTy;
6026 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
6027 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
6028 tl_assert( end == Iend_LE || end == Iend_BE );
6030 if (data) {
6031 tl_assert(!vdata);
6032 tl_assert(isOriginalAtom(mce, data));
6033 tl_assert(bias == 0);
6034 vdata = expr2vbits( mce, data, HuOth );
6035 } else {
6036 tl_assert(vdata);
6039 tl_assert(isOriginalAtom(mce,addr));
6040 tl_assert(isShadowAtom(mce,vdata));
6042 if (guard) {
6043 tl_assert(isOriginalAtom(mce, guard));
6044 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6047 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
6049 // If we're not doing undefined value checking, pretend that this value
6050 // is "all valid". That lets Vex's optimiser remove some of the V bit
6051 // shadow computation ops that precede it.
6052 if (MC_(clo_mc_level) == 1) {
6053 switch (ty) {
6054 case Ity_V256: // V256 weirdness -- used four times
6055 c = IRConst_V256(V_BITS32_DEFINED); break;
6056 case Ity_V128: // V128 weirdness -- used twice
6057 c = IRConst_V128(V_BITS16_DEFINED); break;
6058 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
6059 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
6060 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
6061 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
6062 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
6063 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6065 vdata = IRExpr_Const( c );
6068 /* First, emit a definedness test for the address. This also sets
6069 the address (shadow) to 'defined' following the test. Both of
6070 those actions are gated on |guard|. */
6071 complainIfUndefined( mce, addr, guard );
6073 /* Now decide which helper function to call to write the data V
6074 bits into shadow memory. */
6075 if (end == Iend_LE) {
6076 switch (ty) {
6077 case Ity_V256: /* we'll use the helper four times */
6078 case Ity_V128: /* we'll use the helper twice */
6079 case Ity_I128: /* we'll use the helper twice */
6080 case Ity_I64: helper = &MC_(helperc_STOREV64le);
6081 hname = "MC_(helperc_STOREV64le)";
6082 break;
6083 case Ity_I32: helper = &MC_(helperc_STOREV32le);
6084 hname = "MC_(helperc_STOREV32le)";
6085 break;
6086 case Ity_I16: helper = &MC_(helperc_STOREV16le);
6087 hname = "MC_(helperc_STOREV16le)";
6088 break;
6089 case Ity_I8: helper = &MC_(helperc_STOREV8);
6090 hname = "MC_(helperc_STOREV8)";
6091 break;
6092 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6094 } else {
6095 switch (ty) {
6096 case Ity_V128: /* we'll use the helper twice */
6097 case Ity_I64: helper = &MC_(helperc_STOREV64be);
6098 hname = "MC_(helperc_STOREV64be)";
6099 break;
6100 case Ity_I32: helper = &MC_(helperc_STOREV32be);
6101 hname = "MC_(helperc_STOREV32be)";
6102 break;
6103 case Ity_I16: helper = &MC_(helperc_STOREV16be);
6104 hname = "MC_(helperc_STOREV16be)";
6105 break;
6106 case Ity_I8: helper = &MC_(helperc_STOREV8);
6107 hname = "MC_(helperc_STOREV8)";
6108 break;
6109 /* Note, no V256 case here, because no big-endian target that
6110 we support has 256-bit vectors. */
6111 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6115 if (UNLIKELY(ty == Ity_V256)) {
6117 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6118 Q3 being the most significant lane. */
6119 /* These are the offsets of the Qs in memory. */
6120 Int offQ0, offQ1, offQ2, offQ3;
6122 /* Various bits for constructing the 4 lane helper calls */
6123 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6124 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6125 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6126 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6128 if (end == Iend_LE) {
6129 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6130 } else {
6131 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6134 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6135 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6136 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6137 diQ0 = unsafeIRDirty_0_N(
6138 1/*regparms*/,
6139 hname, VG_(fnptr_to_fnentry)( helper ),
6140 mkIRExprVec_2( addrQ0, vdataQ0 )
6143 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6144 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6145 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6146 diQ1 = unsafeIRDirty_0_N(
6147 1/*regparms*/,
6148 hname, VG_(fnptr_to_fnentry)( helper ),
6149 mkIRExprVec_2( addrQ1, vdataQ1 )
6152 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6153 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6154 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6155 diQ2 = unsafeIRDirty_0_N(
6156 1/*regparms*/,
6157 hname, VG_(fnptr_to_fnentry)( helper ),
6158 mkIRExprVec_2( addrQ2, vdataQ2 )
6161 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6162 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6163 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6164 diQ3 = unsafeIRDirty_0_N(
6165 1/*regparms*/,
6166 hname, VG_(fnptr_to_fnentry)( helper ),
6167 mkIRExprVec_2( addrQ3, vdataQ3 )
6170 if (guard)
6171 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6173 setHelperAnns( mce, diQ0 );
6174 setHelperAnns( mce, diQ1 );
6175 setHelperAnns( mce, diQ2 );
6176 setHelperAnns( mce, diQ3 );
6177 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6178 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6179 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6180 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6183 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6185 /* V128/I128-bit case */
6186 /* See comment in next clause re 64-bit regparms */
6187 /* also, need to be careful about endianness */
6189 Int offLo64, offHi64;
6190 IRDirty *diLo64, *diHi64;
6191 IRAtom *addrLo64, *addrHi64;
6192 IRAtom *vdataLo64, *vdataHi64;
6193 IRAtom *eBiasLo64, *eBiasHi64;
6194 IROp opGetLO64, opGetHI64;
6196 if (end == Iend_LE) {
6197 offLo64 = 0;
6198 offHi64 = 8;
6199 } else {
6200 offLo64 = 8;
6201 offHi64 = 0;
6204 if (ty == Ity_V128) {
6205 opGetLO64 = Iop_V128to64;
6206 opGetHI64 = Iop_V128HIto64;
6207 } else {
6208 opGetLO64 = Iop_128to64;
6209 opGetHI64 = Iop_128HIto64;
6212 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6213 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6214 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6215 diLo64 = unsafeIRDirty_0_N(
6216 1/*regparms*/,
6217 hname, VG_(fnptr_to_fnentry)( helper ),
6218 mkIRExprVec_2( addrLo64, vdataLo64 )
6220 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6221 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6222 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6223 diHi64 = unsafeIRDirty_0_N(
6224 1/*regparms*/,
6225 hname, VG_(fnptr_to_fnentry)( helper ),
6226 mkIRExprVec_2( addrHi64, vdataHi64 )
6228 if (guard) diLo64->guard = guard;
6229 if (guard) diHi64->guard = guard;
6230 setHelperAnns( mce, diLo64 );
6231 setHelperAnns( mce, diHi64 );
6232 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6233 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6235 } else {
6237 IRDirty *di;
6238 IRAtom *addrAct;
6240 /* 8/16/32/64-bit cases */
6241 /* Generate the actual address into addrAct. */
6242 if (bias == 0) {
6243 addrAct = addr;
6244 } else {
6245 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6246 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6249 if (ty == Ity_I64) {
6250 /* We can't do this with regparm 2 on 32-bit platforms, since
6251 the back ends aren't clever enough to handle 64-bit
6252 regparm args. Therefore be different. */
6253 di = unsafeIRDirty_0_N(
6254 1/*regparms*/,
6255 hname, VG_(fnptr_to_fnentry)( helper ),
6256 mkIRExprVec_2( addrAct, vdata )
6258 } else {
6259 di = unsafeIRDirty_0_N(
6260 2/*regparms*/,
6261 hname, VG_(fnptr_to_fnentry)( helper ),
6262 mkIRExprVec_2( addrAct,
6263 zwidenToHostWord( mce, vdata ))
6266 if (guard) di->guard = guard;
6267 setHelperAnns( mce, di );
6268 stmt( 'V', mce, IRStmt_Dirty(di) );
6274 /* Do lazy pessimistic propagation through a dirty helper call, by
6275 looking at the annotations on it. This is the most complex part of
6276 Memcheck. */
6278 static IRType szToITy ( Int n )
6280 switch (n) {
6281 case 1: return Ity_I8;
6282 case 2: return Ity_I16;
6283 case 4: return Ity_I32;
6284 case 8: return Ity_I64;
6285 default: VG_(tool_panic)("szToITy(memcheck)");
6289 static
6290 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6292 Int i, k, n, toDo, gSz, gOff;
6293 IRAtom *src, *here, *curr;
6294 IRType tySrc, tyDst;
6295 IRTemp dst;
6296 IREndness end;
6298 /* What's the native endianness? We need to know this. */
6299 # if defined(VG_BIGENDIAN)
6300 end = Iend_BE;
6301 # elif defined(VG_LITTLEENDIAN)
6302 end = Iend_LE;
6303 # else
6304 # error "Unknown endianness"
6305 # endif
6307 /* First check the guard. */
6308 complainIfUndefined(mce, d->guard, NULL);
6310 /* Now round up all inputs and PCast over them. */
6311 curr = definedOfType(Ity_I32);
6313 /* Inputs: unmasked args
6314 Note: arguments are evaluated REGARDLESS of the guard expression */
6315 for (i = 0; d->args[i]; i++) {
6316 IRAtom* arg = d->args[i];
6317 if ( (d->cee->mcx_mask & (1<<i))
6318 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6319 /* ignore this arg */
6320 } else {
6321 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6322 curr = mkUifU32(mce, here, curr);
6326 /* Inputs: guest state that we read. */
6327 for (i = 0; i < d->nFxState; i++) {
6328 tl_assert(d->fxState[i].fx != Ifx_None);
6329 if (d->fxState[i].fx == Ifx_Write)
6330 continue;
6332 /* Enumerate the described state segments */
6333 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6334 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6335 gSz = d->fxState[i].size;
6337 /* Ignore any sections marked as 'always defined'. */
6338 if (isAlwaysDefd(mce, gOff, gSz)) {
6339 if (0)
6340 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6341 gOff, gSz);
6342 continue;
6345 /* This state element is read or modified. So we need to
6346 consider it. If larger than 8 bytes, deal with it in
6347 8-byte chunks. */
6348 while (True) {
6349 tl_assert(gSz >= 0);
6350 if (gSz == 0) break;
6351 n = gSz <= 8 ? gSz : 8;
6352 /* update 'curr' with UifU of the state slice
6353 gOff .. gOff+n-1 */
6354 tySrc = szToITy( n );
6356 /* Observe the guard expression. If it is false use an
6357 all-bits-defined bit pattern */
6358 IRAtom *cond, *iffalse, *iftrue;
6360 cond = assignNew('V', mce, Ity_I1, d->guard);
6361 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6362 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6363 src = assignNew('V', mce, tySrc,
6364 IRExpr_ITE(cond, iftrue, iffalse));
6366 here = mkPCastTo( mce, Ity_I32, src );
6367 curr = mkUifU32(mce, here, curr);
6368 gSz -= n;
6369 gOff += n;
6374 /* Inputs: memory. First set up some info needed regardless of
6375 whether we're doing reads or writes. */
6377 if (d->mFx != Ifx_None) {
6378 /* Because we may do multiple shadow loads/stores from the same
6379 base address, it's best to do a single test of its
6380 definedness right now. Post-instrumentation optimisation
6381 should remove all but this test. */
6382 IRType tyAddr;
6383 tl_assert(d->mAddr);
6384 complainIfUndefined(mce, d->mAddr, d->guard);
6386 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6387 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6388 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6391 /* Deal with memory inputs (reads or modifies) */
6392 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6393 toDo = d->mSize;
6394 /* chew off 32-bit chunks. We don't care about the endianness
6395 since it's all going to be condensed down to a single bit,
6396 but nevertheless choose an endianness which is hopefully
6397 native to the platform. */
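      /* Note: |toDo| counts down from d->mSize, so (d->mSize - toDo) is the
         offset of the next unprocessed chunk. */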
6398 while (toDo >= 4) {
6399 here = mkPCastTo(
6400 mce, Ity_I32,
6401 expr2vbits_Load_guarded_Simple(
6402 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6404 curr = mkUifU32(mce, here, curr);
6405 toDo -= 4;
6407 /* chew off 16-bit chunks */
6408 while (toDo >= 2) {
6409 here = mkPCastTo(
6410 mce, Ity_I32,
6411 expr2vbits_Load_guarded_Simple(
6412 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6414 curr = mkUifU32(mce, here, curr);
6415 toDo -= 2;
6417 /* chew off the remaining 8-bit chunk, if any */
6418 if (toDo == 1) {
6419 here = mkPCastTo(
6420 mce, Ity_I32,
6421 expr2vbits_Load_guarded_Simple(
6422 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6424 curr = mkUifU32(mce, here, curr);
6425 toDo -= 1;
6427 tl_assert(toDo == 0);
6430 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6431 all the inputs to the helper. Now we need to re-distribute the
6432 results to all destinations. */
6434 /* Outputs: the destination temporary, if there is one. */
6435 if (d->tmp != IRTemp_INVALID) {
6436 dst = findShadowTmpV(mce, d->tmp);
6437 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6438 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6441 /* Outputs: guest state that we write or modify. */
6442 for (i = 0; i < d->nFxState; i++) {
6443 tl_assert(d->fxState[i].fx != Ifx_None);
6444 if (d->fxState[i].fx == Ifx_Read)
6445 continue;
6447 /* Enumerate the described state segments */
6448 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6449 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6450 gSz = d->fxState[i].size;
6452 /* Ignore any sections marked as 'always defined'. */
6453 if (isAlwaysDefd(mce, gOff, gSz))
6454 continue;
6456 /* This state element is written or modified. So we need to
6457 consider it. If larger than 8 bytes, deal with it in
6458 8-byte chunks. */
6459 while (True) {
6460 tl_assert(gSz >= 0);
6461 if (gSz == 0) break;
6462 n = gSz <= 8 ? gSz : 8;
6463 /* Write suitably-casted 'curr' to the state slice
6464 gOff .. gOff+n-1 */
6465 tyDst = szToITy( n );
6466 do_shadow_PUT( mce, gOff,
6467 NULL, /* original atom */
6468 mkPCastTo( mce, tyDst, curr ), d->guard );
6469 gSz -= n;
6470 gOff += n;
6475 /* Outputs: memory that we write or modify. Same comments about
6476 endianness as above apply. */
6477 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6478 toDo = d->mSize;
6479 /* chew off 32-bit chunks */
6480 while (toDo >= 4) {
6481 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6482 NULL, /* original data */
6483 mkPCastTo( mce, Ity_I32, curr ),
6484 d->guard );
6485 toDo -= 4;
6487 /* chew off 16-bit chunks */
6488 while (toDo >= 2) {
6489 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6490 NULL, /* original data */
6491 mkPCastTo( mce, Ity_I16, curr ),
6492 d->guard );
6493 toDo -= 2;
6495 /* chew off the remaining 8-bit chunk, if any */
6496 if (toDo == 1) {
6497 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6498 NULL, /* original data */
6499 mkPCastTo( mce, Ity_I8, curr ),
6500 d->guard );
6501 toDo -= 1;
6503 tl_assert(toDo == 0);
6509 /* We have an ABI hint telling us that [base .. base+len-1] is to
6510 become undefined ("writable"). Generate code to call a helper to
6511 notify the A/V bit machinery of this fact.
6513 We call
6514 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6515 Addr nia );
6517 static
6518 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6520 IRDirty* di;
6522 if (MC_(clo_mc_level) == 3) {
6523 di = unsafeIRDirty_0_N(
6524 3/*regparms*/,
6525 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6526 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6527 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6529 } else {
6530 /* We ignore the supplied nia, since it is irrelevant. */
6531 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6532 /* Special-case the len==128 case, since that is for amd64-ELF,
6533 which is a very common target. */
6534 if (len == 128) {
6535 di = unsafeIRDirty_0_N(
6536 1/*regparms*/,
6537 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6538 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6539 mkIRExprVec_1( base )
6541 } else {
6542 di = unsafeIRDirty_0_N(
6543 2/*regparms*/,
6544 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6545 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6546 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6551 stmt( 'V', mce, IRStmt_Dirty(di) );
6555 /* ------ Dealing with IRCAS (big and complex) ------ */
6557 /* FWDS */
6558 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6559 IRAtom* baseaddr, Int offset );
6560 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6561 static void gen_store_b ( MCEnv* mce, Int szB,
6562 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6563 IRAtom* guard );
6565 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6566 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6569 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6570 IRExpr.Consts, else this asserts. If they are both Consts, it
6571 doesn't do anything. So that just leaves the RdTmp case.
6573 In which case: this assigns the shadow value SHADOW to the IR
6574 shadow temporary associated with ORIG. That is, ORIG, being an
6575 original temporary, will have a shadow temporary associated with
6576 it. However, in the case envisaged here, there will so far have
6577 been no IR emitted to actually write a shadow value into that
6578 temporary. What this routine does is to (emit IR to) copy the
6579 value in SHADOW into said temporary, so that after this call,
6580 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6581 value in SHADOW.
6583 Point is to allow callers to compute "by hand" a shadow value for
6584 ORIG, and force it to be associated with ORIG.
6586 How do we know that the shadow associated with ORIG has not so far
6587 been assigned to? Well, we don't per se know that, but supposing
6588 it had. Then this routine would create a second assignment to it,
6589 and later the IR sanity checker would barf. But that never
6590 happens. QED.
6592 static void bind_shadow_tmp_to_orig ( UChar how,
6593 MCEnv* mce,
6594 IRAtom* orig, IRAtom* shadow )
6596 tl_assert(isOriginalAtom(mce, orig));
6597 tl_assert(isShadowAtom(mce, shadow));
6598 switch (orig->tag) {
6599 case Iex_Const:
6600 tl_assert(shadow->tag == Iex_Const);
6601 break;
6602 case Iex_RdTmp:
6603 tl_assert(shadow->tag == Iex_RdTmp);
6604 if (how == 'V') {
6605 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6606 shadow);
6607 } else {
6608 tl_assert(how == 'B');
6609 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6610 shadow);
6612 break;
6613 default:
6614 tl_assert(0);
6619 static
6620 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6622 /* Scheme is (both single- and double- cases):
6624 1. fetch data#,dataB (the proposed new value)
6626 2. fetch expd#,expdB (what we expect to see at the address)
6628 3. check definedness of address
6630 4. load old#,oldB from shadow memory; this also checks
6631 addressability of the address
6633 5. the CAS itself
6635 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6637 7. if "expected == old" (as computed by (6))
6638 store data#,dataB to shadow memory
6640 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6641 'data' but 7 stores 'data#'. Hence it is possible for the
6642 shadow data to be incorrectly checked and/or updated:
6644 * 7 is at least gated correctly, since the 'expected == old'
6645 condition is derived from outputs of 5. However, the shadow
6646 write could happen too late: imagine after 5 we are
6647 descheduled, a different thread runs, writes a different
6648 (shadow) value at the address, and then we resume, hence
6649 overwriting the shadow value written by the other thread.
6651 Because the original memory access is atomic, there's no way to
6652 make both the original and shadow accesses into a single atomic
6653 thing, hence this is unavoidable.
6655 At least as Valgrind stands, I don't think it's a problem, since
6656 we're single threaded *and* we guarantee that there are no
6657 context switches during the execution of any specific superblock
6658 -- context switches can only happen at superblock boundaries.
6660 If Valgrind ever becomes MT in the future, then it might be more
6661 of a problem. A possible kludge would be to artificially
6662 associate with the location, a lock, which we must acquire and
6663 release around the transaction as a whole. Hmm, that probably
6664 wouldn't work properly since it only guards us against other
6665 threads doing CASs on the same location, not against other
6666 threads doing normal reads and writes.
6668 ------------------------------------------------------------
6670 COMMENT_ON_CasCmpEQ:
6672 Note two things. Firstly, in the sequence above, we compute
6673 "expected == old", but we don't check definedness of it. Why
6674 not? Also, the x86 and amd64 front ends use
6675 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6676 determination (expected == old ?) for themselves, and we also
6677 don't check definedness for those primops; we just say that the
6678 result is defined. Why? Details follow.
6680 x86/amd64 contains various forms of locked insns:
6681 * lock prefix before all basic arithmetic insns;
6682 eg lock xorl %reg1,(%reg2)
6683 * atomic exchange reg-mem
6684 * compare-and-swaps
6686 Rather than attempt to represent them all, which would be a
6687 royal PITA, I used a result from Maurice Herlihy
6688 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6689 demonstrates that compare-and-swap is a primitive more general
6690 than the other two, and so can be used to represent all of them.
6691 So the translation scheme for (eg) lock incl (%reg) is as
6692 follows:
6694 again:
6695 old = * %reg
6696 new = old + 1
6697 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6699 The "atomically" is the CAS bit. The scheme is always the same:
6700 get old value from memory, compute new value, atomically stuff
6701 new value back in memory iff the old value has not changed (iow,
6702 no other thread modified it in the meantime). If it has changed
6703 then we've been out-raced and we have to start over.
6705 Now that's all very neat, but it has the bad side effect of
6706 introducing an explicit equality test into the translation.
6707 Consider the behaviour of said code on a memory location which
6708 is uninitialised. We will wind up doing a comparison on
6709 uninitialised data, and mc duly complains.
6711 What's difficult about this is, the common case is that the
6712 location is uncontended, and so we're usually comparing the same
6713 value (* %reg) with itself. So we shouldn't complain even if it
6714 is undefined. But mc doesn't know that.
6716 My solution is to mark the == in the IR specially, so as to tell
6717 mc that it almost certainly compares a value with itself, and we
6718 should just regard the result as always defined. Rather than
6719 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6720 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6722 So there's always the question of, can this give a false
6723 negative? eg, imagine that initially, * %reg is defined; and we
6724 read that; but then in the gap between the read and the CAS, a
6725 different thread writes an undefined (and different) value at
6726 the location. Then the CAS in this thread will fail and we will
6727 go back to "again:", but without knowing that the trip back
6728 there was based on an undefined comparison. No matter; at least
6729 the other thread won the race and the location is correctly
6730 marked as undefined. What if it wrote an uninitialised version
6731 of the same value that was there originally, though?
6733 etc etc. Seems like there's a small corner case in which we
6734 might lose the fact that something's defined -- we're out-raced
6735 in between the "old = * reg" and the "atomically {", _and_ the
6736 other thread is writing in an undefined version of what's
6737 already there. Well, that seems pretty unlikely.
6741 If we ever need to reinstate it .. code which generates a
6742 definedness test for "expected == old" was removed at r10432 of
6743 this file.
6745 if (cas->oldHi == IRTemp_INVALID) {
6746 do_shadow_CAS_single( mce, cas );
6747 } else {
6748 do_shadow_CAS_double( mce, cas );
6753 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6755 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6756 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6757 IRAtom *voldLo = NULL, *boldLo = NULL;
6758 IRAtom *expd_eq_old = NULL;
6759 IROp opCasCmpEQ;
6760 Int elemSzB;
6761 IRType elemTy;
6762 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6764 /* single CAS */
6765 tl_assert(cas->oldHi == IRTemp_INVALID);
6766 tl_assert(cas->expdHi == NULL);
6767 tl_assert(cas->dataHi == NULL);
6769 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6770 switch (elemTy) {
6771 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6772 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6773 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6774 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6775 default: tl_assert(0); /* IR defn disallows any other types */
6778 /* 1. fetch data# (the proposed new value) */
6779 tl_assert(isOriginalAtom(mce, cas->dataLo));
6780 vdataLo
6781 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6782 tl_assert(isShadowAtom(mce, vdataLo));
6783 if (otrak) {
6784 bdataLo
6785 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6786 tl_assert(isShadowAtom(mce, bdataLo));
6789 /* 2. fetch expected# (what we expect to see at the address) */
6790 tl_assert(isOriginalAtom(mce, cas->expdLo));
6791 vexpdLo
6792 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6793 tl_assert(isShadowAtom(mce, vexpdLo));
6794 if (otrak) {
6795 bexpdLo
6796 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6797 tl_assert(isShadowAtom(mce, bexpdLo));
6800 /* 3. check definedness of address */
6801 /* 4. fetch old# from shadow memory; this also checks
6802 addressability of the address */
6803 voldLo
6804 = assignNew(
6805 'V', mce, elemTy,
6806 expr2vbits_Load(
6807 mce,
6808 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6809 NULL/*always happens*/
6811 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6812 if (otrak) {
6813 boldLo
6814 = assignNew('B', mce, Ity_I32,
6815 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6816 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6819 /* 5. the CAS itself */
6820 stmt( 'C', mce, IRStmt_CAS(cas) );
6822 /* 6. compute "expected == old" */
6823 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6824 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6825 tree, but it's not copied from the input block. */
6826 expd_eq_old
6827 = assignNew('C', mce, Ity_I1,
6828 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6830 /* 7. if "expected == old"
6831 store data# to shadow memory */
6832 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6833 NULL/*data*/, vdataLo/*vdata*/,
6834 expd_eq_old/*guard for store*/ );
6835 if (otrak) {
6836 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6837 bdataLo/*bdata*/,
6838 expd_eq_old/*guard for store*/ );
6843 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6845 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6846 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6847 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6848 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6849 IRAtom *voldHi = NULL, *boldHi = NULL;
6850 IRAtom *voldLo = NULL, *boldLo = NULL;
6851 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6852 IRAtom *expd_eq_old = NULL, *zero = NULL;
6853 IROp opCasCmpEQ, opOr, opXor;
6854 Int elemSzB, memOffsLo, memOffsHi;
6855 IRType elemTy;
6856 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6858 /* double CAS */
6859 tl_assert(cas->oldHi != IRTemp_INVALID);
6860 tl_assert(cas->expdHi != NULL);
6861 tl_assert(cas->dataHi != NULL);
6863 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6864 switch (elemTy) {
6865 case Ity_I8:
6866 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6867 elemSzB = 1; zero = mkU8(0);
6868 break;
6869 case Ity_I16:
6870 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6871 elemSzB = 2; zero = mkU16(0);
6872 break;
6873 case Ity_I32:
6874 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6875 elemSzB = 4; zero = mkU32(0);
6876 break;
6877 case Ity_I64:
6878 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6879 elemSzB = 8; zero = mkU64(0);
6880 break;
6881 default:
6882 tl_assert(0); /* IR defn disallows any other types */
6885 /* 1. fetch data# (the proposed new value) */
6886 tl_assert(isOriginalAtom(mce, cas->dataHi));
6887 tl_assert(isOriginalAtom(mce, cas->dataLo));
6888 vdataHi
6889 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6890 vdataLo
6891 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6892 tl_assert(isShadowAtom(mce, vdataHi));
6893 tl_assert(isShadowAtom(mce, vdataLo));
6894 if (otrak) {
6895 bdataHi
6896 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6897 bdataLo
6898 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6899 tl_assert(isShadowAtom(mce, bdataHi));
6900 tl_assert(isShadowAtom(mce, bdataLo));
6903 /* 2. fetch expected# (what we expect to see at the address) */
6904 tl_assert(isOriginalAtom(mce, cas->expdHi));
6905 tl_assert(isOriginalAtom(mce, cas->expdLo));
6906 vexpdHi
6907 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6908 vexpdLo
6909 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6910 tl_assert(isShadowAtom(mce, vexpdHi));
6911 tl_assert(isShadowAtom(mce, vexpdLo));
6912 if (otrak) {
6913 bexpdHi
6914 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6915 bexpdLo
6916 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6917 tl_assert(isShadowAtom(mce, bexpdHi));
6918 tl_assert(isShadowAtom(mce, bexpdLo));
6921 /* 3. check definedness of address */
6922 /* 4. fetch old# from shadow memory; this also checks
6923 addressability of the address */
6924 if (cas->end == Iend_LE) {
6925 memOffsLo = 0;
6926 memOffsHi = elemSzB;
6927 } else {
6928 tl_assert(cas->end == Iend_BE);
6929 memOffsLo = elemSzB;
6930 memOffsHi = 0;
6932 voldHi
6933 = assignNew(
6934 'V', mce, elemTy,
6935 expr2vbits_Load(
6936 mce,
6937 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6938 NULL/*always happens*/
6940 voldLo
6941 = assignNew(
6942 'V', mce, elemTy,
6943 expr2vbits_Load(
6944 mce,
6945 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6946 NULL/*always happens*/
6948 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6949 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6950 if (otrak) {
6951 boldHi
6952 = assignNew('B', mce, Ity_I32,
6953 gen_load_b(mce, elemSzB, cas->addr,
6954 memOffsHi/*addr bias*/));
6955 boldLo
6956 = assignNew('B', mce, Ity_I32,
6957 gen_load_b(mce, elemSzB, cas->addr,
6958 memOffsLo/*addr bias*/));
6959 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6960 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6963 /* 5. the CAS itself */
6964 stmt( 'C', mce, IRStmt_CAS(cas) );
6966 /* 6. compute "expected == old" */
6967 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6968 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6969 tree, but it's not copied from the input block. */
6971 /* xHi = oldHi ^ expdHi;
6972    xLo = oldLo ^ expdLo;
6973    xHL = xHi | xLo;
6974    expd_eq_old = xHL == 0;  */
6976 xHi = assignNew('C', mce, elemTy,
6977 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6978 xLo = assignNew('C', mce, elemTy,
6979 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6980 xHL = assignNew('C', mce, elemTy,
6981 binop(opOr, xHi, xLo));
6982 expd_eq_old
6983 = assignNew('C', mce, Ity_I1,
6984 binop(opCasCmpEQ, xHL, zero));
6986 /* 7. if "expected == old"
6987 store data# to shadow memory */
6988 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6989 NULL/*data*/, vdataHi/*vdata*/,
6990 expd_eq_old/*guard for store*/ );
6991 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6992 NULL/*data*/, vdataLo/*vdata*/,
6993 expd_eq_old/*guard for store*/ );
6994 if (otrak) {
6995 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6996 bdataHi/*bdata*/,
6997 expd_eq_old/*guard for store*/ );
6998 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6999 bdataLo/*bdata*/,
7000 expd_eq_old/*guard for store*/ );
7005 /* ------ Dealing with LL/SC (not difficult) ------ */
7007 static void do_shadow_LLSC ( MCEnv* mce,
7008 IREndness stEnd,
7009 IRTemp stResult,
7010 IRExpr* stAddr,
7011 IRExpr* stStoredata )
7013 /* In short: treat a load-linked like a normal load followed by an
7014 assignment of the loaded (shadow) data to the result temporary.
7015 Treat a store-conditional like a normal store, and mark the
7016 result temporary as defined. */
7017 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
7018 IRTemp resTmp = findShadowTmpV(mce, stResult);
7020 tl_assert(isIRAtom(stAddr));
7021 if (stStoredata)
7022 tl_assert(isIRAtom(stStoredata));
7024 if (stStoredata == NULL) {
7025 /* Load Linked */
7026 /* Just treat this as a normal load, followed by an assignment of
7027 the value to .result. */
7028 /* Stay sane */
7029 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7030 || resTy == Ity_I16 || resTy == Ity_I8);
7031 assign( 'V', mce, resTmp,
7032 expr2vbits_Load(
7033 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
7034 NULL/*always happens*/) );
7035 } else {
7036 /* Store Conditional */
7037 /* Stay sane */
7038 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
7039 stStoredata);
7040 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
7041 || dataTy == Ity_I16 || dataTy == Ity_I8);
7042 do_shadow_Store( mce, stEnd,
7043 stAddr, 0/* addr bias */,
7044 stStoredata,
7045 NULL /* shadow data */,
7046 NULL/*guard*/ );
7047 /* This is a store conditional, so it writes to .result a value
7048 indicating whether or not the store succeeded. Just claim
7049 this value is always defined. In the PowerPC interpretation
7050 of store-conditional, definedness of the success indication
7051 depends on whether the address of the store matches the
7052 reservation address. But we can't tell that here (and
7053 anyway, we're not being PowerPC-specific). At least we are
7054 guaranteed that the definedness of the store address, and its
7055 addressability, will be checked as per normal. So it seems
7056 pretty safe to just say that the success indication is always
7057 defined.
7059 In schemeS, for origin tracking, we must correspondingly set
7060 a no-origin value for the origin shadow of .result.
7062 tl_assert(resTy == Ity_I1);
7063 assign( 'V', mce, resTmp, definedOfType(resTy) );
7068 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7070 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
7072 complainIfUndefined(mce, sg->guard, NULL);
7073 /* do_shadow_Store will generate code to check the definedness and
7074 validity of sg->addr, in the case where sg->guard evaluates to
7075 True at run-time. */
7076 do_shadow_Store( mce, sg->end,
7077 sg->addr, 0/* addr bias */,
7078 sg->data,
7079 NULL /* shadow data */,
7080 sg->guard );
7083 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
7085 complainIfUndefined(mce, lg->guard, NULL);
7086 /* expr2vbits_Load_guarded_General will generate code to check the
7087 definedness and validity of lg->addr, in the case where
7088 lg->guard evaluates to True at run-time. */
7090 /* Look at the LoadG's built-in conversion operation, to determine
7091 the source (actual loaded data) type, and the equivalent IROp.
7092 NOTE that implicitly we are taking a widening operation to be
7093 applied to original atoms and producing one that applies to V
7094 bits. Since signed and unsigned widening are self-shadowing,
7095 this is a straight copy of the op (modulo swapping from the
7096 IRLoadGOp form to the IROp form). Note also therefore that this
7097 implicitly duplicates the logic to do with said widening ops in
7098 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
7099 IROp vwiden = Iop_INVALID;
7100 IRType loadedTy = Ity_INVALID;
7101 switch (lg->cvt) {
7102 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
7103 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
7104 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
7105 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
7106 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
7107 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
7108 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
7109 default: VG_(tool_panic)("do_shadow_LoadG");
7112 IRAtom* vbits_alt
7113 = expr2vbits( mce, lg->alt, HuOth );
7114 IRAtom* vbits_final
7115 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7116 lg->addr, 0/*addr bias*/,
7117 lg->guard, vwiden, vbits_alt );
7118 /* And finally, bind the V bits to the destination temporary. */
7119 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
7123 /*------------------------------------------------------------*/
7124 /*--- Origin tracking stuff ---*/
7125 /*------------------------------------------------------------*/
7127 /* Almost identical to findShadowTmpV. */
7128 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7130 TempMapEnt* ent;
7131 /* VG_(indexXA) range-checks 'orig', hence no need to check
7132 here. */
7133 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7134 tl_assert(ent->kind == Orig);
7135 if (ent->shadowB == IRTemp_INVALID) {
7136 IRTemp tmpB
7137 = newTemp( mce, Ity_I32, BSh );
7138 /* newTemp may cause mce->tmpMap to resize, hence previous results
7139 from VG_(indexXA) are invalid. */
7140 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7141 tl_assert(ent->kind == Orig);
7142 tl_assert(ent->shadowB == IRTemp_INVALID);
7143 ent->shadowB = tmpB;
7145 return ent->shadowB;
7148 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7150 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
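/* Added note (not from the original source): origin tags are merged
   with an unsigned max, so any real (nonzero) otag dominates the zero
   value that means "unknown origin". */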
7154 /* Make a guarded origin load, with no special handling in the
7155 didn't-happen case. A GUARD of NULL is assumed to mean "always
7156 True".
7158 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7159 return the otag. The loaded size is SZB. If GUARD evaluates to
7160 False at run time then the returned otag is zero.
7162 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7163 IRAtom* baseaddr,
7164 Int offset, IRExpr* guard )
7166 void* hFun;
7167 const HChar* hName;
7168 IRTemp bTmp;
7169 IRDirty* di;
7170 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7171 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7172 IRAtom* ea = baseaddr;
7173 if (offset != 0) {
7174 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7175 : mkU64( (Long)(Int)offset );
7176 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7178 bTmp = newTemp(mce, mce->hWordTy, BSh);
7180 switch (szB) {
7181 case 1: hFun = (void*)&MC_(helperc_b_load1);
7182 hName = "MC_(helperc_b_load1)";
7183 break;
7184 case 2: hFun = (void*)&MC_(helperc_b_load2);
7185 hName = "MC_(helperc_b_load2)";
7186 break;
7187 case 4: hFun = (void*)&MC_(helperc_b_load4);
7188 hName = "MC_(helperc_b_load4)";
7189 break;
7190 case 8: hFun = (void*)&MC_(helperc_b_load8);
7191 hName = "MC_(helperc_b_load8)";
7192 break;
7193 case 16: hFun = (void*)&MC_(helperc_b_load16);
7194 hName = "MC_(helperc_b_load16)";
7195 break;
7196 case 32: hFun = (void*)&MC_(helperc_b_load32);
7197 hName = "MC_(helperc_b_load32)";
7198 break;
7199 default:
7200 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7201 tl_assert(0);
7203 di = unsafeIRDirty_1_N(
7204 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7205 mkIRExprVec_1( ea )
7207 if (guard) {
7208 di->guard = guard;
7209 /* Ideally the didn't-happen return value here would be
7210 all-zeroes (unknown-origin), so it'd be harmless if it got
7211 used inadvertently. We slum it out with the IR-mandated
7212 default value (0b01 repeating, 0x55 etc) as that'll probably
7213 trump all legitimate otags via Max32, and it's pretty
7214 obviously bogus. */
7216 /* no need to mess with any annotations. This call accesses
7217 neither guest state nor guest memory. */
7218 stmt( 'B', mce, IRStmt_Dirty(di) );
7219 if (mce->hWordTy == Ity_I64) {
7220 /* 64-bit host */
7221 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7222 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7223 return mkexpr(bTmp32);
7224 } else {
7225 /* 32-bit host */
7226 return mkexpr(bTmp);
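/* Illustrative sketch (added comment, not from the original source):
   for szB == 4, offset == 8 and a 64-bit host, the code above emits
   roughly

      tEA      = Add64(baseaddr, 0x8:I64)
      tB:I64   = DIRTY guard ::: MC_(helperc_b_load4)(tEA)
      tB32:I32 = 64to32(tB)

   and returns tB32 as the otag. */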
7231 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7232 loaded size is SZB. The load is regarded as unconditional (always
7233 happens).
7235 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7236 Int offset )
7238 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7242 /* The most general handler for guarded origin loads. A GUARD of NULL
7243 is assumed to mean "always True".
7245 Generate IR to do a shadow origin load from ADDR+BIAS and return
7246 the B bits. The loaded type is TY. If GUARD evaluates to False at
7247 run time then the returned B bits are simply BALT instead.
7249 static
7250 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7251 IRType ty,
7252 IRAtom* addr, UInt bias,
7253 IRAtom* guard, IRAtom* balt )
7255 /* If the guard evaluates to True, this will hold the loaded
7256 origin. If the guard evaluates to False, this will be zero,
7257 meaning "unknown origin", in which case we will have to replace
7258 it using an ITE below. */
7259 IRAtom* iftrue
7260 = assignNew('B', mce, Ity_I32,
7261 gen_guarded_load_b(mce, sizeofIRType(ty),
7262 addr, bias, guard));
7263 /* These are the bits we will return if the load doesn't take
7264 place. */
7265 IRAtom* iffalse
7266 = balt;
7267 /* Prepare the cond for the ITE. Convert a NULL cond into
7268 something that iropt knows how to fold out later. */
7269 IRAtom* cond
7270 = guard == NULL ? mkU1(1) : guard;
7271 /* And assemble the final result. */
7272 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
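/* Illustrative sketch (added comment, not from the original source):
   for TY == Ity_I64 and guard g, this generates roughly

      tLd  = <guarded MC_(helperc_b_load8) call>
      tRes = ITE(g, tLd, balt)

   so whatever the dirty call returned in the g == False case is
   discarded in favour of BALT. */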
7276 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7277 the store really happens; NULL means it unconditionally does. */
7278 static void gen_store_b ( MCEnv* mce, Int szB,
7279 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7280 IRAtom* guard )
7282 void* hFun;
7283 const HChar* hName;
7284 IRDirty* di;
7285 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7286 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7287 IRAtom* ea = baseaddr;
7288 if (guard) {
7289 tl_assert(isOriginalAtom(mce, guard));
7290 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7292 if (offset != 0) {
7293 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7294 : mkU64( (Long)(Int)offset );
7295 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7297 if (mce->hWordTy == Ity_I64)
7298 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7300 switch (szB) {
7301 case 1: hFun = (void*)&MC_(helperc_b_store1);
7302 hName = "MC_(helperc_b_store1)";
7303 break;
7304 case 2: hFun = (void*)&MC_(helperc_b_store2);
7305 hName = "MC_(helperc_b_store2)";
7306 break;
7307 case 4: hFun = (void*)&MC_(helperc_b_store4);
7308 hName = "MC_(helperc_b_store4)";
7309 break;
7310 case 8: hFun = (void*)&MC_(helperc_b_store8);
7311 hName = "MC_(helperc_b_store8)";
7312 break;
7313 case 16: hFun = (void*)&MC_(helperc_b_store16);
7314 hName = "MC_(helperc_b_store16)";
7315 break;
7316 case 32: hFun = (void*)&MC_(helperc_b_store32);
7317 hName = "MC_(helperc_b_store32)";
7318 break;
7319 default:
7320 tl_assert(0);
7322 di = unsafeIRDirty_0_N( 2/*regparms*/,
7323 hName, VG_(fnptr_to_fnentry)( hFun ),
7324 mkIRExprVec_2( ea, dataB )
7326 /* no need to mess with any annotations. This call accesses
7327 neither guest state nor guest memory. */
7328 if (guard) di->guard = guard;
7329 stmt( 'B', mce, IRStmt_Dirty(di) );
7332 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7333 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7334 if (eTy == Ity_I64)
7335 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7336 if (eTy == Ity_I32)
7337 return e;
7338 tl_assert(0);
7341 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7342 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7343 tl_assert(eTy == Ity_I32);
7344 if (dstTy == Ity_I64)
7345 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7346 tl_assert(0);
7350 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7352 tl_assert(MC_(clo_mc_level) == 3);
7354 switch (e->tag) {
7356 case Iex_GetI: {
7357 IRRegArray* descr_b;
7358 IRAtom *t1, *t2, *t3, *t4;
7359 IRRegArray* descr = e->Iex.GetI.descr;
7360 IRType equivIntTy
7361 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7362 /* If this array is unshadowable for whatever reason, use the
7363 usual approximation. */
7364 if (equivIntTy == Ity_INVALID)
7365 return mkU32(0);
7366 tl_assert(sizeofIRType(equivIntTy) >= 4);
7367 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7368 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7369 equivIntTy, descr->nElems );
7370 /* Do a shadow indexed get of the same size, giving t1. Take
7371 the bottom 32 bits of it, giving t2. Compute into t3 the
7372 origin for the index (almost certainly zero, but there's
7373 no harm in being completely general here, since iropt will
7374 remove any useless code), and fold it in, giving a final
7375 value t4. */
7376 t1 = assignNew( 'B', mce, equivIntTy,
7377 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7378 e->Iex.GetI.bias ));
7379 t2 = narrowTo32( mce, t1 );
7380 t3 = schemeE( mce, e->Iex.GetI.ix );
7381 t4 = gen_maxU32( mce, t2, t3 );
7382 return t4;
7384 case Iex_CCall: {
7385 Int i;
7386 IRAtom* here;
7387 IRExpr** args = e->Iex.CCall.args;
7388 IRAtom* curr = mkU32(0);
7389 for (i = 0; args[i]; i++) {
7390 tl_assert(i < 32);
7391 tl_assert(isOriginalAtom(mce, args[i]));
7392 /* Only take notice of this arg if the callee's
7393 mc-exclusion mask does not say it is to be excluded. */
7394 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7395 /* the arg is to be excluded from definedness checking.
7396 Do nothing. */
7397 if (0) VG_(printf)("excluding %s(%d)\n",
7398 e->Iex.CCall.cee->name, i);
7399 } else {
7400 /* calculate the arg's origin, and pessimistically
7401 merge it in. */
7402 here = schemeE( mce, args[i] );
7403 curr = gen_maxU32( mce, curr, here );
7406 return curr;
7408 case Iex_Load: {
7409 Int dszB;
7410 dszB = sizeofIRType(e->Iex.Load.ty);
7411 /* assert that the B value for the address is already
7412 available (somewhere) */
7413 tl_assert(isIRAtom(e->Iex.Load.addr));
7414 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7415 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7417 case Iex_ITE: {
7418 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7419 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7420 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7421 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7423 case Iex_Qop: {
7424 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7425 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7426 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7427 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7428 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7429 gen_maxU32( mce, b3, b4 ) );
7431 case Iex_Triop: {
7432 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7433 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7434 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7435 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7437 case Iex_Binop: {
7438 switch (e->Iex.Binop.op) {
7439 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7440 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7441 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7442 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7443 /* Just say these all produce a defined result,
7444 regardless of their arguments. See
7445 COMMENT_ON_CasCmpEQ in this file. */
7446 return mkU32(0);
7447 default: {
7448 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7449 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7450 return gen_maxU32( mce, b1, b2 );
7453 tl_assert(0);
7454 /*NOTREACHED*/
7456 case Iex_Unop: {
7457 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7458 return b1;
7460 case Iex_Const:
7461 return mkU32(0);
7462 case Iex_RdTmp:
7463 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7464 case Iex_Get: {
7465 Int b_offset = MC_(get_otrack_shadow_offset)(
7466 e->Iex.Get.offset,
7467 sizeofIRType(e->Iex.Get.ty)
7469 tl_assert(b_offset >= -1
7470 && b_offset <= mce->layout->total_sizeB -4);
7471 if (b_offset >= 0) {
7472 /* FIXME: this isn't an atom! */
7473 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7474 Ity_I32 );
7476 return mkU32(0);
7478 default:
7479 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7480 ppIRExpr(e);
7481 VG_(tool_panic)("memcheck:schemeE");
7486 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7488 // This is a hacked version of do_shadow_Dirty
7489 Int i, k, n, toDo, gSz, gOff;
7490 IRAtom *here, *curr;
7491 IRTemp dst;
7493 /* First check the guard. */
7494 curr = schemeE( mce, d->guard );
7496 /* Now round up all inputs and maxU32 over them. */
7498 /* Inputs: unmasked args
7499 Note: arguments are evaluated REGARDLESS of the guard expression */
7500 for (i = 0; d->args[i]; i++) {
7501 IRAtom* arg = d->args[i];
7502 if ( (d->cee->mcx_mask & (1<<i))
7503 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7504 /* ignore this arg */
7505 } else {
7506 here = schemeE( mce, arg );
7507 curr = gen_maxU32( mce, curr, here );
7511 /* Inputs: guest state that we read. */
7512 for (i = 0; i < d->nFxState; i++) {
7513 tl_assert(d->fxState[i].fx != Ifx_None);
7514 if (d->fxState[i].fx == Ifx_Write)
7515 continue;
7517 /* Enumerate the described state segments */
7518 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7519 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7520 gSz = d->fxState[i].size;
7522 /* Ignore any sections marked as 'always defined'. */
7523 if (isAlwaysDefd(mce, gOff, gSz)) {
7524 if (0)
7525 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7526 gOff, gSz);
7527 continue;
7530 /* This state element is read or modified. So we need to
7531 consider it. If larger than 4 bytes, deal with it in
7532 4-byte chunks. */
7533 while (True) {
7534 Int b_offset;
7535 tl_assert(gSz >= 0);
7536 if (gSz == 0) break;
7537 n = gSz <= 4 ? gSz : 4;
7538 /* update 'curr' with maxU32 of the state slice
7539 gOff .. gOff+n-1 */
7540 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7541 if (b_offset != -1) {
7542 /* Observe the guard expression. If it is false use 0, i.e.
7543 nothing is known about the origin */
7544 IRAtom *cond, *iffalse, *iftrue;
7546 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7547 iffalse = mkU32(0);
7548 iftrue = assignNew( 'B', mce, Ity_I32,
7549 IRExpr_Get(b_offset
7550 + 2*mce->layout->total_sizeB,
7551 Ity_I32));
7552 here = assignNew( 'B', mce, Ity_I32,
7553 IRExpr_ITE(cond, iftrue, iffalse));
7554 curr = gen_maxU32( mce, curr, here );
7556 gSz -= n;
7557 gOff += n;
7562 /* Inputs: memory */
7564 if (d->mFx != Ifx_None) {
7565 /* Because we may do multiple shadow loads/stores from the same
7566 base address, it's best to do a single test of its
7567 definedness right now. Post-instrumentation optimisation
7568 should remove all but this test. */
7569 tl_assert(d->mAddr);
7570 here = schemeE( mce, d->mAddr );
7571 curr = gen_maxU32( mce, curr, here );
7574 /* Deal with memory inputs (reads or modifies) */
7575 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7576 toDo = d->mSize;
7577 /* chew off 32-bit chunks. We don't care about the endianness
7578 since it's all going to be condensed down to a single bit,
7579 but nevertheless choose an endianness which is hopefully
7580 native to the platform. */
7581 while (toDo >= 4) {
7582 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7583 d->guard );
7584 curr = gen_maxU32( mce, curr, here );
7585 toDo -= 4;
7587 /* handle possible 16-bit excess */
7588 while (toDo >= 2) {
7589 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7590 d->guard );
7591 curr = gen_maxU32( mce, curr, here );
7592 toDo -= 2;
7594 /* chew off the remaining 8-bit chunk, if any */
7595 if (toDo == 1) {
7596 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7597 d->guard );
7598 curr = gen_maxU32( mce, curr, here );
7599 toDo -= 1;
7601 tl_assert(toDo == 0);
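/* Worked example (added comment): for d->mSize == 7, the loops above
   issue a 4-byte origin load at offset 0, a 2-byte load at offset 4
   and a 1-byte load at offset 6, maxU32-ing each result into 'curr'. */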
7604 /* Whew! So curr is a 32-bit B-value which should give an origin
7605 of some use if any of the inputs to the helper are undefined.
7606 Now we need to re-distribute the results to all destinations. */
7608 /* Outputs: the destination temporary, if there is one. */
7609 if (d->tmp != IRTemp_INVALID) {
7610 dst = findShadowTmpB(mce, d->tmp);
7611 assign( 'V', mce, dst, curr );
7614 /* Outputs: guest state that we write or modify. */
7615 for (i = 0; i < d->nFxState; i++) {
7616 tl_assert(d->fxState[i].fx != Ifx_None);
7617 if (d->fxState[i].fx == Ifx_Read)
7618 continue;
7620 /* Enumerate the described state segments */
7621 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7622 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7623 gSz = d->fxState[i].size;
7625 /* Ignore any sections marked as 'always defined'. */
7626 if (isAlwaysDefd(mce, gOff, gSz))
7627 continue;
7629 /* This state element is written or modified. So we need to
7630 consider it. If larger than 4 bytes, deal with it in
7631 4-byte chunks. */
7632 while (True) {
7633 Int b_offset;
7634 tl_assert(gSz >= 0);
7635 if (gSz == 0) break;
7636 n = gSz <= 4 ? gSz : 4;
7637 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7638 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7639 if (b_offset != -1) {
7641 /* If the guard expression evaluates to false we simply Put
7642 the value that is already stored in the guest state slot */
7643 IRAtom *cond, *iffalse;
7645 cond = assignNew('B', mce, Ity_I1,
7646 d->guard);
7647 iffalse = assignNew('B', mce, Ity_I32,
7648 IRExpr_Get(b_offset +
7649 2*mce->layout->total_sizeB,
7650 Ity_I32));
7651 curr = assignNew('V', mce, Ity_I32,
7652 IRExpr_ITE(cond, curr, iffalse));
7654 stmt( 'B', mce, IRStmt_Put(b_offset
7655 + 2*mce->layout->total_sizeB,
7656 curr ));
7658 gSz -= n;
7659 gOff += n;
7664 /* Outputs: memory that we write or modify. Same comments about
7665 endianness as above apply. */
7666 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7667 toDo = d->mSize;
7668 /* chew off 32-bit chunks */
7669 while (toDo >= 4) {
7670 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7671 d->guard );
7672 toDo -= 4;
7674 /* handle possible 16-bit excess */
7675 while (toDo >= 2) {
7676 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7677 d->guard );
7678 toDo -= 2;
7680 /* chew off the remaining 8-bit chunk, if any */
7681 if (toDo == 1) {
7682 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7683 d->guard );
7684 toDo -= 1;
7686 tl_assert(toDo == 0);
7691 /* Generate IR for origin shadowing for a general guarded store. */
7692 static void do_origins_Store_guarded ( MCEnv* mce,
7693 IREndness stEnd,
7694 IRExpr* stAddr,
7695 IRExpr* stData,
7696 IRExpr* guard )
7698 Int dszB;
7699 IRAtom* dataB;
7700 /* assert that the B value for the address is already available
7701 (somewhere), since the call to schemeE will want to see it.
7702 XXXX how does this actually ensure that?? */
7703 tl_assert(isIRAtom(stAddr));
7704 tl_assert(isIRAtom(stData));
7705 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7706 dataB = schemeE( mce, stData );
7707 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7711 /* Generate IR for origin shadowing for a plain store. */
7712 static void do_origins_Store_plain ( MCEnv* mce,
7713 IREndness stEnd,
7714 IRExpr* stAddr,
7715 IRExpr* stData )
7717 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7718 NULL/*guard*/ );
7722 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7724 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7726 do_origins_Store_guarded( mce, sg->end, sg->addr,
7727 sg->data, sg->guard );
7730 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7732 IRType loadedTy = Ity_INVALID;
7733 switch (lg->cvt) {
7734 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7735 case ILGop_Ident64: loadedTy = Ity_I64; break;
7736 case ILGop_Ident32: loadedTy = Ity_I32; break;
7737 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7738 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7739 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7740 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7741 default: VG_(tool_panic)("schemeS.IRLoadG");
7743 IRAtom* ori_alt
7744 = schemeE( mce,lg->alt );
7745 IRAtom* ori_final
7746 = expr2ori_Load_guarded_General(mce, loadedTy,
7747 lg->addr, 0/*addr bias*/,
7748 lg->guard, ori_alt );
7749 /* And finally, bind the origin to the destination temporary. */
7750 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7754 static void schemeS ( MCEnv* mce, IRStmt* st )
7756 tl_assert(MC_(clo_mc_level) == 3);
7758 switch (st->tag) {
7760 case Ist_AbiHint:
7761 /* The value-check instrumenter handles this - by arranging
7762 to pass the address of the next instruction to
7763 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7764 happen for origin tracking w.r.t. AbiHints. So there is
7765 nothing to do here. */
7766 break;
7768 case Ist_PutI: {
7769 IRPutI *puti = st->Ist.PutI.details;
7770 IRRegArray* descr_b;
7771 IRAtom *t1, *t2, *t3, *t4;
7772 IRRegArray* descr = puti->descr;
7773 IRType equivIntTy
7774 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7775 /* If this array is unshadowable for whatever reason,
7776 generate no code. */
7777 if (equivIntTy == Ity_INVALID)
7778 break;
7779 tl_assert(sizeofIRType(equivIntTy) >= 4);
7780 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7781 descr_b
7782 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7783 equivIntTy, descr->nElems );
7784 /* Compute a value to Put - the conjoinment of the origin for
7785 the data to be Put-ted (obviously) and of the index value
7786 (not so obviously). */
7787 t1 = schemeE( mce, puti->data );
7788 t2 = schemeE( mce, puti->ix );
7789 t3 = gen_maxU32( mce, t1, t2 );
7790 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7791 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7792 puti->bias, t4) ));
7793 break;
7796 case Ist_Dirty:
7797 do_origins_Dirty( mce, st->Ist.Dirty.details );
7798 break;
7800 case Ist_Store:
7801 do_origins_Store_plain( mce, st->Ist.Store.end,
7802 st->Ist.Store.addr,
7803 st->Ist.Store.data );
7804 break;
7806 case Ist_StoreG:
7807 do_origins_StoreG( mce, st->Ist.StoreG.details );
7808 break;
7810 case Ist_LoadG:
7811 do_origins_LoadG( mce, st->Ist.LoadG.details );
7812 break;
7814 case Ist_LLSC: {
7815 /* In short: treat a load-linked like a normal load followed
7816 by an assignment of the loaded (shadow) data to the result
7817 temporary. Treat a store-conditional like a normal store,
7818 and mark the result temporary as defined. */
7819 if (st->Ist.LLSC.storedata == NULL) {
7820 /* Load Linked */
7821 IRType resTy
7822 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7823 IRExpr* vanillaLoad
7824 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7825 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7826 || resTy == Ity_I16 || resTy == Ity_I8);
7827 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7828 schemeE(mce, vanillaLoad));
7829 } else {
7830 /* Store conditional */
7831 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7832 st->Ist.LLSC.addr,
7833 st->Ist.LLSC.storedata );
7834 /* For the rationale behind this, see comments at the
7835 place where the V-shadow for .result is constructed, in
7836 do_shadow_LLSC. In short, we regard .result as
7837 always-defined. */
7838 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7839 mkU32(0) );
7841 break;
7844 case Ist_Put: {
7845 Int b_offset
7846 = MC_(get_otrack_shadow_offset)(
7847 st->Ist.Put.offset,
7848 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7850 if (b_offset >= 0) {
7851 /* FIXME: this isn't an atom! */
7852 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7853 schemeE( mce, st->Ist.Put.data )) );
7855 break;
7858 case Ist_WrTmp:
7859 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7860 schemeE(mce, st->Ist.WrTmp.data) );
7861 break;
7863 case Ist_MBE:
7864 case Ist_NoOp:
7865 case Ist_Exit:
7866 case Ist_IMark:
7867 break;
7869 default:
7870 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7871 ppIRStmt(st);
7872 VG_(tool_panic)("memcheck:schemeS");
7877 /*------------------------------------------------------------*/
7878 /*--- Post-tree-build final tidying ---*/
7879 /*------------------------------------------------------------*/
7881 /* This exploits the observation that Memcheck often produces
7882 repeated conditional calls of the form
7884 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7886 with the same guard expression G guarding the same helper call.
7887 The second and subsequent calls are redundant. This usually
7888 results from instrumentation of guest code containing multiple
7889 memory references at different constant offsets from the same base
7890 register. After optimisation of the instrumentation, you get a
7891 test for the definedness of the base register for each memory
7892 reference, which is kinda pointless. MC_(final_tidy) therefore
7893 looks for such repeated calls and removes all but the first. */
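/* Illustrative example (added comment, not from the original source):
   after instrumenting two loads at different constant offsets from the
   same base register, the block may contain two statements of the form

      DIRTY tGuard ::: MC_(helperc_value_check8_fail_no_o)()

   with the same guard tGuard (the definedness test of the base
   register).  MC_(final_tidy) spots the repeated (helper, guard) pair
   and turns the second call into a NoOp. */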
7896 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7897 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7898 get almost all the benefits of this transformation whilst causing
7899 the slide-back case to happen just often enough to be verifiably
7900 correct. For posterity, the numbers are:
7902 bz2-32
7904 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7905 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7906 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7907 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7908 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7909 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7910 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7911 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7912 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7913 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7914 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7915 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7916 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7918 bz2-64
7920 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7921 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7922 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7923 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7924 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7925 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7926 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7927 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7928 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7929 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7930 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7931 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7932 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7935 /* Structs for recording which (helper, guard) pairs we have already
7936 seen. */
7938 #define N_TIDYING_PAIRS 16
7940 typedef
7941 struct { void* entry; IRExpr* guard; }
7942 Pair;
7944 typedef
7945 struct {
7946 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7947 UInt pairsUsed;
7949 Pairs;
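/* Added note: the extra +1 slot is never used for real entries.
   MC_(final_tidy) fills it with sentinel values before scanning and
   asserts afterwards that they are untouched, as a cheap check
   against overrunning the array. */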
7952 /* Return True if e1 and e2 definitely denote the same value (used to
7953 compare guards). Return False if unknown; False is the safe
7954 answer. Since guest registers and guest memory do not have the
7955 SSA property we must return False if any Gets or Loads appear in
7956 the expression. This implicitly assumes that e1 and e2 have the
7957 same IR type, which is always true here -- the type is Ity_I1. */
7959 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7961 if (e1->tag != e2->tag)
7962 return False;
7963 switch (e1->tag) {
7964 case Iex_Const:
7965 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7966 case Iex_Binop:
7967 return e1->Iex.Binop.op == e2->Iex.Binop.op
7968 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7969 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7970 case Iex_Unop:
7971 return e1->Iex.Unop.op == e2->Iex.Unop.op
7972 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7973 case Iex_RdTmp:
7974 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7975 case Iex_ITE:
7976 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7977 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7978 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7979 case Iex_Qop:
7980 case Iex_Triop:
7981 case Iex_CCall:
7982 /* be lazy. Could define equality for these, but they never
7983 appear to be used. */
7984 return False;
7985 case Iex_Get:
7986 case Iex_GetI:
7987 case Iex_Load:
7988 /* be conservative - these may not give the same value each
7989 time */
7990 return False;
7991 case Iex_Binder:
7992 /* should never see this */
7993 /* fallthrough */
7994 default:
7995 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7996 ppIRExpr(e1);
7997 VG_(tool_panic)("memcheck:sameIRValue");
7998 return False;
8002 /* See if 'pairs' already has an entry for (entry, guard). Return
8003 True if so. If not, add an entry. */
8005 static
8006 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
8008 UInt i, n = tidyingEnv->pairsUsed;
8009 tl_assert(n <= N_TIDYING_PAIRS);
8010 for (i = 0; i < n; i++) {
8011 if (tidyingEnv->pairs[i].entry == entry
8012 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
8013 return True;
8015 /* (guard, entry) wasn't found in the array. Add it at the end.
8016 If the array is already full, slide the entries one slot
8017 backwards. This means we will lose the ability to detect
8018 duplicates from the pair in slot zero, but that happens so
8019 rarely that it's unlikely to have much effect on overall code
8020 quality. Also, this strategy loses the check for the oldest
8021 tracked exit (memory reference, basically) and so that is (I'd
8022 guess) least likely to be re-used after this point. */
8023 tl_assert(i == n);
8024 if (n == N_TIDYING_PAIRS) {
8025 for (i = 1; i < N_TIDYING_PAIRS; i++) {
8026 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
8028 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
8029 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
8030 } else {
8031 tl_assert(n < N_TIDYING_PAIRS);
8032 tidyingEnv->pairs[n].entry = entry;
8033 tidyingEnv->pairs[n].guard = guard;
8034 n++;
8035 tidyingEnv->pairsUsed = n;
8037 return False;
8040 static Bool is_helperc_value_checkN_fail ( const HChar* name )
8042 /* This is expensive because it happens a lot. We are checking to
8043 see whether |name| is one of the following 8 strings:
8045 MC_(helperc_value_check8_fail_no_o)
8046 MC_(helperc_value_check4_fail_no_o)
8047 MC_(helperc_value_check0_fail_no_o)
8048 MC_(helperc_value_check1_fail_no_o)
8049 MC_(helperc_value_check8_fail_w_o)
8050 MC_(helperc_value_check0_fail_w_o)
8051 MC_(helperc_value_check1_fail_w_o)
8052 MC_(helperc_value_check4_fail_w_o)
8054 To speed it up, check the common prefix just once, rather than
8055 all 8 times.
8057 const HChar* prefix = "MC_(helperc_value_check";
8059 HChar n, p;
8060 while (True) {
8061 n = *name;
8062 p = *prefix;
8063 if (p == 0) break; /* ran off the end of the prefix */
8064 /* We still have some prefix to use */
8065 if (n == 0) return False; /* have prefix, but name ran out */
8066 if (n != p) return False; /* have both pfx and name, but no match */
8067 name++;
8068 prefix++;
8071 /* Check the part after the prefix. */
8072 tl_assert(*prefix == 0 && *name != 0);
8073 return 0==VG_(strcmp)(name, "8_fail_no_o)")
8074 || 0==VG_(strcmp)(name, "4_fail_no_o)")
8075 || 0==VG_(strcmp)(name, "0_fail_no_o)")
8076 || 0==VG_(strcmp)(name, "1_fail_no_o)")
8077 || 0==VG_(strcmp)(name, "8_fail_w_o)")
8078 || 0==VG_(strcmp)(name, "4_fail_w_o)")
8079 || 0==VG_(strcmp)(name, "0_fail_w_o)")
8080 || 0==VG_(strcmp)(name, "1_fail_w_o)");
8083 IRSB* MC_(final_tidy) ( IRSB* sb_in )
8085 Int i;
8086 IRStmt* st;
8087 IRDirty* di;
8088 IRExpr* guard;
8089 IRCallee* cee;
8090 Bool alreadyPresent;
8091 Pairs pairs;
8093 pairs.pairsUsed = 0;
8095 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
8096 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
8098 /* Scan forwards through the statements. Each time a call to one
8099 of the relevant helpers is seen, check if we have made a
8100 previous call to the same helper using the same guard
8101 expression, and if so, delete the call. */
8102 for (i = 0; i < sb_in->stmts_used; i++) {
8103 st = sb_in->stmts[i];
8104 tl_assert(st);
8105 if (st->tag != Ist_Dirty)
8106 continue;
8107 di = st->Ist.Dirty.details;
8108 guard = di->guard;
8109 tl_assert(guard);
8110 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
8111 cee = di->cee;
8112 if (!is_helperc_value_checkN_fail( cee->name ))
8113 continue;
8114 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8115 guard 'guard'. Check if we have already seen a call to this
8116 function with the same guard. If so, delete it. If not,
8117 add it to the set of calls we do know about. */
8118 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8119 if (alreadyPresent) {
8120 sb_in->stmts[i] = IRStmt_NoOp();
8121 if (0) VG_(printf)("XX\n");
8125 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8126 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8128 return sb_in;
8131 #undef N_TIDYING_PAIRS
8134 /*------------------------------------------------------------*/
8135 /*--- Startup assertion checking ---*/
8136 /*------------------------------------------------------------*/
8138 void MC_(do_instrumentation_startup_checks)( void )
8140 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8141 is working as we expect. */
8143 # define CHECK(_expected, _string) \
8144 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8146 /* It should identify these 8, and no others, as targets. */
8147 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8148 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8149 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8150 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8151 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8152 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8153 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8154 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8156 /* Ad-hoc selection of other strings gathered via a quick test. */
8157 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8158 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8159 CHECK(False, "MC_(helperc_b_load1)");
8160 CHECK(False, "MC_(helperc_b_load2)");
8161 CHECK(False, "MC_(helperc_b_load4)");
8162 CHECK(False, "MC_(helperc_b_load8)");
8163 CHECK(False, "MC_(helperc_b_load16)");
8164 CHECK(False, "MC_(helperc_b_load32)");
8165 CHECK(False, "MC_(helperc_b_store1)");
8166 CHECK(False, "MC_(helperc_b_store2)");
8167 CHECK(False, "MC_(helperc_b_store4)");
8168 CHECK(False, "MC_(helperc_b_store8)");
8169 CHECK(False, "MC_(helperc_b_store16)");
8170 CHECK(False, "MC_(helperc_b_store32)");
8171 CHECK(False, "MC_(helperc_LOADV8)");
8172 CHECK(False, "MC_(helperc_LOADV16le)");
8173 CHECK(False, "MC_(helperc_LOADV32le)");
8174 CHECK(False, "MC_(helperc_LOADV64le)");
8175 CHECK(False, "MC_(helperc_LOADV128le)");
8176 CHECK(False, "MC_(helperc_LOADV256le)");
8177 CHECK(False, "MC_(helperc_STOREV16le)");
8178 CHECK(False, "MC_(helperc_STOREV32le)");
8179 CHECK(False, "MC_(helperc_STOREV64le)");
8180 CHECK(False, "MC_(helperc_STOREV8)");
8181 CHECK(False, "track_die_mem_stack_8");
8182 CHECK(False, "track_new_mem_stack_8_w_ECU");
8183 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8184 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8186 # undef CHECK
8190 /*------------------------------------------------------------*/
8191 /*--- Memcheck main ---*/
8192 /*------------------------------------------------------------*/
8194 static Bool isBogusAtom ( IRAtom* at )
8196 if (at->tag == Iex_RdTmp)
8197 return False;
8198 tl_assert(at->tag == Iex_Const);
8200 ULong n = 0;
8201 IRConst* con = at->Iex.Const.con;
8202 switch (con->tag) {
8203 case Ico_U1: return False;
8204 case Ico_U8: n = (ULong)con->Ico.U8; break;
8205 case Ico_U16: n = (ULong)con->Ico.U16; break;
8206 case Ico_U32: n = (ULong)con->Ico.U32; break;
8207 case Ico_U64: n = (ULong)con->Ico.U64; break;
8208 case Ico_F32: return False;
8209 case Ico_F64: return False;
8210 case Ico_F32i: return False;
8211 case Ico_F64i: return False;
8212 case Ico_V128: return False;
8213 case Ico_V256: return False;
8214 default: ppIRExpr(at); tl_assert(0);
8216 /* VG_(printf)("%llx\n", n); */
8217 /* Shortcuts */
8218 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8219 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8220 /* The list of bogus atoms is: */
8221 return (/*32*/ n == 0xFEFEFEFFULL
8222 /*32*/ || n == 0x80808080ULL
8223 /*32*/ || n == 0x7F7F7F7FULL
8224 /*32*/ || n == 0x7EFEFEFFULL
8225 /*32*/ || n == 0x81010100ULL
8226 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8227 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8228 /*64*/ || n == 0x0000000000008080ULL
8229 /*64*/ || n == 0x8080808080808080ULL
8230 /*64*/ || n == 0x0101010101010101ULL
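/* Added note (an interpretation, not from the original source): these
   constants look like the magic values used by word-at-a-time string
   scanning code (e.g. optimised strlen/strchr), which deliberately
   operates on partially defined words; seeing them is what triggers
   the switch to expensive Add/Sub instrumentation later on. */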
8235 /* Does 'st' mention any of the literals identified/listed in
8236 isBogusAtom()? */
8237 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8239 Int i;
8240 IRExpr* e;
8241 IRDirty* d;
8242 IRCAS* cas;
8243 switch (st->tag) {
8244 case Ist_WrTmp:
8245 e = st->Ist.WrTmp.data;
8246 switch (e->tag) {
8247 case Iex_Get:
8248 case Iex_RdTmp:
8249 return False;
8250 case Iex_Const:
8251 return isBogusAtom(e);
8252 case Iex_Unop:
8253 return isBogusAtom(e->Iex.Unop.arg)
8254 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8255 case Iex_GetI:
8256 return isBogusAtom(e->Iex.GetI.ix);
8257 case Iex_Binop:
8258 return isBogusAtom(e->Iex.Binop.arg1)
8259 || isBogusAtom(e->Iex.Binop.arg2);
8260 case Iex_Triop:
8261 return isBogusAtom(e->Iex.Triop.details->arg1)
8262 || isBogusAtom(e->Iex.Triop.details->arg2)
8263 || isBogusAtom(e->Iex.Triop.details->arg3);
8264 case Iex_Qop:
8265 return isBogusAtom(e->Iex.Qop.details->arg1)
8266 || isBogusAtom(e->Iex.Qop.details->arg2)
8267 || isBogusAtom(e->Iex.Qop.details->arg3)
8268 || isBogusAtom(e->Iex.Qop.details->arg4);
8269 case Iex_ITE:
8270 return isBogusAtom(e->Iex.ITE.cond)
8271 || isBogusAtom(e->Iex.ITE.iftrue)
8272 || isBogusAtom(e->Iex.ITE.iffalse);
8273 case Iex_Load:
8274 return isBogusAtom(e->Iex.Load.addr);
8275 case Iex_CCall:
8276 for (i = 0; e->Iex.CCall.args[i]; i++)
8277 if (isBogusAtom(e->Iex.CCall.args[i]))
8278 return True;
8279 return False;
8280 default:
8281 goto unhandled;
8283 case Ist_Dirty:
8284 d = st->Ist.Dirty.details;
8285 for (i = 0; d->args[i]; i++) {
8286 IRAtom* atom = d->args[i];
8287 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8288 if (isBogusAtom(atom))
8289 return True;
8292 if (isBogusAtom(d->guard))
8293 return True;
8294 if (d->mAddr && isBogusAtom(d->mAddr))
8295 return True;
8296 return False;
8297 case Ist_Put:
8298 return isBogusAtom(st->Ist.Put.data);
8299 case Ist_PutI:
8300 return isBogusAtom(st->Ist.PutI.details->ix)
8301 || isBogusAtom(st->Ist.PutI.details->data);
8302 case Ist_Store:
8303 return isBogusAtom(st->Ist.Store.addr)
8304 || isBogusAtom(st->Ist.Store.data);
8305 case Ist_StoreG: {
8306 IRStoreG* sg = st->Ist.StoreG.details;
8307 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8308 || isBogusAtom(sg->guard);
8310 case Ist_LoadG: {
8311 IRLoadG* lg = st->Ist.LoadG.details;
8312 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8313 || isBogusAtom(lg->guard);
8315 case Ist_Exit:
8316 return isBogusAtom(st->Ist.Exit.guard);
8317 case Ist_AbiHint:
8318 return isBogusAtom(st->Ist.AbiHint.base)
8319 || isBogusAtom(st->Ist.AbiHint.nia);
8320 case Ist_NoOp:
8321 case Ist_IMark:
8322 case Ist_MBE:
8323 return False;
8324 case Ist_CAS:
8325 cas = st->Ist.CAS.details;
8326 return isBogusAtom(cas->addr)
8327 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8328 || isBogusAtom(cas->expdLo)
8329 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8330 || isBogusAtom(cas->dataLo);
8331 case Ist_LLSC:
8332 return isBogusAtom(st->Ist.LLSC.addr)
8333 || (st->Ist.LLSC.storedata
8334 ? isBogusAtom(st->Ist.LLSC.storedata)
8335 : False);
8336 default:
8337 unhandled:
8338 ppIRStmt(st);
8339 VG_(tool_panic)("hasBogusLiterals");
8344 /* This is the pre-instrumentation analysis. It does a backwards pass over
8345 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8346 the block.
8348 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8349 as a positive result from that is a strong indication that we need to
8350 expensively instrument add/sub in the block. We do both analyses in one
8351 pass, even though they are independent, so as to avoid the overhead of
8352 having to traverse the whole block twice.
8354 The usage pass proceeds as follows. Let max= be the max operation in the
8355 HowUsed lattice, hence
8357 X max= Y means X = max(X, Y)
8359 then
8361 for t in original tmps . useEnv[t] = HuUnU
8363 for t used in the block's .next field
8364 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8366 for st iterating *backwards* in the block
8368 match st
8370 case "t1 = load(t2)" // case 1
8371 useEnv[t2] max= HuPCa
8373 case "t1 = add(t2, t3)" // case 2
8374 useEnv[t2] max= useEnv[t1]
8375 useEnv[t3] max= useEnv[t1]
8377 other
8378 for t in st.usedTmps // case 3
8379 useEnv[t] max= HuOth
8380 // same as useEnv[t] = HuOth
8382 The general idea is that we accumulate, in useEnv[], information about
8383 how each tmp is used. That can be updated as we work further back
8384 through the block and find more uses of it, but its HowUsed value can
8385 only ascend the lattice, not descend.
8387 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8388 be used as a memory address, then its use is at least HuPCa. The point
8389 is that for a memory address we will add instrumentation to check if any
8390 bit of the address is undefined, which means that we won't need expensive
8391 V-bit propagation through an add expression that computed the address --
8392 cheap add instrumentation will be equivalent.
8394 Note in case (1) that if we have previously seen a non-memory-address use
8395 of the tmp, then its use will already be HuOth and will be unchanged by
8396 the max= operation. And if it turns out that the source of the tmp was
8397 an add, then we'll have to expensively instrument the add, because we
8398 can't prove that, for the previous non-memory-address use of the tmp,
8399 cheap and expensive instrumentation will be equivalent.
8401 In case 2, we propagate the usage-mode of the result of an add back
8402 through to its operands. Again, we use max= so as to take account of the
8403 fact that t2 or t3 might later in the block (viz, earlier in the
8404 iteration) have been used in a way that requires expensive add
8405 instrumentation.
8407 In case 3, we deal with all other tmp uses. We assume that we'll need a
8408 result that is as accurate as possible, so we max= HuOth into its use
8409 mode. Since HuOth is the top of the lattice, that's equivalent to just
8410 setting its use to HuOth.
8412 The net result of all this is that:
8414 tmps that are used either
8415 - only as a memory address, or
8416 - only as part of a tree of adds that computes a memory address,
8417 and has no other use
8418 are marked as HuPCa, and so we can instrument their generating Add
8419 nodes cheaply, which is the whole point of this analysis
8421 tmps that are used any other way at all are marked as HuOth
8423 tmps that are unused are marked as HuUnU. We don't expect to see any
8424 since we expect that the incoming IR has had all dead assignments
8425 removed by previous optimisation passes. Nevertheless the analysis is
8426 correct even in the presence of dead tmps.
8428 A final comment on dead tmps. In case 1 and case 2, we could actually
8429 conditionalise the updates thusly:
8431 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8433 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8434 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8436 In other words, if the assigned-to tmp |t1| is never used, then there's
8437 no point in propagating any use through to its operands. That won't
8438 change the final HuPCa-vs-HuOth results, which is what we care about.
8439 Given that we expect to get dead-code-free inputs, there's no point in
8440 adding this extra refinement.
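/* Worked example (added comment, not from the original source):

      t3 = Add64(t1,t2)
      t4 = LDle:I64(t3)
      PUT(16) = t4

   Scanning backwards: the PUT marks t4 as HuOth (case 3); the load
   marks t3 as HuPCa (case 1); the Add64 then propagates t3's use to
   t1 and t2 (case 2).  So t1, t2 and t3 all end up as HuPCa, and the
   Add64 that defines t3 qualifies for the cheap instrumentation. */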
8443 /* Helper for |preInstrumentationAnalysis|. */
8444 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8445 UInt tyenvUsed,
8446 HowUsed newUse, IRAtom* at )
8448 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8449 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8450 use info. */
8451 switch (at->tag) {
8452 case Iex_GSPTR:
8453 case Iex_VECRET:
8454 case Iex_Const:
8455 return;
8456 case Iex_RdTmp: {
8457 IRTemp t = at->Iex.RdTmp.tmp;
8458 tl_assert(t < tyenvUsed); // "is an original tmp"
8459 // The "max" operation in the lattice
8460 if (newUse > useEnv[t]) useEnv[t] = newUse;
8461 return;
8463 default:
8464 // We should never get here -- it implies non-flat IR
8465 ppIRExpr(at);
8466 VG_(tool_panic)("noteTmpUsesIn");
8468 /*NOTREACHED*/
8469 tl_assert(0);
8473 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8474 /*OUT*/Bool* hasBogusLiteralsP,
8475 const IRSB* sb_in )
8477 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8479 // We've seen no bogus literals so far.
8480 Bool bogus = False;
8482 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8483 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8484 nOrigTmps, sizeof(HowUsed));
8486 // Firstly, roll in contributions from the final dst address.
8487 bogus = isBogusAtom(sb_in->next);
8488 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8490 // Now work backwards through the stmts.
8491 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8492 IRStmt* st = sb_in->stmts[i];
8494 // Deal with literals.
8495 if (LIKELY(!bogus)) {
8496 bogus = containsBogusLiterals(st);
8499 // Deal with tmp uses.
8500 switch (st->tag) {
8501 case Ist_WrTmp: {
8502 IRTemp dst = st->Ist.WrTmp.tmp;
8503 IRExpr* rhs = st->Ist.WrTmp.data;
8504 // This is the one place where we have to consider all possible
8505 // tags for |rhs|, and can't just assume it is a tmp or a const.
8506 switch (rhs->tag) {
8507 case Iex_RdTmp:
8508 // just propagate demand for |dst| into this tmp use.
8509 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8510 break;
8511 case Iex_Unop:
8512 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8513 break;
8514 case Iex_Binop:
8515 if (rhs->Iex.Binop.op == Iop_Add64
8516 || rhs->Iex.Binop.op == Iop_Add32) {
8517 // propagate demand for |dst| through to the operands.
8518 noteTmpUsesIn(useEnv, nOrigTmps,
8519 useEnv[dst], rhs->Iex.Binop.arg1);
8520 noteTmpUsesIn(useEnv, nOrigTmps,
8521 useEnv[dst], rhs->Iex.Binop.arg2);
8522 } else {
8523 // just say that the operands are used in some unknown way.
8524 noteTmpUsesIn(useEnv, nOrigTmps,
8525 HuOth, rhs->Iex.Binop.arg1);
8526 noteTmpUsesIn(useEnv, nOrigTmps,
8527 HuOth, rhs->Iex.Binop.arg2);
8529 break;
8530 case Iex_Triop: {
8531 // All operands are used in some unknown way.
8532 IRTriop* tri = rhs->Iex.Triop.details;
8533 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8534 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8535 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8536 break;
8538 case Iex_Qop: {
8539 // All operands are used in some unknown way.
8540 IRQop* qop = rhs->Iex.Qop.details;
8541 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8542 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8543 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8544 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8545 break;
8547 case Iex_Load:
8548 // The address will be checked (== PCasted).
8549 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8550 break;
8551 case Iex_ITE:
8552 // The condition is PCasted, the then- and else-values
8553 // aren't.
8554 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8555 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8556 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8557 break;
8558 case Iex_CCall:
8559 // The args are used in unknown ways.
8560 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8561 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8563 break;
8564 case Iex_GetI: {
8565 // The index will be checked/PCasted (see do_shadow_GETI)
8566 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8567 break;
8569 case Iex_Const:
8570 case Iex_Get:
8571 break;
8572 default:
8573 ppIRExpr(rhs);
8574 VG_(tool_panic)("preInstrumentationAnalysis:"
8575 " unhandled IRExpr");
8577 break;
8579 case Ist_Store:
8580 // The address will be checked (== PCasted). The data will be
8581 // used in some unknown way.
8582 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8583 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8584 break;
8585 case Ist_Exit:
8586 // The guard will be checked (== PCasted)
8587 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8588 break;
8589 case Ist_Put:
8590 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8591 break;
8592 case Ist_PutI: {
8593 IRPutI* putI = st->Ist.PutI.details;
8594 // The index will be checked/PCasted (see do_shadow_PUTI). The
8595 // data will be used in an unknown way.
8596 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8597 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8598 break;
8600 case Ist_Dirty: {
8601 IRDirty* d = st->Ist.Dirty.details;
8602 // The guard will be checked (== PCasted)
8603 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8604 // The args will be used in unknown ways.
8605 for (IRExpr** args = d->args; *args; args++) {
8606 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8608 break;
8610 case Ist_CAS: {
8611 IRCAS* cas = st->Ist.CAS.details;
8612 // Address will be pcasted, everything else used as unknown
8613 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8614 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8615 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8616 if (cas->expdHi)
8617 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8618 if (cas->dataHi)
8619 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8620 break;
8622 case Ist_AbiHint:
8623 // Both exprs are used in unknown ways. TODO: can we safely
8624 // just ignore AbiHints?
8625 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8626 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8627 break;
8628 case Ist_StoreG: {
8629 // We might be able to do better, and use HuPCa for the addr.
8630 // It's not immediately obvious that we can, because the address
8631 // is regarded as "used" only when the guard is true.
8632 IRStoreG* sg = st->Ist.StoreG.details;
8633 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8634 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8635 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8636 break;
8638 case Ist_LoadG: {
8639 // Per similar comments to Ist_StoreG .. not sure whether this
8640 // is really optimal.
8641 IRLoadG* lg = st->Ist.LoadG.details;
8642 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8643 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8644 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8645 break;
8647 case Ist_LLSC: {
8648 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8649 if (st->Ist.LLSC.storedata)
8650 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8651 break;
8653 case Ist_MBE:
8654 case Ist_IMark:
8655 case Ist_NoOp:
8656 break;
8657 default: {
8658 ppIRStmt(st);
8659 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8662 } // Now work backwards through the stmts.
8664 // Return the computed use env and the bogus-atom flag.
8665 tl_assert(*useEnvP == NULL);
8666 *useEnvP = useEnv;
8668 tl_assert(*hasBogusLiteralsP == False);
8669 *hasBogusLiteralsP = bogus;
8673 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8674 IRSB* sb_in,
8675 const VexGuestLayout* layout,
8676 const VexGuestExtents* vge,
8677 const VexArchInfo* archinfo_host,
8678 IRType gWordTy, IRType hWordTy )
8680 Bool verboze = 0||False;
8681 Int i, j, first_stmt;
8682 IRStmt* st;
8683 MCEnv mce;
8684 IRSB* sb_out;
8686 if (gWordTy != hWordTy) {
8687 /* We don't currently support this case. */
8688 VG_(tool_panic)("host/guest word size mismatch");
8691 /* Check we're not completely nuts */
8692 tl_assert(sizeof(UWord) == sizeof(void*));
8693 tl_assert(sizeof(Word) == sizeof(void*));
8694 tl_assert(sizeof(Addr) == sizeof(void*));
8695 tl_assert(sizeof(ULong) == 8);
8696 tl_assert(sizeof(Long) == 8);
8697 tl_assert(sizeof(UInt) == 4);
8698 tl_assert(sizeof(Int) == 4);
8700 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8702 /* Set up SB */
8703 sb_out = deepCopyIRSBExceptStmts(sb_in);
8705 /* Set up the running environment. Both .sb and .tmpMap are
8706 modified as we go along. Note that tmps are added to both
8707 .sb->tyenv and .tmpMap together, so the valid index-set for
8708 those two arrays should always be identical. */
8709 VG_(memset)(&mce, 0, sizeof(mce));
8710 mce.sb = sb_out;
8711 mce.trace = verboze;
8712 mce.layout = layout;
8713 mce.hWordTy = hWordTy;
8714 mce.tmpHowUsed = NULL;
8716 /* BEGIN decide on expense levels for instrumentation. */
8718 /* Initially, select the cheap version of everything for which we have an
8719 option. */
8720 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8722 /* Take account of the --expensive-definedness-checks= flag. */
8723 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8724 /* We just selected 'cheap for everything', so we don't need to do
8725 anything here. mce.tmpHowUsed remains NULL. */
8727 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8728 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8729 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8731 else {
8732 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
      /* We'll make our own selection, based on known per-target constraints
         and also on analysis of the block to be instrumented.  First, set
         up default values for detail levels.

         On x86 and amd64, we'll routinely encounter code optimised by LLVM
         5 and above.  Enable accurate interpretation of the following.
         LLVM uses adds for some bitfield inserts, and we get a lot of false
         errors if the cheap interpretation is used, alas.  Could solve this
         much better if we knew which of such adds came from x86/amd64 LEA
         instructions, since these are the only ones really needing the
         expensive interpretation, but that would require some way to tag
         them in the _toIR.c front ends, which is a lot of faffing around.
         So for now we use preInstrumentationAnalysis() to detect adds which
         are used only to construct memory addresses, which is an
         approximation to the above, and is self-contained. */
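
      /* Illustrative example (hypothetical IR, not taken from any real
         block): with dl_Add64 == DLauto, an add whose result is used only
         as a memory address, e.g.

            t3 = Add64(t1,0x38:I64)
            t4 = LDle:I64(t3)

         is classified HuPCa by preInstrumentationAnalysis() below and so
         gets the cheap shadow computation, whereas an add whose result is
         used as ordinary data stays on the exact (expensive)
         interpretation. */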
#     if defined(VGA_x86)
      mce.dlbo.dl_Add32           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     elif defined(VGA_amd64)
      mce.dlbo.dl_Add32           = DLexpensive;
      mce.dlbo.dl_Add64           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_ppc64le)
      // Needed by (at least) set_AV_CR6() in the front end.
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm64)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     endif
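
      /* Any target not listed above simply keeps the DLcheap defaults
         selected earlier (still subject to the bogus-literals override
         below). */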

      /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
         fill it in. */
      Bool hasBogusLiterals = False;
      preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
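
      /* (Roughly: "bogus literals" are suspicious magic constants of the
         kind typically produced by hand-optimised, word-at-a-time string
         and memory routines.  Their presence suggests the cheap
         interpretations would generate false undefined-value errors, hence
         the switch to DLexpensive below.) */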

      if (hasBogusLiterals) {
         /* This happens very rarely.  In this case just select expensive
            for everything, and throw away the tmp-use analysis results. */
         DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
         VG_(free)( mce.tmpHowUsed );
         mce.tmpHowUsed = NULL;
      } else {
         /* Nothing.  mce.tmpHowUsed contains tmp-use analysis results,
            which will be used for some subset of Iop_{Add,Sub}{32,64},
            based on which ones are set to DLauto for this target. */
      }
   }

   DetailLevelByOp__check_sanity( &mce.dlbo );

   if (0) {
      // Debug printing: which tmps have been identified as PCast-only use
      if (mce.tmpHowUsed) {
         VG_(printf)("Cheapies: ");
         for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
            if (mce.tmpHowUsed[q] == HuPCa) {
               VG_(printf)("t%u ", q);
            }
         }
         VG_(printf)("\n");
      }

      // Debug printing: number of ops by detail level
      UChar nCheap     = DetailLevelByOp__count( &mce.dlbo, DLcheap );
      UChar nAuto      = DetailLevelByOp__count( &mce.dlbo, DLauto );
      UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
      tl_assert(nCheap + nAuto + nExpensive == 8);

      VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
   }

   /* END decide on expense levels for instrumentation. */

   /* Initialise the running tmp environment. */

   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
                            sizeof(TempMapEnt));
   VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   for (i = 0; i < sb_in->tyenv->types_used; i++) {
      TempMapEnt ent;
      ent.kind    = Orig;
      ent.shadowV = IRTemp_INVALID;
      ent.shadowB = IRTemp_INVALID;
      VG_(addToXA)( mce.tmpMap, &ent );
   }
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
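
   /* Each original tmp starts with no shadows; findShadowTmpV and
      findShadowTmpB create the V (definedness) and B (origin) shadow tmps
      on first use and record them in this map, alongside matching entries
      in sb_out->tyenv. */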

   /* Finally, begin instrumentation. */
   /* Copy verbatim any IR preamble preceding the first IMark */

   tl_assert(mce.sb == sb_out);
   tl_assert(mce.sb != sb_in);

   i = 0;
   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {

      st = sb_in->stmts[i];
      tl_assert(st);
      tl_assert(isFlatIRStmt(st));

      stmt( 'C', &mce, sb_in->stmts[i] );
      i++;
   }
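
   /* (The 'C' tag marks statements copied unchanged from sb_in; 'V' and 'B'
      tag generated V-bit and origin-tracking shadow statements
      respectively.  The tags only affect trace output when mce.trace is
      set.) */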

   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
      cause the IR following the preamble to contain references to IR
      temporaries defined in the preamble.  Because the preamble isn't
      instrumented, these temporaries don't have any shadows.
      Nevertheless uses of them following the preamble will cause
      memcheck to generate references to their shadows.  End effect is
      to cause IR sanity check failures, due to references to
      non-existent shadows.  This is only evident for the complex
      preambles used for function wrapping on TOC-afflicted platforms
      (ppc64-linux).

      The following loop therefore scans the preamble looking for
      assignments to temporaries.  For each one found it creates an
      assignment to the corresponding (V) shadow temp, marking it as
      'defined'.  This is the same resulting IR as if the main
      instrumentation loop below had been applied to the statement
      'tmp = CONSTANT'.

      Similarly, if origin tracking is enabled, we must generate an
      assignment for the corresponding origin (B) shadow, claiming
      no-origin, as appropriate for a defined value.
   */
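
   /* Illustrative example (hypothetical tmp number): if the preamble
      assigns to t5, the loop below gives t5's V shadow an all-zeroes
      (all-defined) constant of the same type and, when
      MC_(clo_mc_level) == 3, sets its B (origin) shadow to 0:I32 (no
      origin) -- exactly what instrumenting 't5 = CONSTANT' would
      produce. */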
   for (j = 0; j < i; j++) {
      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
         /* findShadowTmpV checks its arg is an original tmp;
            no need to assert that here. */
         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
         if (MC_(clo_mc_level) == 3) {
            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
         }
         if (0) {
            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
            ppIRType( ty_v );
            VG_(printf)("\n");
         }
      }
   }

   /* Iterate over the remaining stmts to generate instrumentation. */

   tl_assert(sb_in->stmts_used > 0);
   tl_assert(i >= 0);
   tl_assert(i < sb_in->stmts_used);
   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);

   for (/* use current i*/; i < sb_in->stmts_used; i++) {

      st = sb_in->stmts[i];
      first_stmt = sb_out->stmts_used;

      if (verboze) {
         VG_(printf)("\n");
         ppIRStmt(st);
         VG_(printf)("\n");
      }

      if (MC_(clo_mc_level) == 3) {
         /* See comments on case Ist_CAS below. */
         if (st->tag != Ist_CAS)
            schemeS( &mce, st );
      }
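
      /* (schemeS is the origin-tracking counterpart of the V-bit handling
         in the switch below: it emits the B-shadow updates for this
         statement when --track-origins=yes is in force.) */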

      /* Generate instrumentation code for each stmt ... */

      switch (st->tag) {

         case Ist_WrTmp: {
            IRTemp dst = st->Ist.WrTmp.tmp;
            tl_assert(dst < (UInt)sb_in->tyenv->types_used);
            HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
                                        : HuOth/*we don't know, so play safe*/;
            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
                    expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
            break;
         }
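
         /* (Roughly: the HowUsed value matters only for operations whose
            detail level is DLauto.  HuPCa -- the result is only ever used
            in PCast-style whole-value checks, e.g. as an address -- selects
            the cheap shadow computation; HuOth selects the expensive
            one.) */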

         case Ist_Put:
            do_shadow_PUT( &mce,
                           st->Ist.Put.offset,
                           st->Ist.Put.data,
                           NULL /* shadow atom */, NULL /* guard */ );
            break;

         case Ist_PutI:
            do_shadow_PUTI( &mce, st->Ist.PutI.details);
            break;

         case Ist_Store:
            do_shadow_Store( &mce, st->Ist.Store.end,
                             st->Ist.Store.addr, 0/* addr bias */,
                             st->Ist.Store.data,
                             NULL /* shadow data */,
                             NULL/*guard*/ );
            break;

         case Ist_StoreG:
            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
            break;

         case Ist_LoadG:
            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
            break;

         case Ist_Exit:
            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
            break;

         case Ist_IMark:
            break;

         case Ist_NoOp:
         case Ist_MBE:
            break;

         case Ist_Dirty:
            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
            break;

         case Ist_AbiHint:
            do_AbiHint( &mce, st->Ist.AbiHint.base,
                        st->Ist.AbiHint.len,
                        st->Ist.AbiHint.nia );
            break;

         case Ist_CAS:
            do_shadow_CAS( &mce, st->Ist.CAS.details );
            /* Note, do_shadow_CAS copies the CAS itself to the output
               block, because it needs to add instrumentation both
               before and after it.  Hence skip the copy below.  Also
               skip the origin-tracking stuff (call to schemeS) above,
               since that's all tangled up with it too; do_shadow_CAS
               does it all. */
            break;

         case Ist_LLSC:
            do_shadow_LLSC( &mce,
                            st->Ist.LLSC.end,
                            st->Ist.LLSC.result,
                            st->Ist.LLSC.addr,
                            st->Ist.LLSC.storedata );
            break;

         default:
            VG_(printf)("\n");
            ppIRStmt(st);
            VG_(printf)("\n");
            VG_(tool_panic)("memcheck: unhandled IRStmt");

      } /* switch (st->tag) */

      if (0 && verboze) {
         for (j = first_stmt; j < sb_out->stmts_used; j++) {
            VG_(printf)(" ");
            ppIRStmt(sb_out->stmts[j]);
            VG_(printf)("\n");
         }
         VG_(printf)("\n");
      }

      /* ... and finally copy the stmt itself to the output.  Except,
         skip the copy of IRCASs; see comments on case Ist_CAS
         above. */
      if (st->tag != Ist_CAS)
         stmt('C', &mce, st);
   }

   /* Now we need to complain if the jump target is undefined. */
   first_stmt = sb_out->stmts_used;

   if (verboze) {
      VG_(printf)("sb_in->next = ");
      ppIRExpr(sb_in->next);
      VG_(printf)("\n\n");
   }

   complainIfUndefined( &mce, sb_in->next, NULL );

   if (0 && verboze) {
      for (j = first_stmt; j < sb_out->stmts_used; j++) {
         VG_(printf)(" ");
         ppIRStmt(sb_out->stmts[j]);
         VG_(printf)("\n");
      }
      VG_(printf)("\n");
   }

   /* If this fails, there's been some serious snafu with tmp management,
      that should be investigated. */
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   VG_(deleteXA)( mce.tmpMap );

   if (mce.tmpHowUsed) {
      VG_(free)( mce.tmpHowUsed );
   }

   tl_assert(mce.sb == sb_out);
   return sb_out;
}


/*--------------------------------------------------------------------*/
/*--- end                                             mc_translate.c ---*/
/*--------------------------------------------------------------------*/