[valgrind.git] / memcheck / mc_translate.c
2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
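/* For illustration (an invented client-code fragment, not from the
   Memcheck sources), case 1 above is what fires on code like:

      int x;                 // never initialised
      if (x > 10)            // Memcheck: "Conditional jump or move
         do_something();     //  depends on uninitialised value(s)"

   The branch condition's V bits are tested by complainIfUndefined,
   which emits the complaint at run time if any are undefined. */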
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
137 struct _MCEnv;
139 // See below for comments explaining what this is for.
140 typedef
141 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
142 HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
159 typedef
160 enum {
161 // Use the cheaper, less-exact variant.
162 DLcheap=4,
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
166 DLauto,
167 // Use the more expensive, more-exact variant.
168 DLexpensive
170 DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
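/* A hypothetical usage sketch of the three helpers above (not code from
   this file; shown only to make their contract concrete):

      DetailLevelByOp dlbo;
      DetailLevelByOp__set_all(&dlbo, DLcheap);     // everything cheap
      dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;        // tighten one op
      DetailLevelByOp__check_sanity(&dlbo);         // passes: compares are
                                                    // cheap-or-expensive only
      UInt nExp = DetailLevelByOp__count(&dlbo, DLexpensive);   // == 1
*/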
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
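/* The three values form a chain HuUnU <= HuPCa <= HuOth, with HuOth as
   the safe ("don't know") top.  A hypothetical join (least upper bound)
   helper, not from this file, shown only to illustrate the ordering:

      static inline HowUsed joinHowUsed ( HowUsed a, HowUsed b ) {
         // valid because HuUnU=0, HuPCa=1, HuOth=2
         return a >= b ? a : b;
      }
*/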
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to waste space in tables
342 tracking them.
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
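/* A small illustration of the rebinding (temp numbers invented): suppose
   original t5 is currently shadowed by V-temp t12.  After a definedness
   test on t5 its shadow must become 'all defined', but assigning to t12
   again would break SSA.  Instead:

      t17 = all-zeroes (defined)          -- a fresh V-temp
      tmpMap[t5].shadowV : t12 --> t17    -- rebind the shadow

   and all subsequent reads of t5's shadow pick up t17. */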
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256).
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive than their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
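/* Worked example for mkRight32 (value invented): a single undefined bit
   is smeared rightwards, down to bit 0.

      a1 = 0x00100000          (only bit 20 undefined)
      after i=1  : 0x00180000
      after i=2  : 0x001E0000
      after i=4  : 0x001FE000
      after i=8  : 0x001FFFE0
      after i=16 : 0x001FFFFF  (bits 20..0 all marked undefined)
*/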
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
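/* Worked example for the AND case (8-bit, values invented): a result bit
   of x & y is certainly 0 -- hence certainly defined -- wherever either
   operand has a *defined 0* bit, whatever the other operand holds.

      data  = 0xF0    (low nibble of the operand is 0)
      vbits = 0xF0    (low nibble defined, high nibble undefined)
      mkImproveAND8: data | vbits = 0xF0
                      -- the 0s in the low nibble say "these result bits
                         may be treated as defined"

   Elsewhere in this file the improvement term is combined, via DifD,
   with the ordinary UifU-based term, so it can only make bits more
   defined, never less. */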
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
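/* For example, with 8-bit vbits (values invented):

      vbits = 0xFF : (0xFF - 0x7F) >>s 7 = 0x80 >>s 7 = 0xFF
                     (no bit is defined, so nothing is claimed)
      vbits = 0xFE : (0xFE - 0x7F) >>s 7 = 0x7F >>s 7 = 0x00
                     (bit 0 is defined, so all bits are claimed defined)
*/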
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144 VG_(tool_panic)("mkOCastTo");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec)
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast), the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
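/* Worked 4-bit example (values invented): let xx = 1000 with
   vxx = 0001 (only bit 0 undefined), and yy = 0000 fully defined
   (vyy = 0000).  The top bits are defined and differ, so the
   comparison is knowable:

      naive = vxx `UifU` vyy            = 0001
      vec   = naive `Or` ~(xx ^ yy)     = 0001 | 0111 = 0111
      OCast(vec)                        = 0000          (vec != 1111)
      improved = naive `DifD` OCast     = 0000
      PCastTo<1>(improved)              = 0             (defined)

   whereas the naive scheme alone, PCastTo<1>(0001), would have
   reported the comparison result as undefined. */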
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
1290 /* Check if we can know, despite the uncertain bits, that xx is greater than yy.
1291 Notice that it's xx > yy and not the other way around. This is Intel syntax
1292 with destination first. It will appear reversed in gdb disassembly (AT&T
1293 syntax).
1295 static IRAtom* expensiveCmpGT ( MCEnv* mce,
1296 IROp opGT,
1297 IRAtom* vxx, IRAtom* vyy,
1298 IRAtom* xx, IRAtom* yy )
1300 IROp opAND, opOR, opXOR, opNOT, opSHL;
1301 IRType ty;
1302 unsigned int word_size;
1303 Bool is_signed;
1305 tl_assert(isShadowAtom(mce,vxx));
1306 tl_assert(isShadowAtom(mce,vyy));
1307 tl_assert(isOriginalAtom(mce,xx));
1308 tl_assert(isOriginalAtom(mce,yy));
1309 tl_assert(sameKindedAtoms(vxx,xx));
1310 tl_assert(sameKindedAtoms(vyy,yy));
1312 switch (opGT) {
1313 case Iop_CmpGT64Sx2:
1314 case Iop_CmpGT64Ux2:
1315 opSHL = Iop_ShlN64x2;
1316 word_size = 64;
1317 break;
1318 case Iop_CmpGT32Sx4:
1319 case Iop_CmpGT32Ux4:
1320 opSHL = Iop_ShlN32x4;
1321 word_size = 32;
1322 break;
1323 case Iop_CmpGT16Sx8:
1324 case Iop_CmpGT16Ux8:
1325 opSHL = Iop_ShlN16x8;
1326 word_size = 16;
1327 break;
1328 case Iop_CmpGT8Sx16:
1329 case Iop_CmpGT8Ux16:
1330 opSHL = Iop_ShlN8x16;
1331 word_size = 8;
1332 break;
1333 default:
1334 VG_(tool_panic)("expensiveCmpGT");
1337 switch (opGT) {
1338 case Iop_CmpGT64Sx2:
1339 case Iop_CmpGT32Sx4:
1340 case Iop_CmpGT16Sx8:
1341 case Iop_CmpGT8Sx16:
1342 is_signed = True;
1343 break;
1344 case Iop_CmpGT64Ux2:
1345 case Iop_CmpGT32Ux4:
1346 case Iop_CmpGT16Ux8:
1347 case Iop_CmpGT8Ux16:
1348 is_signed = False;
1349 break;
1350 default:
1351 VG_(tool_panic)("expensiveCmpGT");
1354 ty = Ity_V128;
1355 opAND = Iop_AndV128;
1356 opOR = Iop_OrV128;
1357 opXOR = Iop_XorV128;
1358 opNOT = Iop_NotV128;
1360 IRAtom *MSBs;
1361 if (is_signed) {
1362 // For unsigned it's easy to make the min and max: Just set the unknown
1363 // bits all to 0s or 1s. For signed it's harder because having a 1 in the
1364 // MSB makes a number smaller, not larger! We can work around this by
1365 // flipping the MSB before and after computing the min and max values.
1366 IRAtom *all_ones = mkV128(0xffff);
1367 MSBs = assignNew('V', mce, ty, binop(opSHL, all_ones, mkU8(word_size-1)));
1368 xx = assignNew('V', mce, ty, binop(opXOR, xx, MSBs));
1369 yy = assignNew('V', mce, ty, binop(opXOR, yy, MSBs));
1370 // From here on out, we're dealing with MSB-flipped integers.
1372 // We can combine xx and vxx to create two values: the largest that xx could
1373 // possibly be and the smallest that xx could possibly be. Likewise, we can
1374 // do the same for yy. We'll call those max_xx and min_xx and max_yy and
1375 // min_yy.
1376 IRAtom *not_vxx = assignNew('V', mce, ty, unop(opNOT, vxx));
1377 IRAtom *not_vyy = assignNew('V', mce, ty, unop(opNOT, vyy));
1378 IRAtom *max_xx = assignNew('V', mce, ty, binop(opOR, xx, vxx));
1379 IRAtom *min_xx = assignNew('V', mce, ty, binop(opAND, xx, not_vxx));
1380 IRAtom *max_yy = assignNew('V', mce, ty, binop(opOR, yy, vyy));
1381 IRAtom *min_yy = assignNew('V', mce, ty, binop(opAND, yy, not_vyy));
1382 if (is_signed) {
1383 // Unflip the MSBs.
1384 max_xx = assignNew('V', mce, ty, binop(opXOR, max_xx, MSBs));
1385 min_xx = assignNew('V', mce, ty, binop(opXOR, min_xx, MSBs));
1386 max_yy = assignNew('V', mce, ty, binop(opXOR, max_yy, MSBs));
1387 min_yy = assignNew('V', mce, ty, binop(opXOR, min_yy, MSBs));
1389 IRAtom *min_xx_gt_max_yy = assignNew('V', mce, ty, binop(opGT, min_xx, max_yy));
1390 IRAtom *max_xx_gt_min_yy = assignNew('V', mce, ty, binop(opGT, max_xx, min_yy));
1391 // If min_xx is greater than max_yy then xx is surely greater than yy so we know
1392 // our answer for sure. If max_xx is not greater than min_yy then xx can't
1393 possibly be greater than yy so again we know the answer for sure. For all
1394 // other cases, we can't know.
1396 // So the result is defined if:
1398 // min_xx_gt_max_yy | ~max_xx_gt_min_yy
1400 // Because defined in vbits is 0s and not 1s, we need to invert that:
1402 // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy)
1404 // We can use DeMorgan's Law to simplify the above:
1406 // ~min_xx_gt_max_yy & max_xx_gt_min_yy
1407 IRAtom *not_min_xx_gt_max_yy = assignNew('V', mce, ty, unop(opNOT, min_xx_gt_max_yy));
1408 return assignNew('V', mce, ty, binop(opAND, not_min_xx_gt_max_yy, max_xx_gt_min_yy));
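/* Worked example for one unsigned 8-bit lane (values invented): let
   xx = 0x9A with vxx = 0x0F (low four bits undefined) and yy = 0x40
   fully defined.  Then

      min_xx = xx & ~vxx = 0x90        max_xx = xx | vxx = 0x9F
      min_yy = max_yy = 0x40

   Since min_xx (0x90) > max_yy (0x40), every value xx could take
   exceeds yy, so the lane's CmpGT result is knowable: the lane's
   vbits become ~min_xx_gt_max_yy & max_xx_gt_min_yy
   = ~(all 1s) & (all 1s) = 0, i.e. defined. */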
1411 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1413 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1415 CmpORD32S(x,y) = 1<<3 if x <s y
1416 = 1<<2 if x >s y
1417 = 1<<1 if x == y
1419 and similarly the unsigned variant. The default interpretation is:
1421 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1422 & (7<<1)
1424 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1425 are zero and therefore defined (viz, zero).
1427 Also deal with a special case better:
1429 CmpORD32S(x,0)
1431 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1432 will be defined even if the rest of x isn't. In which case we do:
1434 CmpORD32S#(x,x#,0,{impliedly 0}#)
1435 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1436 | (x# >>u 31) << 3 -- LT# = x#[31]
1438 Analogous handling for CmpORD64{S,U}.
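/* Worked example of the special case (values invented): CmpORD32S(x, 0)
   with x# = 0x7FFFFFFF, i.e. only bit 31 of x is defined.  The code
   below produces, for the result's shadow:

      LT# (bit 3): (x# >>u 31) << 3   = 0        -- defined: x's sign
                                                    bit is known
      GT# (bit 2): PCast(x#) & (1<<2) = 1<<2     -- undefined
      EQ# (bit 1): via expensiveCmpEQorNE        -- defined iff some
                                                    defined bit of x is 1

   so "x <s 0" is always answerable here even though "x >s 0" is not. */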
1440 static Bool isZeroU32 ( IRAtom* e )
1442 return
1443 toBool( e->tag == Iex_Const
1444 && e->Iex.Const.con->tag == Ico_U32
1445 && e->Iex.Const.con->Ico.U32 == 0 );
1448 static Bool isZeroU64 ( IRAtom* e )
1450 return
1451 toBool( e->tag == Iex_Const
1452 && e->Iex.Const.con->tag == Ico_U64
1453 && e->Iex.Const.con->Ico.U64 == 0 );
1456 static IRAtom* doCmpORD ( MCEnv* mce,
1457 IROp cmp_op,
1458 IRAtom* xxhash, IRAtom* yyhash,
1459 IRAtom* xx, IRAtom* yy )
1461 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1462 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1463 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1464 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1465 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1466 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1467 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1468 IRType ty = m64 ? Ity_I64 : Ity_I32;
1469 Int width = m64 ? 64 : 32;
1471 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1473 tl_assert(isShadowAtom(mce,xxhash));
1474 tl_assert(isShadowAtom(mce,yyhash));
1475 tl_assert(isOriginalAtom(mce,xx));
1476 tl_assert(isOriginalAtom(mce,yy));
1477 tl_assert(sameKindedAtoms(xxhash,xx));
1478 tl_assert(sameKindedAtoms(yyhash,yy));
1479 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1480 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1482 if (0) {
1483 ppIROp(cmp_op); VG_(printf)(" ");
1484 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1487 if (syned && isZero(yy)) {
1488 /* fancy interpretation */
1489 /* if yy is zero, then it must be fully defined (zero#). */
1490 tl_assert(isZero(yyhash));
1491 // This is still inaccurate, but I don't think it matters, since
1492 // nobody writes code of the form
1493 // "is <partially-undefined-value> signedly greater than zero?".
1494 // We therefore simply declare "x >s 0" to be undefined if any bit in
1495 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1496 // the highest order bit is a defined 1 then x is negative so it
1497 // doesn't matter whether the remaining bits are defined or not.
1498 IRAtom* t_0_gt_0_0
1499 = assignNew(
1500 'V', mce,ty,
1501 binop(
1502 opAND,
1503 mkPCastTo(mce,ty, xxhash),
1504 m64 ? mkU64(1<<2) : mkU32(1<<2)
1506 // For "x <s 0", we can just copy the definedness of the top bit of x
1507 // and we have a precise result.
1508 IRAtom* t_lt_0_0_0
1509 = assignNew(
1510 'V', mce,ty,
1511 binop(
1512 opSHL,
1513 assignNew(
1514 'V', mce,ty,
1515 binop(opSHR, xxhash, mkU8(width-1))),
1516 mkU8(3)
1518 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1519 IRAtom* t_0_0_eq_0
1520 = assignNew(
1521 'V', mce,ty,
1522 binop(
1523 opSHL,
1524 assignNew('V', mce,ty,
1525 unop(
1526 op1UtoWS,
1527 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1529 mkU8(1)
1531 return
1532 binop(
1533 opOR,
1534 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1535 t_0_0_eq_0
1537 } else {
1538 /* standard interpretation */
1539 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1540 return
1541 binop(
1542 opAND,
1543 mkPCastTo( mce,ty,
1544 mkUifU(mce,ty, xxhash,yyhash)),
1545 sevenLeft1
1551 /*------------------------------------------------------------*/
1552 /*--- Emit a test and complaint if something is undefined. ---*/
1553 /*------------------------------------------------------------*/
1555 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1558 /* Set the annotations on a dirty helper to indicate that the stack
1559 pointer and instruction pointers might be read. This is the
1560 behaviour of all 'emit-a-complaint' style functions we might
1561 call. */
1563 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1564 di->nFxState = 2;
1565 di->fxState[0].fx = Ifx_Read;
1566 di->fxState[0].offset = mce->layout->offset_SP;
1567 di->fxState[0].size = mce->layout->sizeof_SP;
1568 di->fxState[0].nRepeats = 0;
1569 di->fxState[0].repeatLen = 0;
1570 di->fxState[1].fx = Ifx_Read;
1571 di->fxState[1].offset = mce->layout->offset_IP;
1572 di->fxState[1].size = mce->layout->sizeof_IP;
1573 di->fxState[1].nRepeats = 0;
1574 di->fxState[1].repeatLen = 0;
1578 /* Check the supplied *original* |atom| for undefinedness, and emit a
1579 complaint if so. Once that happens, mark it as defined. This is
1580 possible because the atom is either a tmp or literal. If it's a
1581 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1582 be defined. In fact as mentioned above, we will have to allocate a
1583 new tmp to carry the new 'defined' shadow value, and update the
1584 original->tmp mapping accordingly; we cannot simply assign a new
1585 value to an existing shadow tmp as this breaks SSAness.
1587 The checks are performed, any resulting complaint emitted, and
1588 |atom|'s shadow temp set to 'defined', ONLY in the case that
1589 |guard| evaluates to True at run-time. If it evaluates to False
1590 then no action is performed. If |guard| is NULL (the usual case)
1591 then it is assumed to be always-true, and hence these actions are
1592 performed unconditionally.
1594 This routine does not generate code to check the definedness of
1595 |guard|. The caller is assumed to have taken care of that already.
1597 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1599 IRAtom* vatom;
1600 IRType ty;
1601 Int sz;
1602 IRDirty* di;
1603 IRAtom* cond;
1604 IRAtom* origin;
1605 void* fn;
1606 const HChar* nm;
1607 IRExpr** args;
1608 Int nargs;
1610 // Don't do V bit tests if we're not reporting undefined value errors.
1611 if (MC_(clo_mc_level) == 1)
1612 return;
1614 if (guard)
1615 tl_assert(isOriginalAtom(mce, guard));
1617 /* Since the original expression is atomic, there's no duplicated
1618 work generated by making multiple V-expressions for it. So we
1619 don't really care about the possibility that someone else may
1620 also create a V-interpretation for it. */
1621 tl_assert(isOriginalAtom(mce, atom));
1622 vatom = expr2vbits( mce, atom, HuOth );
1623 tl_assert(isShadowAtom(mce, vatom));
1624 tl_assert(sameKindedAtoms(atom, vatom));
1626 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1628 /* sz is only used for constructing the error message */
1629 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1631 cond = mkPCastTo( mce, Ity_I1, vatom );
1632 /* cond will be 0 if all defined, and 1 if any not defined. */
1634 /* Get the origin info for the value we are about to check. At
1635 least, if we are doing origin tracking. If not, use a dummy
1636 zero origin. */
1637 if (MC_(clo_mc_level) == 3) {
1638 origin = schemeE( mce, atom );
1639 if (mce->hWordTy == Ity_I64) {
1640 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1642 } else {
1643 origin = NULL;
1646 fn = NULL;
1647 nm = NULL;
1648 args = NULL;
1649 nargs = -1;
1651 switch (sz) {
1652 case 0:
1653 if (origin) {
1654 fn = &MC_(helperc_value_check0_fail_w_o);
1655 nm = "MC_(helperc_value_check0_fail_w_o)";
1656 args = mkIRExprVec_1(origin);
1657 nargs = 1;
1658 } else {
1659 fn = &MC_(helperc_value_check0_fail_no_o);
1660 nm = "MC_(helperc_value_check0_fail_no_o)";
1661 args = mkIRExprVec_0();
1662 nargs = 0;
1664 break;
1665 case 1:
1666 if (origin) {
1667 fn = &MC_(helperc_value_check1_fail_w_o);
1668 nm = "MC_(helperc_value_check1_fail_w_o)";
1669 args = mkIRExprVec_1(origin);
1670 nargs = 1;
1671 } else {
1672 fn = &MC_(helperc_value_check1_fail_no_o);
1673 nm = "MC_(helperc_value_check1_fail_no_o)";
1674 args = mkIRExprVec_0();
1675 nargs = 0;
1677 break;
1678 case 4:
1679 if (origin) {
1680 fn = &MC_(helperc_value_check4_fail_w_o);
1681 nm = "MC_(helperc_value_check4_fail_w_o)";
1682 args = mkIRExprVec_1(origin);
1683 nargs = 1;
1684 } else {
1685 fn = &MC_(helperc_value_check4_fail_no_o);
1686 nm = "MC_(helperc_value_check4_fail_no_o)";
1687 args = mkIRExprVec_0();
1688 nargs = 0;
1690 break;
1691 case 8:
1692 if (origin) {
1693 fn = &MC_(helperc_value_check8_fail_w_o);
1694 nm = "MC_(helperc_value_check8_fail_w_o)";
1695 args = mkIRExprVec_1(origin);
1696 nargs = 1;
1697 } else {
1698 fn = &MC_(helperc_value_check8_fail_no_o);
1699 nm = "MC_(helperc_value_check8_fail_no_o)";
1700 args = mkIRExprVec_0();
1701 nargs = 0;
1703 break;
1704 case 2:
1705 case 16:
1706 if (origin) {
1707 fn = &MC_(helperc_value_checkN_fail_w_o);
1708 nm = "MC_(helperc_value_checkN_fail_w_o)";
1709 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1710 nargs = 2;
1711 } else {
1712 fn = &MC_(helperc_value_checkN_fail_no_o);
1713 nm = "MC_(helperc_value_checkN_fail_no_o)";
1714 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1715 nargs = 1;
1717 break;
1718 default:
1719 VG_(tool_panic)("unexpected szB");
1722 tl_assert(fn);
1723 tl_assert(nm);
1724 tl_assert(args);
1725 tl_assert(nargs >= 0 && nargs <= 2);
1726 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1727 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1729 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1730 VG_(fnptr_to_fnentry)( fn ), args );
1731 di->guard = cond; // and cond is PCast-to-1(atom#)
1733 /* If the complaint is to be issued under a guard condition, AND
1734 that into the guard condition for the helper call. */
1735 if (guard) {
1736 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1737 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1738 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1739 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1742 setHelperAnns( mce, di );
1743 stmt( 'V', mce, IRStmt_Dirty(di));
1745 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1746 defined -- but only in the case where the guard evaluates to
1747 True at run-time. Do the update by setting the orig->shadow
1748 mapping for tmp to reflect the fact that this shadow is getting
1749 a new value. */
1750 tl_assert(isIRAtom(vatom));
1751 /* sameKindedAtoms ... */
1752 if (vatom->tag == Iex_RdTmp) {
1753 tl_assert(atom->tag == Iex_RdTmp);
1754 if (guard == NULL) {
1755 // guard is 'always True', hence update unconditionally
1756 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1757 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1758 definedOfType(ty));
1759 } else {
1760 // update the temp only conditionally. Do this by copying
1761 // its old value when the guard is False.
1762 // The old value ..
1763 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1764 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1765 IRAtom* new_tmpV
1766 = assignNew('V', mce, shadowTypeV(ty),
1767 IRExpr_ITE(guard, definedOfType(ty),
1768 mkexpr(old_tmpV)));
1769 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1775 /*------------------------------------------------------------*/
1776 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1777 /*------------------------------------------------------------*/
1779 /* Examine the always-defined sections declared in layout to see if
1780    the (offset,size) section is within one.  Note, it is an error to
1781 partially fall into such a region: (offset,size) should either be
1782 completely in such a region or completely not-in such a region.
1784 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1786 Int minoffD, maxoffD, i;
1787 Int minoff = offset;
1788 Int maxoff = minoff + size - 1;
1789 tl_assert((minoff & ~0xFFFF) == 0);
1790 tl_assert((maxoff & ~0xFFFF) == 0);
1792 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1793 minoffD = mce->layout->alwaysDefd[i].offset;
1794 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1795 tl_assert((minoffD & ~0xFFFF) == 0);
1796 tl_assert((maxoffD & ~0xFFFF) == 0);
1798 if (maxoff < minoffD || maxoffD < minoff)
1799 continue; /* no overlap */
1800 if (minoff >= minoffD && maxoff <= maxoffD)
1801 return True; /* completely contained in an always-defd section */
1803 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1805 return False; /* could not find any containing section */
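
/* Illustrative sketch only (kept disabled, not used by Memcheck): the
   containment rule above on plain integers, assuming a single
   hypothetical always-defined slice covering guest offsets [64 .. 71].
   The function name and the slice are made up for the example. */
#if 0
static Bool example_isContained ( Int offset, Int size )
{
   Int minoffD = 64, maxoffD = 64 + 8 - 1;      /* hypothetical slice */
   Int minoff  = offset, maxoff = offset + size - 1;
   if (maxoff < minoffD || maxoffD < minoff)
      return False;      /* no overlap at all */
   if (minoff >= minoffD && maxoff <= maxoffD)
      return True;       /* completely contained */
   /* e.g. offset=60, size=8 straddles the slice; the real routine
      treats that as a fatal error rather than returning. */
   return False;
}
#endif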
1809 /* Generate into bb suitable actions to shadow this Put. If the state
1810 slice is marked 'always defined', do nothing. Otherwise, write the
1811 supplied V bits to the shadow state. We can pass in either an
1812 original atom or a V-atom, but not both. In the former case the
1813 relevant V-bits are then generated from the original.
1814    We assume here that the definedness of GUARD has already been checked.
1816 static
1817 void do_shadow_PUT ( MCEnv* mce, Int offset,
1818 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1820 IRType ty;
1822 // Don't do shadow PUTs if we're not doing undefined value checking.
1823 // Their absence lets Vex's optimiser remove all the shadow computation
1824 // that they depend on, which includes GETs of the shadow registers.
1825 if (MC_(clo_mc_level) == 1)
1826 return;
1828 if (atom) {
1829 tl_assert(!vatom);
1830 tl_assert(isOriginalAtom(mce, atom));
1831 vatom = expr2vbits( mce, atom, HuOth );
1832 } else {
1833 tl_assert(vatom);
1834 tl_assert(isShadowAtom(mce, vatom));
1837 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1838 tl_assert(ty != Ity_I1);
1839 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1840 /* later: no ... */
1841 /* emit code to emit a complaint if any of the vbits are 1. */
1842 /* complainIfUndefined(mce, atom); */
1843 } else {
1844 /* Do a plain shadow Put. */
1845 if (guard) {
1846 /* If the guard expression evaluates to false we simply Put the value
1847 that is already stored in the guest state slot */
1848 IRAtom *cond, *iffalse;
1850 cond = assignNew('V', mce, Ity_I1, guard);
1851 iffalse = assignNew('V', mce, ty,
1852 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1853 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1855 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1860 /* Generate into bb suitable actions to shadow this PutI.  If the state
1861    slice is marked 'always defined', do nothing.  Otherwise, write the V bits of the data to the shadow state.
1863 static
1864 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1866 IRAtom* vatom;
1867 IRType ty, tyS;
1868    Int     arrSize;
1869 IRRegArray* descr = puti->descr;
1870 IRAtom* ix = puti->ix;
1871 Int bias = puti->bias;
1872 IRAtom* atom = puti->data;
1874 // Don't do shadow PUTIs if we're not doing undefined value checking.
1875 // Their absence lets Vex's optimiser remove all the shadow computation
1876 // that they depend on, which includes GETIs of the shadow registers.
1877 if (MC_(clo_mc_level) == 1)
1878 return;
1880 tl_assert(isOriginalAtom(mce,atom));
1881 vatom = expr2vbits( mce, atom, HuOth );
1882 tl_assert(sameKindedAtoms(atom, vatom));
1883 ty = descr->elemTy;
1884 tyS = shadowTypeV(ty);
1885 arrSize = descr->nElems * sizeofIRType(ty);
1886 tl_assert(ty != Ity_I1);
1887 tl_assert(isOriginalAtom(mce,ix));
1888 complainIfUndefined(mce, ix, NULL);
1889 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1890 /* later: no ... */
1891 /* emit code to emit a complaint if any of the vbits are 1. */
1892 /* complainIfUndefined(mce, atom); */
1893 } else {
1894 /* Do a cloned version of the Put that refers to the shadow
1895 area. */
1896 IRRegArray* new_descr
1897 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1898 tyS, descr->nElems);
1899 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1904 /* Return an expression which contains the V bits corresponding to the
1905 given GET (passed in in pieces).
1907 static
1908 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1910 IRType tyS = shadowTypeV(ty);
1911 tl_assert(ty != Ity_I1);
1912 tl_assert(ty != Ity_I128);
1913 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1914 /* Always defined, return all zeroes of the relevant type */
1915 return definedOfType(tyS);
1916 } else {
1917 /* return a cloned version of the Get that refers to the shadow
1918 area. */
1919 /* FIXME: this isn't an atom! */
1920 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1925 /* Return an expression which contains the V bits corresponding to the
1926 given GETI (passed in in pieces).
1928 static
1929 IRExpr* shadow_GETI ( MCEnv* mce,
1930 IRRegArray* descr, IRAtom* ix, Int bias )
1932 IRType ty = descr->elemTy;
1933 IRType tyS = shadowTypeV(ty);
1934 Int arrSize = descr->nElems * sizeofIRType(ty);
1935 tl_assert(ty != Ity_I1);
1936 tl_assert(isOriginalAtom(mce,ix));
1937 complainIfUndefined(mce, ix, NULL);
1938 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1939 /* Always defined, return all zeroes of the relevant type */
1940 return definedOfType(tyS);
1941 } else {
1942 /* return a cloned version of the Get that refers to the shadow
1943 area. */
1944 IRRegArray* new_descr
1945 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1946 tyS, descr->nElems);
1947 return IRExpr_GetI( new_descr, ix, bias );
1952 /*------------------------------------------------------------*/
1953 /*--- Generating approximations for unknown operations, ---*/
1954 /*--- using lazy-propagate semantics ---*/
1955 /*------------------------------------------------------------*/
1957 /* Lazy propagation of undefinedness from two values, resulting in the
1958 specified shadow type.
1960 static
1961 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1963 IRAtom* at;
1964 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1965 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1966 tl_assert(isShadowAtom(mce,va1));
1967 tl_assert(isShadowAtom(mce,va2));
1969 /* The general case is inefficient because PCast is an expensive
1970 operation. Here are some special cases which use PCast only
1971 once rather than twice. */
1973 /* I64 x I64 -> I64 */
1974 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1975 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1976 at = mkUifU(mce, Ity_I64, va1, va2);
1977 at = mkPCastTo(mce, Ity_I64, at);
1978 return at;
1981 /* I64 x I64 -> I32 */
1982 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1983 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1984 at = mkUifU(mce, Ity_I64, va1, va2);
1985 at = mkPCastTo(mce, Ity_I32, at);
1986 return at;
1989 /* I32 x I32 -> I32 */
1990 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1991 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1992 at = mkUifU(mce, Ity_I32, va1, va2);
1993 at = mkPCastTo(mce, Ity_I32, at);
1994 return at;
1997 if (0) {
1998 VG_(printf)("mkLazy2 ");
1999 ppIRType(t1);
2000 VG_(printf)("_");
2001 ppIRType(t2);
2002 VG_(printf)("_");
2003 ppIRType(finalVty);
2004 VG_(printf)("\n");
2007 /* General case: force everything via 32-bit intermediaries. */
2008 at = mkPCastTo(mce, Ity_I32, va1);
2009 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2010 at = mkPCastTo(mce, finalVty, at);
2011 return at;
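
/* Worked example (illustrative only, not part of the instrumentation;
   the helper name is made up): with the V-bit convention that 1 means
   "undefined", the I32 x I32 -> I32 special case above is just an OR
   followed by an all-or-nothing smear.  E.g. va1 = 0x00000100 (one
   undefined bit), va2 = 0 (fully defined): UifU gives 0x00000100 and
   the PCast turns that into 0xFFFFFFFF, i.e. the whole lazily-derived
   result is flagged undefined. */
#if 0
static UInt example_lazy2_I32 ( UInt va1, UInt va2 )
{
   UInt u = va1 | va2;                   /* mkUifU at Ity_I32       */
   return u == 0 ? 0x0 : 0xFFFFFFFFU;    /* mkPCastTo at Ity_I32    */
}
#endif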
2015 /* 3-arg version of the above. */
2016 static
2017 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
2018 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
2020 IRAtom* at;
2021 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2022 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2023 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2024 tl_assert(isShadowAtom(mce,va1));
2025 tl_assert(isShadowAtom(mce,va2));
2026 tl_assert(isShadowAtom(mce,va3));
2028 /* The general case is inefficient because PCast is an expensive
2029 operation. Here are some special cases which use PCast only
2030 twice rather than three times. */
2032 /* I32 x I64 x I64 -> I64 */
2033 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2034 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2035 && finalVty == Ity_I64) {
2036 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
2037 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2038 mode indication which is fully defined, this should get
2039 folded out later. */
2040 at = mkPCastTo(mce, Ity_I64, va1);
2041 /* Now fold in 2nd and 3rd args. */
2042 at = mkUifU(mce, Ity_I64, at, va2);
2043 at = mkUifU(mce, Ity_I64, at, va3);
2044 /* and PCast once again. */
2045 at = mkPCastTo(mce, Ity_I64, at);
2046 return at;
2049 /* I32 x I8 x I64 -> I64 */
2050 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
2051 && finalVty == Ity_I64) {
2052 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
2053 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
2054 * rounding mode indication which is fully defined, this should
2055 * get folded out later.
2057 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2058 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2059 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2060 at = mkUifU(mce, Ity_I64, at, va3);
2061 /* and PCast once again. */
2062 at = mkPCastTo(mce, Ity_I64, at);
2063 return at;
2066 /* I32 x I64 x I64 -> I32 */
2067 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2068 && finalVty == Ity_I32) {
2069 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
2070 at = mkPCastTo(mce, Ity_I64, va1);
2071 at = mkUifU(mce, Ity_I64, at, va2);
2072 at = mkUifU(mce, Ity_I64, at, va3);
2073 at = mkPCastTo(mce, Ity_I32, at);
2074 return at;
2077 /* I32 x I32 x I32 -> I32 */
2078 /* 32-bit FP idiom, as (eg) happens on ARM */
2079 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
2080 && finalVty == Ity_I32) {
2081 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
2082 at = va1;
2083 at = mkUifU(mce, Ity_I32, at, va2);
2084 at = mkUifU(mce, Ity_I32, at, va3);
2085 at = mkPCastTo(mce, Ity_I32, at);
2086 return at;
2089 /* I32 x I16 x I16 -> I16 */
2090 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
2091 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
2092 && finalVty == Ity_I16) {
2093 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
2094 at = mkPCastTo(mce, Ity_I16, va1);
2095 at = mkUifU(mce, Ity_I16, at, va2);
2096 at = mkUifU(mce, Ity_I16, at, va3);
2097 at = mkPCastTo(mce, Ity_I16, at);
2098 return at;
2101 /* I32 x I128 x I128 -> I128 */
2102 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2103 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
2104 && finalVty == Ity_I128) {
2105 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
2106 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2107 mode indication which is fully defined, this should get
2108 folded out later. */
2109 at = mkPCastTo(mce, Ity_I128, va1);
2110 /* Now fold in 2nd and 3rd args. */
2111 at = mkUifU(mce, Ity_I128, at, va2);
2112 at = mkUifU(mce, Ity_I128, at, va3);
2113 /* and PCast once again. */
2114 at = mkPCastTo(mce, Ity_I128, at);
2115 return at;
2118 /* I32 x I8 x I128 -> I128 */
2119 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2120 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2121 && finalVty == Ity_I128) {
2122 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2123 /* Use I64 as an intermediate type, which means PCasting all 3
2124 args to I64 to start with. 1st arg is typically a rounding
2125 mode indication which is fully defined, so we hope that it
2126 will get folded out later. */
2127 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2128 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2129 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2130 /* Now UifU all three together. */
2131 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2132 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2133 /* and PCast once again. */
2134 at = mkPCastTo(mce, Ity_I128, at);
2135 return at;
2137 if (1) {
2138 VG_(printf)("mkLazy3: ");
2139 ppIRType(t1);
2140 VG_(printf)(" x ");
2141 ppIRType(t2);
2142 VG_(printf)(" x ");
2143 ppIRType(t3);
2144 VG_(printf)(" -> ");
2145 ppIRType(finalVty);
2146 VG_(printf)("\n");
2149 tl_assert(0);
2150 /* General case: force everything via 32-bit intermediaries. */
2152 at = mkPCastTo(mce, Ity_I32, va1);
2153 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2154 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2155 at = mkPCastTo(mce, finalVty, at);
2156 return at;
2161 /* 4-arg version of the above. */
2162 static
2163 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2164 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2166 IRAtom* at;
2167 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2168 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2169 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2170 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2171 tl_assert(isShadowAtom(mce,va1));
2172 tl_assert(isShadowAtom(mce,va2));
2173 tl_assert(isShadowAtom(mce,va3));
2174 tl_assert(isShadowAtom(mce,va4));
2176 /* The general case is inefficient because PCast is an expensive
2177 operation. Here are some special cases which use PCast only
2178 twice rather than three times. */
2180 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2182 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2183 && finalVty == Ity_I128) {
2184 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2185 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2186 mode indication which is fully defined, this should get
2187 folded out later. */
2188 at = mkPCastTo(mce, Ity_I128, va1);
2189 /* Now fold in 2nd, 3rd, 4th args. */
2190 at = mkUifU(mce, Ity_I128, at, va2);
2191 at = mkUifU(mce, Ity_I128, at, va3);
2192 at = mkUifU(mce, Ity_I128, at, va4);
2193 /* and PCast once again. */
2194 at = mkPCastTo(mce, Ity_I128, at);
2195 return at;
2198 /* I32 x I64 x I64 x I64 -> I64 */
2199 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2200 && finalVty == Ity_I64) {
2201 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2202 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2203 mode indication which is fully defined, this should get
2204 folded out later. */
2205 at = mkPCastTo(mce, Ity_I64, va1);
2206 /* Now fold in 2nd, 3rd, 4th args. */
2207 at = mkUifU(mce, Ity_I64, at, va2);
2208 at = mkUifU(mce, Ity_I64, at, va3);
2209 at = mkUifU(mce, Ity_I64, at, va4);
2210 /* and PCast once again. */
2211 at = mkPCastTo(mce, Ity_I64, at);
2212 return at;
2214 /* I32 x I32 x I32 x I32 -> I32 */
2215 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2216 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2217 && finalVty == Ity_I32) {
2218 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2219 at = va1;
2220 /* Now fold in 2nd, 3rd, 4th args. */
2221 at = mkUifU(mce, Ity_I32, at, va2);
2222 at = mkUifU(mce, Ity_I32, at, va3);
2223 at = mkUifU(mce, Ity_I32, at, va4);
2224 at = mkPCastTo(mce, Ity_I32, at);
2225 return at;
2228 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2229 && finalVty == Ity_I32) {
2230 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2231 at = mkPCastTo(mce, Ity_I8, va1);
2232 /* Now fold in 2nd, 3rd, 4th args. */
2233 at = mkUifU(mce, Ity_I8, at, va2);
2234 at = mkUifU(mce, Ity_I8, at, va3);
2235 at = mkUifU(mce, Ity_I8, at, va4);
2236 at = mkPCastTo(mce, Ity_I32, at);
2237 return at;
2240 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2241 && finalVty == Ity_I64) {
2242 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2243 at = mkPCastTo(mce, Ity_I8, va1);
2244 /* Now fold in 2nd, 3rd, 4th args. */
2245 at = mkUifU(mce, Ity_I8, at, va2);
2246 at = mkUifU(mce, Ity_I8, at, va3);
2247 at = mkUifU(mce, Ity_I8, at, va4);
2248 at = mkPCastTo(mce, Ity_I64, at);
2249 return at;
2252 if (1) {
2253 VG_(printf)("mkLazy4: ");
2254 ppIRType(t1);
2255 VG_(printf)(" x ");
2256 ppIRType(t2);
2257 VG_(printf)(" x ");
2258 ppIRType(t3);
2259 VG_(printf)(" x ");
2260 ppIRType(t4);
2261 VG_(printf)(" -> ");
2262 ppIRType(finalVty);
2263 VG_(printf)("\n");
2266 tl_assert(0);
2270 /* Do the lazy propagation game from a null-terminated vector of
2271 atoms. This is presumably the arguments to a helper call, so the
2272 IRCallee info is also supplied in order that we can know which
2273 arguments should be ignored (via the .mcx_mask field).
2275 static
2276 IRAtom* mkLazyN ( MCEnv* mce,
2277 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2279 Int i;
2280 IRAtom* here;
2281 IRAtom* curr;
2282 IRType mergeTy;
2283 Bool mergeTy64 = True;
2285 /* Decide on the type of the merge intermediary. If all relevant
2286 args are I64, then it's I64. In all other circumstances, use
2287 I32. */
2288 for (i = 0; exprvec[i]; i++) {
2289 tl_assert(i < 32);
2290 tl_assert(isOriginalAtom(mce, exprvec[i]));
2291 if (cee->mcx_mask & (1<<i))
2292 continue;
2293 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2294 mergeTy64 = False;
2297 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2298 curr = definedOfType(mergeTy);
2300 for (i = 0; exprvec[i]; i++) {
2301 tl_assert(i < 32);
2302 tl_assert(isOriginalAtom(mce, exprvec[i]));
2303 /* Only take notice of this arg if the callee's mc-exclusion
2304 mask does not say it is to be excluded. */
2305 if (cee->mcx_mask & (1<<i)) {
2306 /* the arg is to be excluded from definedness checking. Do
2307 nothing. */
2308 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2309 } else {
2310 /* calculate the arg's definedness, and pessimistically merge
2311 it in. */
2312 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2313 curr = mergeTy64
2314 ? mkUifU64(mce, here, curr)
2315 : mkUifU32(mce, here, curr);
2318 return mkPCastTo(mce, finalVtype, curr );
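
/* Illustrative sketch only: how the .mcx_mask test above behaves.  Bit
   i of mcx_mask set means "do not check the definedness of argument i".
   The scenario below is hypothetical; only the field and its meaning
   come from the code above. */
#if 0
static void example_mcx_mask ( IRCallee* cee )
{
   /* Suppose the helper's second argument is a scratch value whose
      definedness is irrelevant: */
   cee->mcx_mask = 1 << 1;
   /* mkLazyN will then skip exprvec[1] entirely when merging V bits,
      while the remaining arguments still feed the pessimistic summary
      that becomes the call's shadow result. */
}
#endif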
2322 /*------------------------------------------------------------*/
2323 /*--- Generating expensive sequences for exact carry-chain ---*/
2324 /*--- propagation in add/sub and related operations. ---*/
2325 /*------------------------------------------------------------*/
2327 static
2328 IRAtom* expensiveAddSub ( MCEnv* mce,
2329 Bool add,
2330 IRType ty,
2331 IRAtom* qaa, IRAtom* qbb,
2332 IRAtom* aa, IRAtom* bb )
2334 IRAtom *a_min, *b_min, *a_max, *b_max;
2335 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2337 tl_assert(isShadowAtom(mce,qaa));
2338 tl_assert(isShadowAtom(mce,qbb));
2339 tl_assert(isOriginalAtom(mce,aa));
2340 tl_assert(isOriginalAtom(mce,bb));
2341 tl_assert(sameKindedAtoms(qaa,aa));
2342 tl_assert(sameKindedAtoms(qbb,bb));
2344 switch (ty) {
2345 case Ity_I32:
2346 opAND = Iop_And32;
2347 opOR = Iop_Or32;
2348 opXOR = Iop_Xor32;
2349 opNOT = Iop_Not32;
2350 opADD = Iop_Add32;
2351 opSUB = Iop_Sub32;
2352 break;
2353 case Ity_I64:
2354 opAND = Iop_And64;
2355 opOR = Iop_Or64;
2356 opXOR = Iop_Xor64;
2357 opNOT = Iop_Not64;
2358 opADD = Iop_Add64;
2359 opSUB = Iop_Sub64;
2360 break;
2361 default:
2362 VG_(tool_panic)("expensiveAddSub");
2365 // a_min = aa & ~qaa
2366 a_min = assignNew('V', mce,ty,
2367 binop(opAND, aa,
2368 assignNew('V', mce,ty, unop(opNOT, qaa))));
2370 // b_min = bb & ~qbb
2371 b_min = assignNew('V', mce,ty,
2372 binop(opAND, bb,
2373 assignNew('V', mce,ty, unop(opNOT, qbb))));
2375 // a_max = aa | qaa
2376 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2378 // b_max = bb | qbb
2379 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2381 if (add) {
2382 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2383 return
2384 assignNew('V', mce,ty,
2385 binop( opOR,
2386 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2387 assignNew('V', mce,ty,
2388 binop( opXOR,
2389 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2390 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2395 } else {
2396 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2397 return
2398 assignNew('V', mce,ty,
2399 binop( opOR,
2400 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2401 assignNew('V', mce,ty,
2402 binop( opXOR,
2403 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2404 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
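
/* Worked example (illustrative only; the function name is made up):
   the interval trick above on concrete 32-bit values.  Take aa = 0x10
   with qaa = 0x01 (only bit 0 undefined) and bb = 0x20 fully defined
   (qbb = 0).  Then a_min = 0x10, a_max = 0x11, b_min = b_max = 0x20,
   and (a_min+b_min) ^ (a_max+b_max) = 0x30 ^ 0x31 = 0x01, so the
   result's V bits are (qaa|qbb) | 0x01 = 0x01: only bit 0 of the sum
   is reported undefined, rather than everything from bit 0 upwards as
   a cheaper left-smearing scheme would report. */
#if 0
static UInt example_add_vbits32 ( UInt qaa, UInt qbb, UInt aa, UInt bb )
{
   UInt a_min = aa & ~qaa, b_min = bb & ~qbb;
   UInt a_max = aa |  qaa, b_max = bb |  qbb;
   return (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max));
}
#endif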
2414 static
2415 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2416 IRAtom* atom, IRAtom* vatom )
2418 IRType ty;
2419 IROp xorOp, subOp, andOp;
2420 IRExpr *one;
2421 IRAtom *improver, *improved;
2422 tl_assert(isShadowAtom(mce,vatom));
2423 tl_assert(isOriginalAtom(mce,atom));
2424 tl_assert(sameKindedAtoms(atom,vatom));
2426 switch (czop) {
2427 case Iop_Ctz32: case Iop_CtzNat32:
2428 ty = Ity_I32;
2429 xorOp = Iop_Xor32;
2430 subOp = Iop_Sub32;
2431 andOp = Iop_And32;
2432 one = mkU32(1);
2433 break;
2434 case Iop_Ctz64: case Iop_CtzNat64:
2435 ty = Ity_I64;
2436 xorOp = Iop_Xor64;
2437 subOp = Iop_Sub64;
2438 andOp = Iop_And64;
2439 one = mkU64(1);
2440 break;
2441 default:
2442 ppIROp(czop);
2443 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2446 // improver = atom ^ (atom - 1)
2448 // That is, improver has its low ctz(atom)+1 bits equal to one;
2449 // higher bits (if any) equal to zero. So it's exactly the right
2450 // mask to use to remove the irrelevant undefined input bits.
2451 /* Here are some examples:
2452 atom = U...U 1 0...0
2453 atom-1 = U...U 0 1...1
2454 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2455 actually influence the result
2456 A boundary case
2457 atom = 0...0
2458 atom-1 = 1...1
2459 ^ed = 11111, also a correct mask for the input: all input bits
2460 are relevant
2461 Another boundary case
2462 atom = 1..1 1
2463 atom-1 = 1..1 0
2464 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2465 is relevant
2466 Now with misc U bits interspersed:
2467 atom = U...U 1 0 U...U 0 1 0...0
2468 atom-1 = U...U 1 0 U...U 0 0 1...1
2469 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2470 (Per re-check/analysis of 14 Nov 2018)
2472 improver = assignNew('V', mce,ty,
2473 binop(xorOp,
2474 atom,
2475 assignNew('V', mce, ty,
2476 binop(subOp, atom, one))));
2478 // improved = vatom & improver
2480 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2481 // bits as "defined".
2482 improved = assignNew('V', mce, ty,
2483 binop(andOp, vatom, improver));
2485 // Return pessimizing cast of improved.
2486 return mkPCastTo(mce, ty, improved);
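
/* Concrete instance of the scheme above (illustrative only; the name
   is made up): with atom = 0x0000000C (binary ...1100) and
   vatom = 0xFFFF0000 (top half undefined), improver = atom ^ (atom-1)
   = 0xC ^ 0xB = 0x7, so improved = vatom & 0x7 = 0, and the final
   PCast reports the Ctz result as fully defined -- correct, because
   only bits 0..2 of atom can influence Ctz32(atom) here, and those
   bits are all defined. */
#if 0
static UInt example_ctz32_vbits ( UInt atom, UInt vatom )
{
   UInt improver = atom ^ (atom - 1);    /* low ctz(atom)+1 bits set  */
   UInt improved = vatom & improver;     /* drop irrelevant V bits    */
   return improved == 0 ? 0x0 : 0xFFFFFFFFU;   /* pessimising cast    */
}
#endif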
2489 static
2490 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2491 IRAtom* atom, IRAtom* vatom )
2493 IRType ty;
2494 IROp shrOp, notOp, andOp;
2495 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2496 IRAtom *improver, *improved;
2497 tl_assert(isShadowAtom(mce,vatom));
2498 tl_assert(isOriginalAtom(mce,atom));
2499 tl_assert(sameKindedAtoms(atom,vatom));
2501 switch (czop) {
2502 case Iop_Clz32: case Iop_ClzNat32:
2503 ty = Ity_I32;
2504 shrOp = Iop_Shr32;
2505 notOp = Iop_Not32;
2506 andOp = Iop_And32;
2507 mkRight = mkRight32;
2508 break;
2509 case Iop_Clz64: case Iop_ClzNat64:
2510 ty = Ity_I64;
2511 shrOp = Iop_Shr64;
2512 notOp = Iop_Not64;
2513 andOp = Iop_And64;
2514 mkRight = mkRight64;
2515 break;
2516 default:
2517 ppIROp(czop);
2518 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2521 // This is in principle very similar to how expensiveCountTrailingZeroes
2522 // works. That function computed an "improver", which it used to mask
2523 // off all but the rightmost 1-bit and the zeroes to the right of it,
2524 // hence removing irrelevant bits from the input. Here, we play the
2525 // exact same game but with the left-vs-right roles interchanged.
2526 // Unfortunately calculation of the improver in this case is
2527 // significantly more expensive.
2529 // improver = ~(RIGHT(atom) >>u 1)
2531 // That is, improver has its upper clz(atom)+1 bits equal to one;
2532 // lower bits (if any) equal to zero. So it's exactly the right
2533 // mask to use to remove the irrelevant undefined input bits.
2534 /* Here are some examples:
2535 atom = 0...0 1 U...U
2536 R(atom) = 0...0 1 1...1
2537 R(atom) >>u 1 = 0...0 0 1...1
2538 ~(R(atom) >>u 1) = 1...1 1 0...0
2539 which correctly describes which bits of |atom|
2540 actually influence the result
2541 A boundary case
2542 atom = 0...0
2543 R(atom) = 0...0
2544 R(atom) >>u 1 = 0...0
2545 ~(R(atom) >>u 1) = 1...1
2546 also a correct mask for the input: all input bits
2547 are relevant
2548 Another boundary case
2549 atom = 1 1..1
2550 R(atom) = 1 1..1
2551 R(atom) >>u 1 = 0 1..1
2552 ~(R(atom) >>u 1) = 1 0..0
2553 also a correct mask: only the leftmost input bit
2554 is relevant
2555 Now with misc U bits interspersed:
2556 atom = 0...0 1 U...U 0 1 U...U
2557 R(atom) = 0...0 1 1...1 1 1 1...1
2558 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2559 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2560 (Per initial implementation of 15 Nov 2018)
2562 improver = mkRight(mce, atom);
2563 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2564 improver = assignNew('V', mce, ty, unop(notOp, improver));
2566 // improved = vatom & improver
2568 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2569 // bits as "defined".
2570 improved = assignNew('V', mce, ty,
2571 binop(andOp, vatom, improver));
2573 // Return pessimizing cast of improved.
2574 return mkPCastTo(mce, ty, improved);
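
/* Concrete instance (illustrative only), using plain C stand-ins for
   the IR ops and for mkRight (modelled here, per the examples above,
   as "set every bit at and to the right of the leftmost 1").  With
   atom = 0x00080000 and vatom = 0x0000FFFF (only the bottom 16 bits
   undefined): RIGHT(atom) = 0x000FFFFF, improver = ~(0x000FFFFF >> 1)
   = 0xFFF80000, improved = vatom & improver = 0, so the Clz result is
   reported fully defined -- correct, since the undefined bits all lie
   to the right of the leading 1. */
#if 0
static UInt example_clz32_vbits ( UInt atom, UInt vatom )
{
   UInt right = atom;               /* smear the leading 1 rightwards */
   right |= right >> 1;  right |= right >> 2;  right |= right >> 4;
   right |= right >> 8;  right |= right >> 16;
   UInt improver = ~(right >> 1);
   UInt improved = vatom & improver;
   return improved == 0 ? 0x0 : 0xFFFFFFFFU;   /* pessimising cast */
}
#endif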
2578 /*------------------------------------------------------------*/
2579 /*--- Scalar shifts. ---*/
2580 /*------------------------------------------------------------*/
2582 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2583 idea is to shift the definedness bits by the original shift amount.
2584 This introduces 0s ("defined") in new positions for left shifts and
2585 unsigned right shifts, and copies the top definedness bit for
2586 signed right shifts. So, conveniently, applying the original shift
2587 operator to the definedness bits for the left arg is exactly the
2588 right thing to do:
2590 (qaa << bb)
2592 However if the shift amount is undefined then the whole result
2593 is undefined. Hence need:
2595 (qaa << bb) `UifU` PCast(qbb)
2597    If the shift amount bb is a literal then qbb will say 'all defined'
2598 and the UifU and PCast will get folded out by post-instrumentation
2599 optimisation.
2601 static IRAtom* scalarShift ( MCEnv* mce,
2602 IRType ty,
2603 IROp original_op,
2604 IRAtom* qaa, IRAtom* qbb,
2605 IRAtom* aa, IRAtom* bb )
2607 tl_assert(isShadowAtom(mce,qaa));
2608 tl_assert(isShadowAtom(mce,qbb));
2609 tl_assert(isOriginalAtom(mce,aa));
2610 tl_assert(isOriginalAtom(mce,bb));
2611 tl_assert(sameKindedAtoms(qaa,aa));
2612 tl_assert(sameKindedAtoms(qbb,bb));
2613 return
2614 assignNew(
2615 'V', mce, ty,
2616 mkUifU( mce, ty,
2617 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2618 mkPCastTo(mce, ty, qbb)
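
/* Worked example (illustrative only; the helper is made up): for a
   32-bit left shift by the constant 4, with qaa = 0x00000081 (bits 0
   and 7 of aa undefined), the scheme above gives (qaa << 4) =
   0x00000810: the undefined bits move with the data and the vacated
   low bits become defined.  Since bb is a literal, qbb is all zeroes,
   so the PCast term contributes nothing and folds away. */
#if 0
static UInt example_shl32_vbits ( UInt qaa, UInt qbb, UInt bb )
{
   UInt shifted = qaa << bb;                     /* original op on V bits */
   UInt smear   = (qbb == 0) ? 0 : 0xFFFFFFFFU;  /* PCast of shift amount */
   return shifted | smear;                       /* UifU                  */
}
#endif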
2624 /*------------------------------------------------------------*/
2625 /*--- Helpers for dealing with vector primops. ---*/
2626 /*------------------------------------------------------------*/
2628 /* Vector pessimisation -- pessimise within each lane individually. */
2630 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2632 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2635 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2637 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2640 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2642 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2645 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2647 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2650 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2652 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2655 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2657 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2660 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2662 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2665 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2667 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2670 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2672 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2675 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2677 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2680 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2682 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2685 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2687 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2690 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2692 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2695 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2697 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
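
/* Lane-by-lane pessimisation, concretely (illustrative only; the
   helper is made up): for the 32x2 case, CmpNEZ32x2 turns each
   nonzero (i.e. partly undefined) 32-bit lane of V bits into all-ones
   and leaves fully-defined lanes at zero.  E.g. 0x00000001_00000000
   becomes 0xFFFFFFFF_00000000: the upper lane, which had a single
   undefined bit, is flagged entirely undefined, while the lower lane
   stays entirely defined. */
#if 0
static ULong example_pcast32x2 ( ULong vbits )
{
   UInt  hi  = (UInt)(vbits >> 32), lo = (UInt)vbits;
   ULong hiP = hi == 0 ? 0 : 0xFFFFFFFFULL;
   ULong loP = lo == 0 ? 0 : 0xFFFFFFFFULL;
   return (hiP << 32) | loP;
}
#endif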
2701 /* Here's a simple scheme capable of handling ops derived from SSE1
2702 code and while only generating ops that can be efficiently
2703 implemented in SSE1. */
2705 /* All-lanes versions are straightforward:
2707 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2709 unary32Fx4(x,y) ==> PCast32x4(x#)
2711 Lowest-lane-only versions are more complex:
2713 binary32F0x4(x,y) ==> SetV128lo32(
2714 x#,
2715 PCast32(V128to32(UifUV128(x#,y#)))
2718 This is perhaps not so obvious. In particular, it's faster to
2719 do a V128-bit UifU and then take the bottom 32 bits than the more
2720 obvious scheme of taking the bottom 32 bits of each operand
2721 and doing a 32-bit UifU. Basically since UifU is fast and
2722 chopping lanes off vector values is slow.
2724 Finally:
2726 unary32F0x4(x) ==> SetV128lo32(
2727 x#,
2728 PCast32(V128to32(x#))
2731 Where:
2733 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2734 PCast32x4(v#) = CmpNEZ32x4(v#)
2737 static
2738 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2740 IRAtom* at;
2741 tl_assert(isShadowAtom(mce, vatomX));
2742 tl_assert(isShadowAtom(mce, vatomY));
2743 at = mkUifUV128(mce, vatomX, vatomY);
2744 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2745 return at;
2748 static
2749 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2751 IRAtom* at;
2752 tl_assert(isShadowAtom(mce, vatomX));
2753 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2754 return at;
2757 static
2758 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV128(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2765 at = mkPCastTo(mce, Ity_I32, at);
2766 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2767 return at;
2770 static
2771 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2773 IRAtom* at;
2774 tl_assert(isShadowAtom(mce, vatomX));
2775 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2776 at = mkPCastTo(mce, Ity_I32, at);
2777 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2778 return at;
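
/* Worked example (illustrative only; names are made up) of the
   lowest-lane-only scheme described above, for binary32F0x4: if
   x# = [A3 A2 A1 A0] and y# = [B3 B2 B1 B0] are the 32-bit V-bit
   lanes, the result shadow is [A3 A2 A1 PCast32(A0|B0)] -- the upper
   three lanes are passed through from x# by SetV128lo32, and only the
   bottom lane is recomputed. */
#if 0
static void example_binary32F0x4_vbits ( UInt xV[4], UInt yV[4],
                                         UInt resV[4] )
{
   UInt u0 = xV[0] | yV[0];                /* UifUV128, then keep lane 0 */
   resV[0] = u0 == 0 ? 0 : 0xFFFFFFFFU;    /* PCast32                    */
   resV[1] = xV[1];                        /* SetV128lo32 keeps the      */
   resV[2] = xV[2];                        /*   upper lanes of x#        */
   resV[3] = xV[3];
}
#endif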
2781 /* --- ... and ... 64Fx2 versions of the same ... --- */
2783 static
2784 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2786 IRAtom* at;
2787 tl_assert(isShadowAtom(mce, vatomX));
2788 tl_assert(isShadowAtom(mce, vatomY));
2789 at = mkUifUV128(mce, vatomX, vatomY);
2790 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2791 return at;
2794 static
2795 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2797 IRAtom* at;
2798 tl_assert(isShadowAtom(mce, vatomX));
2799 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2800 return at;
2803 static
2804 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2806 IRAtom* at;
2807 tl_assert(isShadowAtom(mce, vatomX));
2808 tl_assert(isShadowAtom(mce, vatomY));
2809 at = mkUifUV128(mce, vatomX, vatomY);
2810 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2811 at = mkPCastTo(mce, Ity_I64, at);
2812 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2813 return at;
2816 static
2817 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2819 IRAtom* at;
2820 tl_assert(isShadowAtom(mce, vatomX));
2821 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2822 at = mkPCastTo(mce, Ity_I64, at);
2823 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2824 return at;
2827 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2829 static
2830 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2832 IRAtom* at;
2833 tl_assert(isShadowAtom(mce, vatomX));
2834 tl_assert(isShadowAtom(mce, vatomY));
2835 at = mkUifUV128(mce, vatomX, vatomY);
2836 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2837 return at;
2840 static
2841 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2843 IRAtom* at;
2844 tl_assert(isShadowAtom(mce, vatomX));
2845 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2846 return at;
2849 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2850 implemented.
2853 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2855 static
2856 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2858 IRAtom* at;
2859 tl_assert(isShadowAtom(mce, vatomX));
2860 tl_assert(isShadowAtom(mce, vatomY));
2861 at = mkUifU64(mce, vatomX, vatomY);
2862 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2863 return at;
2866 static
2867 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2869 IRAtom* at;
2870 tl_assert(isShadowAtom(mce, vatomX));
2871 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2872 return at;
2875 /* --- ... and ... 64Fx4 versions of the same ... --- */
2877 static
2878 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2880 IRAtom* at;
2881 tl_assert(isShadowAtom(mce, vatomX));
2882 tl_assert(isShadowAtom(mce, vatomY));
2883 at = mkUifUV256(mce, vatomX, vatomY);
2884 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2885 return at;
2888 static
2889 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2891 IRAtom* at;
2892 tl_assert(isShadowAtom(mce, vatomX));
2893 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2894 return at;
2897 /* --- ... and ... 32Fx8 versions of the same ... --- */
2899 static
2900 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2902 IRAtom* at;
2903 tl_assert(isShadowAtom(mce, vatomX));
2904 tl_assert(isShadowAtom(mce, vatomY));
2905 at = mkUifUV256(mce, vatomX, vatomY);
2906 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2907 return at;
2910 static
2911 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2913 IRAtom* at;
2914 tl_assert(isShadowAtom(mce, vatomX));
2915 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2916 return at;
2919 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2921 static
2922 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2923 IRAtom* vatomX, IRAtom* vatomY )
2925 /* This is the same as binary64Fx2, except that we subsequently
2926 pessimise vRM (definedness of the rounding mode), widen to 128
2927 bits and UifU it into the result. As with the scalar cases, if
2928 the RM is a constant then it is defined and so this extra bit
2929 will get constant-folded out later. */
2930 // "do" the vector args
2931 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2932 // PCast the RM, and widen it to 128 bits
2933 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2934 // Roll it into the result
2935 t1 = mkUifUV128(mce, t1, t2);
2936 return t1;
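
/* Illustrative only (the helper is made up): the effect of folding the
   rounding-mode V bits in, modelled for a single 64-bit lane.  If the
   rounding mode is a constant (vRM fully defined), the widened PCast
   term is all zeroes and the UifU leaves the vector result untouched;
   if any bit of the rounding mode is undefined, every lane of the
   result becomes undefined. */
#if 0
static ULong example_fold_rm_into_lane ( ULong laneV, UInt rmV )
{
   ULong rmSmear = rmV == 0 ? 0 : 0xFFFFFFFFFFFFFFFFULL; /* PCast+widen */
   return laneV | rmSmear;                               /* UifU        */
}
#endif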
2939 /* --- ... and ... 32Fx4 versions of the same --- */
2941 static
2942 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2943 IRAtom* vatomX, IRAtom* vatomY )
2945 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2946 // PCast the RM, and widen it to 128 bits
2947 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2948 // Roll it into the result
2949 t1 = mkUifUV128(mce, t1, t2);
2950 return t1;
2953 /* --- ... and ... 64Fx4 versions of the same --- */
2955 static
2956 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2957 IRAtom* vatomX, IRAtom* vatomY )
2959 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2960 // PCast the RM, and widen it to 256 bits
2961 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2962 // Roll it into the result
2963 t1 = mkUifUV256(mce, t1, t2);
2964 return t1;
2967 /* --- ... and ... 16Fx8 versions of the same --- */
2969 static
2970 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2971 IRAtom* vatomX, IRAtom* vatomY )
2973 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2974 // PCast the RM, and widen it to 128 bits
2975 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2976 // Roll it into the result
2977 t1 = mkUifUV128(mce, t1, t2);
2978 return t1;
2981 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2982 implemented.
2985 /* --- ... and ... 32Fx8 versions of the same --- */
2987 static
2988 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2989 IRAtom* vatomX, IRAtom* vatomY )
2991 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2992 // PCast the RM, and widen it to 256 bits
2993 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2994 // Roll it into the result
2995 t1 = mkUifUV256(mce, t1, t2);
2996 return t1;
2999 /* --- 64Fx2 unary FP ops, with rounding mode --- */
3001 static
3002 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3004 /* Same scheme as binary64Fx2_w_rm. */
3005 // "do" the vector arg
3006 IRAtom* t1 = unary64Fx2(mce, vatomX);
3007 // PCast the RM, and widen it to 128 bits
3008 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3009 // Roll it into the result
3010 t1 = mkUifUV128(mce, t1, t2);
3011 return t1;
3014 /* --- ... and ... 32Fx4 versions of the same --- */
3016 static
3017 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3019    /* Same scheme as unary64Fx2_w_rm. */
3020 IRAtom* t1 = unary32Fx4(mce, vatomX);
3021 // PCast the RM, and widen it to 128 bits
3022 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3023 // Roll it into the result
3024 t1 = mkUifUV128(mce, t1, t2);
3025 return t1;
3028 /* --- ... and ... 16Fx8 versions of the same --- */
3030 static
3031 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3033    /* Same scheme as unary64Fx2_w_rm. */
3034 IRAtom* t1 = unary16Fx8(mce, vatomX);
3035 // PCast the RM, and widen it to 128 bits
3036 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3037 // Roll it into the result
3038 t1 = mkUifUV128(mce, t1, t2);
3039 return t1;
3042 /* --- ... and ... 32Fx8 versions of the same --- */
3044 static
3045 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3047    /* Same scheme as unary32Fx4_w_rm, but at 256 bits. */
3048 IRAtom* t1 = unary32Fx8(mce, vatomX);
3049 // PCast the RM, and widen it to 256 bits
3050 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
3051 // Roll it into the result
3052 t1 = mkUifUV256(mce, t1, t2);
3053 return t1;
3057 /* --- --- Vector saturated narrowing --- --- */
3059 /* We used to do something very clever here, but on closer inspection
3060 (2011-Jun-15), and in particular bug #279698, it turns out to be
3061 wrong. Part of the problem came from the fact that for a long
3062 time, the IR primops to do with saturated narrowing were
3063 underspecified and managed to confuse multiple cases which needed
3064 to be separate: the op names had a signedness qualifier, but in
3065 fact the source and destination signednesses needed to be specified
3066 independently, so the op names really need two independent
3067 signedness specifiers.
3069 As of 2011-Jun-15 (ish) the underspecification was sorted out
3070 properly. The incorrect instrumentation remained, though. That
3071 has now (2011-Oct-22) been fixed.
3073 What we now do is simple:
3075 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
3076 number of lanes, X is the source lane width and signedness, and Y
3077 is the destination lane width and signedness. In all cases the
3078 destination lane width is half the source lane width, so the names
3079 have a bit of redundancy, but are at least easy to read.
3081 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
3082 to unsigned 16s.
3084 Let Vanilla(OP) be a function that takes OP, one of these
3085 saturating narrowing ops, and produces the same "shaped" narrowing
3086 op which is not saturating, but merely dumps the most significant
3087 bits. "same shape" means that the lane numbers and widths are the
3088 same as with OP.
3090 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
3091 = Iop_NarrowBin32to16x8,
3092 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
3093 dumping the top half of each lane.
3095 So, with that in place, the scheme is simple, and it is simple to
3096 pessimise each lane individually and then apply Vanilla(OP) so as
3097 to get the result in the right "shape". If the original OP is
3098 QNarrowBinXtoYxZ then we produce
3100 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
3102 or for the case when OP is unary (Iop_QNarrowUn*)
3104 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
3106 static
3107 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
3109 switch (qnarrowOp) {
3110 /* Binary: (128, 128) -> 128 */
3111       case Iop_QNarrowBin16Sto8Ux16:
3112       case Iop_QNarrowBin16Sto8Sx16:
3113       case Iop_QNarrowBin16Uto8Ux16:
              return Iop_NarrowBin16to8x16;
3114       case Iop_QNarrowBin64Sto32Sx4:
3115       case Iop_QNarrowBin64Uto32Ux4:
3116          return Iop_NarrowBin64to32x4;
3117 case Iop_QNarrowBin32Sto16Ux8:
3118 case Iop_QNarrowBin32Sto16Sx8:
3119 case Iop_QNarrowBin32Uto16Ux8:
3120 return Iop_NarrowBin32to16x8;
3121 /* Binary: (64, 64) -> 64 */
3122 case Iop_QNarrowBin32Sto16Sx4:
3123 return Iop_NarrowBin32to16x4;
3124 case Iop_QNarrowBin16Sto8Ux8:
3125 case Iop_QNarrowBin16Sto8Sx8:
3126 return Iop_NarrowBin16to8x8;
3127 /* Unary: 128 -> 64 */
3128 case Iop_QNarrowUn64Uto32Ux2:
3129 case Iop_QNarrowUn64Sto32Sx2:
3130 case Iop_QNarrowUn64Sto32Ux2:
3131 return Iop_NarrowUn64to32x2;
3132 case Iop_QNarrowUn32Uto16Ux4:
3133 case Iop_QNarrowUn32Sto16Sx4:
3134 case Iop_QNarrowUn32Sto16Ux4:
3135 case Iop_F32toF16x4_DEP:
3136 return Iop_NarrowUn32to16x4;
3137 case Iop_QNarrowUn16Uto8Ux8:
3138 case Iop_QNarrowUn16Sto8Sx8:
3139 case Iop_QNarrowUn16Sto8Ux8:
3140 return Iop_NarrowUn16to8x8;
3141 default:
3142 ppIROp(qnarrowOp);
3143          VG_(tool_panic)("vanillaNarrowingOpOfShape");
3147 static
3148 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3149 IRAtom* vatom1, IRAtom* vatom2)
3151 IRAtom *at1, *at2, *at3;
3152 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3153 switch (narrow_op) {
3154       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast64x2; break;
3155       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast64x2; break;
3156 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3157 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3158 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3159 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3160 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3161 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3162 default: VG_(tool_panic)("vectorNarrowBinV128");
3164 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3165 tl_assert(isShadowAtom(mce,vatom1));
3166 tl_assert(isShadowAtom(mce,vatom2));
3167 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3168 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3169 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3170 return at3;
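
/* Worked example (illustrative only; the helper is made up, and the
   placement of the two operands' lanes in the narrowed output is left
   to the IR definition): the per-lane effect of the scheme above for
   one operand of Iop_QNarrowBin32Sto16Sx8.  If a 32-bit source lane
   has any undefined bit, PCast32x4 smears that lane to all-ones, and
   the vanilla narrow then makes exactly the corresponding 16-bit
   output lane fully undefined; fully defined lanes stay defined. */
#if 0
static void example_qnarrow_lane_vbits ( UInt srcLaneV[4],
                                         UShort dstLaneV[4] )
{
   Int i;
   for (i = 0; i < 4; i++) {
      /* PCast32x4: smear any undefinedness across the source lane ... */
      UInt smeared = srcLaneV[i] == 0 ? 0 : 0xFFFFFFFFU;
      /* ... then the vanilla narrow keeps 16 of those (identical) bits
         for the corresponding output lane. */
      dstLaneV[i] = (UShort)smeared;
   }
}
#endif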
3173 static
3174 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3175 IRAtom* vatom1, IRAtom* vatom2)
3177 IRAtom *at1, *at2, *at3;
3178 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3179 switch (narrow_op) {
3180 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3181 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3182 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3183 default: VG_(tool_panic)("vectorNarrowBin64");
3185 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3186 tl_assert(isShadowAtom(mce,vatom1));
3187 tl_assert(isShadowAtom(mce,vatom2));
3188 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3189 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3190 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3191 return at3;
3194 static
3195 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3196 IRAtom* vatom1)
3198 IRAtom *at1, *at2;
3199 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3200 tl_assert(isShadowAtom(mce,vatom1));
3201 /* For vanilla narrowing (non-saturating), we can just apply
3202 the op directly to the V bits. */
3203 switch (narrow_op) {
3204 case Iop_NarrowUn16to8x8:
3205 case Iop_NarrowUn32to16x4:
3206 case Iop_NarrowUn64to32x2:
3207 case Iop_F32toF16x4_DEP:
3208 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3209 return at1;
3210 default:
3211 break; /* Do Plan B */
3213 /* Plan B: for ops that involve a saturation operation on the args,
3214 we must PCast before the vanilla narrow. */
3215 switch (narrow_op) {
3216 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3217 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3218 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3219 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3220 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3221 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3222 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3223 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3224 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3225 default: VG_(tool_panic)("vectorNarrowUnV128");
3227 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3228 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3229 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3230 return at2;
3233 static
3234 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3235 IRAtom* vatom1)
3237 IRAtom *at1, *at2;
3238 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3239 switch (longen_op) {
3240 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3241 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3242 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3243 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3244 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3245 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3246 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3247 default: VG_(tool_panic)("vectorWidenI64");
3249 tl_assert(isShadowAtom(mce,vatom1));
3250 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3251 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3252 return at2;
3256 /* --- --- Vector integer arithmetic --- --- */
3258 /* Simple ... UifU the args and per-lane pessimise the results. */
3260 /* --- V256-bit versions --- */
3262 static
3263 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3265 IRAtom* at;
3266 at = mkUifUV256(mce, vatom1, vatom2);
3267 at = mkPCast8x32(mce, at);
3268 return at;
3271 static
3272 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3274 IRAtom* at;
3275 at = mkUifUV256(mce, vatom1, vatom2);
3276 at = mkPCast16x16(mce, at);
3277 return at;
3280 static
3281 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3283 IRAtom* at;
3284 at = mkUifUV256(mce, vatom1, vatom2);
3285 at = mkPCast32x8(mce, at);
3286 return at;
3289 static
3290 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3292 IRAtom* at;
3293 at = mkUifUV256(mce, vatom1, vatom2);
3294 at = mkPCast64x4(mce, at);
3295 return at;
3298 /* --- V128-bit versions --- */
3300 static
3301 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3303 IRAtom* at;
3304 at = mkUifUV128(mce, vatom1, vatom2);
3305 at = mkPCast8x16(mce, at);
3306 return at;
3309 static
3310 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3312 IRAtom* at;
3313 at = mkUifUV128(mce, vatom1, vatom2);
3314 at = mkPCast16x8(mce, at);
3315 return at;
3318 static
3319 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3321 IRAtom* at;
3322 at = mkUifUV128(mce, vatom1, vatom2);
3323 at = mkPCast32x4(mce, at);
3324 return at;
3327 static
3328 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3330 IRAtom* at;
3331 at = mkUifUV128(mce, vatom1, vatom2);
3332 at = mkPCast64x2(mce, at);
3333 return at;
3336 static
3337 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3339 IRAtom* at;
3340 at = mkUifUV128(mce, vatom1, vatom2);
3341 at = mkPCast128x1(mce, at);
3342 return at;
3345 /* --- 64-bit versions --- */
3347 static
3348 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3350 IRAtom* at;
3351 at = mkUifU64(mce, vatom1, vatom2);
3352 at = mkPCast8x8(mce, at);
3353 return at;
3356 static
3357 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3359 IRAtom* at;
3360 at = mkUifU64(mce, vatom1, vatom2);
3361 at = mkPCast16x4(mce, at);
3362 return at;
3365 static
3366 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3368 IRAtom* at;
3369 at = mkUifU64(mce, vatom1, vatom2);
3370 at = mkPCast32x2(mce, at);
3371 return at;
3374 static
3375 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3377 IRAtom* at;
3378 at = mkUifU64(mce, vatom1, vatom2);
3379 at = mkPCastTo(mce, Ity_I64, at);
3380 return at;
3383 /* --- 32-bit versions --- */
3385 static
3386 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3388 IRAtom* at;
3389 at = mkUifU32(mce, vatom1, vatom2);
3390 at = mkPCast8x4(mce, at);
3391 return at;
3394 static
3395 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3397 IRAtom* at;
3398 at = mkUifU32(mce, vatom1, vatom2);
3399 at = mkPCast16x2(mce, at);
3400 return at;
3404 /*------------------------------------------------------------*/
3405 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3406 /*------------------------------------------------------------*/
3408 static
3409 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3410 IROp op,
3411 IRAtom* atom1, IRAtom* atom2,
3412 IRAtom* atom3, IRAtom* atom4 )
3414 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3415 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3416 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3417 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3419 tl_assert(isOriginalAtom(mce,atom1));
3420 tl_assert(isOriginalAtom(mce,atom2));
3421 tl_assert(isOriginalAtom(mce,atom3));
3422 tl_assert(isOriginalAtom(mce,atom4));
3423 tl_assert(isShadowAtom(mce,vatom1));
3424 tl_assert(isShadowAtom(mce,vatom2));
3425 tl_assert(isShadowAtom(mce,vatom3));
3426 tl_assert(isShadowAtom(mce,vatom4));
3427 tl_assert(sameKindedAtoms(atom1,vatom1));
3428 tl_assert(sameKindedAtoms(atom2,vatom2));
3429 tl_assert(sameKindedAtoms(atom3,vatom3));
3430 tl_assert(sameKindedAtoms(atom4,vatom4));
3431 switch (op) {
3432 case Iop_MAddF64:
3433 case Iop_MAddF64r32:
3434 case Iop_MSubF64:
3435 case Iop_MSubF64r32:
3436 /* I32(rm) x F64 x F64 x F64 -> F64 */
3437 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3439 case Iop_MAddF32:
3440 case Iop_MSubF32:
3441 /* I32(rm) x F32 x F32 x F32 -> F32 */
3442 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3444 case Iop_MAddF128:
3445 case Iop_MSubF128:
3446 case Iop_NegMAddF128:
3447 case Iop_NegMSubF128:
3448 /* I32(rm) x F128 x F128 x F128 -> F128 */
3449 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3451 /* V256-bit data-steering */
3452 case Iop_64x4toV256:
3453 return assignNew('V', mce, Ity_V256,
3454 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3456 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3457 case Iop_Rotx32:
3458 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3459 case Iop_Rotx64:
3460 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3461 default:
3462 ppIROp(op);
3463 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3468 static
3469 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3470 IROp op,
3471 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3473 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3474 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3475 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3477 tl_assert(isOriginalAtom(mce,atom1));
3478 tl_assert(isOriginalAtom(mce,atom2));
3479 tl_assert(isOriginalAtom(mce,atom3));
3480 tl_assert(isShadowAtom(mce,vatom1));
3481 tl_assert(isShadowAtom(mce,vatom2));
3482 tl_assert(isShadowAtom(mce,vatom3));
3483 tl_assert(sameKindedAtoms(atom1,vatom1));
3484 tl_assert(sameKindedAtoms(atom2,vatom2));
3485 tl_assert(sameKindedAtoms(atom3,vatom3));
3486 switch (op) {
3487 case Iop_AddF128:
3488 case Iop_SubF128:
3489 case Iop_MulF128:
3490 case Iop_DivF128:
3491 case Iop_AddD128:
3492 case Iop_SubD128:
3493 case Iop_MulD128:
3494 case Iop_DivD128:
3495 case Iop_QuantizeD128:
3496 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3497 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3498 case Iop_AddF64:
3499 case Iop_AddD64:
3500 case Iop_AddF64r32:
3501 case Iop_SubF64:
3502 case Iop_SubD64:
3503 case Iop_SubF64r32:
3504 case Iop_MulF64:
3505 case Iop_MulD64:
3506 case Iop_MulF64r32:
3507 case Iop_DivF64:
3508 case Iop_DivD64:
3509 case Iop_DivF64r32:
3510 case Iop_ScaleF64:
3511 case Iop_Yl2xF64:
3512 case Iop_Yl2xp1F64:
3513 case Iop_AtanF64:
3514 case Iop_PRemF64:
3515 case Iop_PRem1F64:
3516 case Iop_QuantizeD64:
3517 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3518 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3519 case Iop_PRemC3210F64:
3520 case Iop_PRem1C3210F64:
3521 /* I32(rm) x F64 x F64 -> I32 */
3522 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3523 case Iop_AddF32:
3524 case Iop_SubF32:
3525 case Iop_MulF32:
3526 case Iop_DivF32:
3527 /* I32(rm) x F32 x F32 -> F32 */
3528 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3529 case Iop_AddF16:
3530 case Iop_SubF16:
3531 /* I32(rm) x F16 x F16 -> F16 */
3532 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3533 case Iop_SignificanceRoundD64:
3534 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3535 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3536 case Iop_SignificanceRoundD128:
3537 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3538 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3539 case Iop_SliceV128:
3540 /* (V128, V128, I8) -> V128 */
3541 complainIfUndefined(mce, atom3, NULL);
3542 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3543 case Iop_Slice64:
3544 /* (I64, I64, I8) -> I64 */
3545 complainIfUndefined(mce, atom3, NULL);
3546 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3547 case Iop_SetElem8x8:
3548 case Iop_SetElem16x4:
3549 case Iop_SetElem32x2:
3550 complainIfUndefined(mce, atom2, NULL);
3551 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3553 case Iop_SetElem8x16:
3554 case Iop_SetElem16x8:
3555 case Iop_SetElem32x4:
3556 case Iop_SetElem64x2:
3557 complainIfUndefined(mce, atom2, NULL);
3558 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3560 /* Int 128-bit Integer three arg */
3561 case Iop_2xMultU64Add128CarryOut:
3562 case Iop_Perm8x16x2:
3563 /* (V128, V128, V128) -> V128 */
3564 complainIfUndefined(mce, atom3, NULL);
3565 return mkUifUV128(
3566 mce,
3567 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3568 mkPCast8x16(mce, vatom3)
3569 );
3571 /* Vector FP with rounding mode as the first arg */
3572 case Iop_Add64Fx2:
3573 case Iop_Sub64Fx2:
3574 case Iop_Mul64Fx2:
3575 case Iop_Div64Fx2:
3576 case Iop_Scale2_64Fx2:
3577 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3579 case Iop_Add32Fx4:
3580 case Iop_Sub32Fx4:
3581 case Iop_Mul32Fx4:
3582 case Iop_Div32Fx4:
3583 case Iop_Scale2_32Fx4:
3584 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3586 case Iop_Add64Fx4:
3587 case Iop_Sub64Fx4:
3588 case Iop_Mul64Fx4:
3589 case Iop_Div64Fx4:
3590 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3592 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3593 IR is implemented.
3594 */
3595 case Iop_Add16Fx8:
3596 case Iop_Sub16Fx8:
3597 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3599 case Iop_Add32Fx8:
3600 case Iop_Sub32Fx8:
3601 case Iop_Mul32Fx8:
3602 case Iop_Div32Fx8:
3603 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3605 case Iop_F32x4_2toQ16x8:
3606 return assignNew('V', mce, Ity_V128,
3607 binop(Iop_PackEvenLanes16x8,
3608 unary32Fx4_w_rm(mce, vatom1, vatom2),
3609 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3610 case Iop_F64x2_2toQ32x4:
3611 return assignNew('V', mce, Ity_V128,
3612 binop(Iop_PackEvenLanes32x4,
3613 unary64Fx2_w_rm(mce, vatom1, vatom2),
3614 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3616 default:
3617 ppIROp(op);
3618 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3623 static
3624 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3625 IROp op,
3626 IRAtom* atom1, IRAtom* atom2,
3627 HowUsed hu/*use HuOth if unknown*/ )
3629 IRType and_or_ty = Ity_INVALID;
3630 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3631 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3632 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3634 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3635 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3637 tl_assert(isOriginalAtom(mce,atom1));
3638 tl_assert(isOriginalAtom(mce,atom2));
3639 tl_assert(isShadowAtom(mce,vatom1));
3640 tl_assert(isShadowAtom(mce,vatom2));
3641 tl_assert(sameKindedAtoms(atom1,vatom1));
3642 tl_assert(sameKindedAtoms(atom2,vatom2));
3643 switch (op) {
3645 /* 32-bit SIMD */
3647 case Iop_Add16x2:
3648 case Iop_HAdd16Ux2:
3649 case Iop_HAdd16Sx2:
3650 case Iop_Sub16x2:
3651 case Iop_HSub16Ux2:
3652 case Iop_HSub16Sx2:
3653 case Iop_QAdd16Sx2:
3654 case Iop_QSub16Sx2:
3655 case Iop_QSub16Ux2:
3656 case Iop_QAdd16Ux2:
3657 return binary16Ix2(mce, vatom1, vatom2);
3659 case Iop_Add8x4:
3660 case Iop_HAdd8Ux4:
3661 case Iop_HAdd8Sx4:
3662 case Iop_Sub8x4:
3663 case Iop_HSub8Ux4:
3664 case Iop_HSub8Sx4:
3665 case Iop_QSub8Ux4:
3666 case Iop_QAdd8Ux4:
3667 case Iop_QSub8Sx4:
3668 case Iop_QAdd8Sx4:
3669 return binary8Ix4(mce, vatom1, vatom2);
3671 /* 64-bit SIMD */
3673 case Iop_ShrN8x8:
3674 case Iop_ShrN16x4:
3675 case Iop_ShrN32x2:
3676 case Iop_SarN8x8:
3677 case Iop_SarN16x4:
3678 case Iop_SarN32x2:
3679 case Iop_ShlN16x4:
3680 case Iop_ShlN32x2:
3681 case Iop_ShlN8x8:
3682 /* Same scheme as with all other shifts. */
3683 complainIfUndefined(mce, atom2, NULL);
3684 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3686 case Iop_QNarrowBin32Sto16Sx4:
3687 case Iop_QNarrowBin16Sto8Sx8:
3688 case Iop_QNarrowBin16Sto8Ux8:
3689 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3691 case Iop_Min8Ux8:
3692 case Iop_Min8Sx8:
3693 case Iop_Max8Ux8:
3694 case Iop_Max8Sx8:
3695 case Iop_Avg8Ux8:
3696 case Iop_QSub8Sx8:
3697 case Iop_QSub8Ux8:
3698 case Iop_Sub8x8:
3699 case Iop_CmpGT8Sx8:
3700 case Iop_CmpGT8Ux8:
3701 case Iop_CmpEQ8x8:
3702 case Iop_QAdd8Sx8:
3703 case Iop_QAdd8Ux8:
3704 case Iop_QSal8x8:
3705 case Iop_QShl8x8:
3706 case Iop_Add8x8:
3707 case Iop_Mul8x8:
3708 case Iop_PolynomialMul8x8:
3709 return binary8Ix8(mce, vatom1, vatom2);
3711 case Iop_Min16Sx4:
3712 case Iop_Min16Ux4:
3713 case Iop_Max16Sx4:
3714 case Iop_Max16Ux4:
3715 case Iop_Avg16Ux4:
3716 case Iop_QSub16Ux4:
3717 case Iop_QSub16Sx4:
3718 case Iop_Sub16x4:
3719 case Iop_Mul16x4:
3720 case Iop_MulHi16Sx4:
3721 case Iop_MulHi16Ux4:
3722 case Iop_CmpGT16Sx4:
3723 case Iop_CmpGT16Ux4:
3724 case Iop_CmpEQ16x4:
3725 case Iop_QAdd16Sx4:
3726 case Iop_QAdd16Ux4:
3727 case Iop_QSal16x4:
3728 case Iop_QShl16x4:
3729 case Iop_Add16x4:
3730 case Iop_QDMulHi16Sx4:
3731 case Iop_QRDMulHi16Sx4:
3732 return binary16Ix4(mce, vatom1, vatom2);
3734 case Iop_Sub32x2:
3735 case Iop_Mul32x2:
3736 case Iop_Max32Sx2:
3737 case Iop_Max32Ux2:
3738 case Iop_Min32Sx2:
3739 case Iop_Min32Ux2:
3740 case Iop_CmpGT32Sx2:
3741 case Iop_CmpGT32Ux2:
3742 case Iop_CmpEQ32x2:
3743 case Iop_Add32x2:
3744 case Iop_QAdd32Ux2:
3745 case Iop_QAdd32Sx2:
3746 case Iop_QSub32Ux2:
3747 case Iop_QSub32Sx2:
3748 case Iop_QSal32x2:
3749 case Iop_QShl32x2:
3750 case Iop_QDMulHi32Sx2:
3751 case Iop_QRDMulHi32Sx2:
3752 return binary32Ix2(mce, vatom1, vatom2);
3754 case Iop_QSub64Ux1:
3755 case Iop_QSub64Sx1:
3756 case Iop_QAdd64Ux1:
3757 case Iop_QAdd64Sx1:
3758 case Iop_QSal64x1:
3759 case Iop_QShl64x1:
3760 case Iop_Sal64x1:
3761 return binary64Ix1(mce, vatom1, vatom2);
3763 case Iop_QShlNsatSU8x8:
3764 case Iop_QShlNsatUU8x8:
3765 case Iop_QShlNsatSS8x8:
3766 complainIfUndefined(mce, atom2, NULL);
3767 return mkPCast8x8(mce, vatom1);
3769 case Iop_QShlNsatSU16x4:
3770 case Iop_QShlNsatUU16x4:
3771 case Iop_QShlNsatSS16x4:
3772 complainIfUndefined(mce, atom2, NULL);
3773 return mkPCast16x4(mce, vatom1);
3775 case Iop_QShlNsatSU32x2:
3776 case Iop_QShlNsatUU32x2:
3777 case Iop_QShlNsatSS32x2:
3778 complainIfUndefined(mce, atom2, NULL);
3779 return mkPCast32x2(mce, vatom1);
3781 case Iop_QShlNsatSU64x1:
3782 case Iop_QShlNsatUU64x1:
3783 case Iop_QShlNsatSS64x1:
3784 complainIfUndefined(mce, atom2, NULL);
3785 return mkPCast32x2(mce, vatom1);
3787 case Iop_PwMax32Sx2:
3788 case Iop_PwMax32Ux2:
3789 case Iop_PwMin32Sx2:
3790 case Iop_PwMin32Ux2:
3791 case Iop_PwMax32Fx2:
3792 case Iop_PwMin32Fx2:
3793 return assignNew('V', mce, Ity_I64,
3794 binop(Iop_PwMax32Ux2,
3795 mkPCast32x2(mce, vatom1),
3796 mkPCast32x2(mce, vatom2)));
3798 case Iop_PwMax16Sx4:
3799 case Iop_PwMax16Ux4:
3800 case Iop_PwMin16Sx4:
3801 case Iop_PwMin16Ux4:
3802 return assignNew('V', mce, Ity_I64,
3803 binop(Iop_PwMax16Ux4,
3804 mkPCast16x4(mce, vatom1),
3805 mkPCast16x4(mce, vatom2)));
3807 case Iop_PwMax8Sx8:
3808 case Iop_PwMax8Ux8:
3809 case Iop_PwMin8Sx8:
3810 case Iop_PwMin8Ux8:
3811 return assignNew('V', mce, Ity_I64,
3812 binop(Iop_PwMax8Ux8,
3813 mkPCast8x8(mce, vatom1),
3814 mkPCast8x8(mce, vatom2)));
3816 case Iop_PwAdd32x2:
3817 case Iop_PwAdd32Fx2:
3818 return mkPCast32x2(mce,
3819 assignNew('V', mce, Ity_I64,
3820 binop(Iop_PwAdd32x2,
3821 mkPCast32x2(mce, vatom1),
3822 mkPCast32x2(mce, vatom2))));
3824 case Iop_PwAdd16x4:
3825 return mkPCast16x4(mce,
3826 assignNew('V', mce, Ity_I64,
3827 binop(op, mkPCast16x4(mce, vatom1),
3828 mkPCast16x4(mce, vatom2))));
3830 case Iop_PwAdd8x8:
3831 return mkPCast8x8(mce,
3832 assignNew('V', mce, Ity_I64,
3833 binop(op, mkPCast8x8(mce, vatom1),
3834 mkPCast8x8(mce, vatom2))));
3836 case Iop_Shl8x8:
3837 case Iop_Shr8x8:
3838 case Iop_Sar8x8:
3839 case Iop_Sal8x8:
3840 return mkUifU64(mce,
3841 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3842 mkPCast8x8(mce,vatom2)
3843 );
3845 case Iop_Shl16x4:
3846 case Iop_Shr16x4:
3847 case Iop_Sar16x4:
3848 case Iop_Sal16x4:
3849 return mkUifU64(mce,
3850 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3851 mkPCast16x4(mce,vatom2)
3852 );
3854 case Iop_Shl32x2:
3855 case Iop_Shr32x2:
3856 case Iop_Sar32x2:
3857 case Iop_Sal32x2:
3858 return mkUifU64(mce,
3859 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3860 mkPCast32x2(mce,vatom2)
3861 );
3863 /* 64-bit data-steering */
3864 case Iop_InterleaveLO32x2:
3865 case Iop_InterleaveLO16x4:
3866 case Iop_InterleaveLO8x8:
3867 case Iop_InterleaveHI32x2:
3868 case Iop_InterleaveHI16x4:
3869 case Iop_InterleaveHI8x8:
3870 case Iop_CatOddLanes8x8:
3871 case Iop_CatEvenLanes8x8:
3872 case Iop_CatOddLanes16x4:
3873 case Iop_CatEvenLanes16x4:
3874 case Iop_InterleaveOddLanes8x8:
3875 case Iop_InterleaveEvenLanes8x8:
3876 case Iop_InterleaveOddLanes16x4:
3877 case Iop_InterleaveEvenLanes16x4:
3878 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3880 case Iop_GetElem8x8:
3881 complainIfUndefined(mce, atom2, NULL);
3882 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3883 case Iop_GetElem16x4:
3884 complainIfUndefined(mce, atom2, NULL);
3885 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3886 case Iop_GetElem32x2:
3887 complainIfUndefined(mce, atom2, NULL);
3888 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3890 /* Perm8x8: rearrange values in left arg using steering values from
3891 right arg. So rearrange the vbits in the same way but pessimise wrt
3892 steering values. We assume that unused bits in the steering value
3893 are defined zeros, so we can safely PCast within each lane of the
3894 steering value without having to take precautions to avoid a
3895 dependency on those unused bits.
3897 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3898 each lane, if bit 7 of the steering value is zero, then we'll steer
3899 the shadow value exactly as per Perm8x8. If that bit is one, then
3900 the operation will set the resulting (concrete) value to zero. That
3901 means it is defined, and should have a shadow value of zero. Hence
3902 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3903 as Perm8x8) and then pessimise against the steering values. */
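/* As a stand-alone sketch of that scheme on concrete 64-bit shadow
   words (illustration only; the case below builds the same thing in
   IR):

      // 1-bits mean "undefined".
      static UChar pcast8 ( UChar v ) { return v == 0 ? 0x00 : 0xFF; }

      static ULong perm8x8_shadow ( ULong steeredVbits,  // Perm8x8 applied to
                                                         // the data arg vbits
                                    ULong steeringVbits )
      {
         ULong pess = 0;
         Int   i;
         for (i = 0; i < 8; i++)
            pess |= (ULong)pcast8((steeringVbits >> (8*i)) & 0xFF) << (8*i);
         return steeredVbits | pess;                     // UifU
      }

   A lane whose steering byte is even partially undefined comes out
   fully undefined; fully defined steering bytes leave the steered
   shadow untouched. */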
3904 case Iop_Perm8x8:
3905 case Iop_PermOrZero8x8:
3906 return mkUifU64(
3907 mce,
3908 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3909 mkPCast8x8(mce, vatom2)
3910 );
3912 /* V128-bit SIMD */
3914 case Iop_I32StoF32x4:
3915 case Iop_F32toI32Sx4:
3916 case Iop_Sqrt16Fx8:
3917 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3918 case Iop_Sqrt32Fx4:
3919 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3920 case Iop_Sqrt64Fx2:
3921 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3923 case Iop_ShrN8x16:
3924 case Iop_ShrN16x8:
3925 case Iop_ShrN32x4:
3926 case Iop_ShrN64x2:
3927 case Iop_SarN8x16:
3928 case Iop_SarN16x8:
3929 case Iop_SarN32x4:
3930 case Iop_SarN64x2:
3931 case Iop_ShlN8x16:
3932 case Iop_ShlN16x8:
3933 case Iop_ShlN32x4:
3934 case Iop_ShlN64x2:
3935 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3936 this is wrong now, scalar shifts are done properly lazily.
3937 Vector shifts should be fixed too. */
3938 complainIfUndefined(mce, atom2, NULL);
3939 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
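/* Put differently (a model on concrete values, not the helpers used
   above): the shift amount is eagerly checked, and then the V bits
   are shifted by the same concrete amount, so undefinedness travels
   with the data and the freshly shifted-in bits are defined, just as
   the shifted-in zeroes are defined in the real computation:

      // 1-bits mean "undefined"; amt has already been checked.
      static ULong shlN_shadow ( ULong vbits, UInt amt )
      {
         return vbits << amt;
      }
*/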
3941 /* V x V shifts/rotates are done using the standard lazy scheme. */
3942 /* For the non-rounding variants of bi-di vector x vector
3943 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3944 But note that this is overly pessimistic, because in fact only
3945 the bottom 8 bits of each lane of the second argument are taken
3946 into account when shifting. So really we ought to ignore
3947 undefinedness in bits 8 and above of each lane in the
3948 second argument. */
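/* Roughly, per 8-bit lane (a stand-alone model; the cases below build
   this in IR):

      // 1-bits mean "undefined"; shift is the concrete shift amount.
      static UChar sh8_shadow ( UChar vData, UChar vShift, UChar shift )
      {
         UChar shifted = (UChar)(vData << (shift & 7)); // shadow follows data
         UChar pess    = (vShift == 0) ? 0x00 : 0xFF;   // PCast8 of the
                                                        // shift-lane vbits
         return (UChar)(shifted | pess);                // UifU
      }

   hence the pessimism noted above: undefinedness anywhere in a shift
   lane, including its unused upper bits, poisons the whole output
   lane. */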
3949 case Iop_Shl8x16:
3950 case Iop_Shr8x16:
3951 case Iop_Sar8x16:
3952 case Iop_Sal8x16:
3953 case Iop_Rol8x16:
3954 case Iop_Sh8Sx16:
3955 case Iop_Sh8Ux16:
3956 return mkUifUV128(mce,
3957 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3958 mkPCast8x16(mce,vatom2)
3959 );
3961 case Iop_Shl16x8:
3962 case Iop_Shr16x8:
3963 case Iop_Sar16x8:
3964 case Iop_Sal16x8:
3965 case Iop_Rol16x8:
3966 case Iop_Sh16Sx8:
3967 case Iop_Sh16Ux8:
3968 return mkUifUV128(mce,
3969 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3970 mkPCast16x8(mce,vatom2)
3971 );
3973 case Iop_Shl32x4:
3974 case Iop_Shr32x4:
3975 case Iop_Sar32x4:
3976 case Iop_Sal32x4:
3977 case Iop_Rol32x4:
3978 case Iop_Sh32Sx4:
3979 case Iop_Sh32Ux4:
3980 return mkUifUV128(mce,
3981 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3982 mkPCast32x4(mce,vatom2)
3983 );
3985 case Iop_Shl64x2:
3986 case Iop_Shr64x2:
3987 case Iop_Sar64x2:
3988 case Iop_Sal64x2:
3989 case Iop_Rol64x2:
3990 case Iop_Sh64Sx2:
3991 case Iop_Sh64Ux2:
3992 return mkUifUV128(mce,
3993 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3994 mkPCast64x2(mce,vatom2)
3995 );
3997 /* For the rounding variants of bi-di vector x vector shifts, the
3998 rounding adjustment can cause undefinedness to propagate through
3999 the entire lane, in the worst case. Too complex to handle
4000 properly .. just UifU the arguments and then PCast them.
4001 Suboptimal but safe. */
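/* binary8Ix16 and friends, as a per-lane model (the helpers build the
   same thing in IR):

      // 1-bits mean "undefined".
      static UChar binary8_shadow ( UChar v1, UChar v2 )
      {
         UChar u = v1 | v2;               // UifU
         return u == 0 ? 0x00 : 0xFF;     // PCast8: any doubt in either
                                          // operand poisons the lane
      }

   Every output lane is therefore either fully defined or fully
   undefined. */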
4002 case Iop_Rsh8Sx16:
4003 case Iop_Rsh8Ux16:
4004 return binary8Ix16(mce, vatom1, vatom2);
4005 case Iop_Rsh16Sx8:
4006 case Iop_Rsh16Ux8:
4007 return binary16Ix8(mce, vatom1, vatom2);
4008 case Iop_Rsh32Sx4:
4009 case Iop_Rsh32Ux4:
4010 return binary32Ix4(mce, vatom1, vatom2);
4011 case Iop_Rsh64Sx2:
4012 case Iop_Rsh64Ux2:
4013 return binary64Ix2(mce, vatom1, vatom2);
4015 case Iop_F32ToFixed32Ux4_RZ:
4016 case Iop_F32ToFixed32Sx4_RZ:
4017 case Iop_Fixed32UToF32x4_RN:
4018 case Iop_Fixed32SToF32x4_RN:
4019 complainIfUndefined(mce, atom2, NULL);
4020 return mkPCast32x4(mce, vatom1);
4022 case Iop_F32ToFixed32Ux2_RZ:
4023 case Iop_F32ToFixed32Sx2_RZ:
4024 case Iop_Fixed32UToF32x2_RN:
4025 case Iop_Fixed32SToF32x2_RN:
4026 complainIfUndefined(mce, atom2, NULL);
4027 return mkPCast32x2(mce, vatom1);
4029 case Iop_QSub8Ux16:
4030 case Iop_QSub8Sx16:
4031 case Iop_Sub8x16:
4032 case Iop_Min8Ux16:
4033 case Iop_Min8Sx16:
4034 case Iop_Max8Ux16:
4035 case Iop_Max8Sx16:
4036 case Iop_CmpEQ8x16:
4037 case Iop_Avg8Ux16:
4038 case Iop_Avg8Sx16:
4039 case Iop_QAdd8Ux16:
4040 case Iop_QAdd8Sx16:
4041 case Iop_QAddExtUSsatSS8x16:
4042 case Iop_QAddExtSUsatUU8x16:
4043 case Iop_QSal8x16:
4044 case Iop_QShl8x16:
4045 case Iop_Add8x16:
4046 case Iop_Mul8x16:
4047 case Iop_MulHi8Sx16:
4048 case Iop_MulHi8Ux16:
4049 case Iop_PolynomialMul8x16:
4050 case Iop_PolynomialMulAdd8x16:
4051 return binary8Ix16(mce, vatom1, vatom2);
4053 case Iop_QSub16Ux8:
4054 case Iop_QSub16Sx8:
4055 case Iop_Sub16x8:
4056 case Iop_Mul16x8:
4057 case Iop_MulHi16Sx8:
4058 case Iop_MulHi16Ux8:
4059 case Iop_Min16Sx8:
4060 case Iop_Min16Ux8:
4061 case Iop_Max16Sx8:
4062 case Iop_Max16Ux8:
4063 case Iop_CmpEQ16x8:
4064 case Iop_Avg16Ux8:
4065 case Iop_Avg16Sx8:
4066 case Iop_QAdd16Ux8:
4067 case Iop_QAdd16Sx8:
4068 case Iop_QAddExtUSsatSS16x8:
4069 case Iop_QAddExtSUsatUU16x8:
4070 case Iop_QSal16x8:
4071 case Iop_QShl16x8:
4072 case Iop_Add16x8:
4073 case Iop_QDMulHi16Sx8:
4074 case Iop_QRDMulHi16Sx8:
4075 case Iop_PolynomialMulAdd16x8:
4076 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
4077 16-bit chunk of the output is formed from corresponding 16-bit chunks
4078 of the input args, so we can treat it like any other binary 16x8
4079 operation. That's despite it having '8x16' in its name. */
4080 case Iop_PwExtUSMulQAdd8x16:
4081 return binary16Ix8(mce, vatom1, vatom2);
4083 case Iop_CmpGT64Sx2:
4084 case Iop_CmpGT64Ux2:
4085 case Iop_CmpGT32Sx4:
4086 case Iop_CmpGT32Ux4:
4087 case Iop_CmpGT16Sx8:
4088 case Iop_CmpGT16Ux8:
4089 case Iop_CmpGT8Sx16:
4090 case Iop_CmpGT8Ux16:
4091 return expensiveCmpGT(mce, op,
4092 vatom1, vatom2, atom1, atom2);
4093 case Iop_Sub32x4:
4094 case Iop_CmpEQ32x4:
4095 case Iop_QAdd32Sx4:
4096 case Iop_QAdd32Ux4:
4097 case Iop_QSub32Sx4:
4098 case Iop_QSub32Ux4:
4099 case Iop_QAddExtUSsatSS32x4:
4100 case Iop_QAddExtSUsatUU32x4:
4101 case Iop_QSal32x4:
4102 case Iop_QShl32x4:
4103 case Iop_Avg32Ux4:
4104 case Iop_Avg32Sx4:
4105 case Iop_Add32x4:
4106 case Iop_Max32Ux4:
4107 case Iop_Max32Sx4:
4108 case Iop_Min32Ux4:
4109 case Iop_Min32Sx4:
4110 case Iop_Mul32x4:
4111 case Iop_MulHi32Sx4:
4112 case Iop_MulHi32Ux4:
4113 case Iop_QDMulHi32Sx4:
4114 case Iop_QRDMulHi32Sx4:
4115 case Iop_PolynomialMulAdd32x4:
4116 return binary32Ix4(mce, vatom1, vatom2);
4118 case Iop_Sub64x2:
4119 case Iop_Add64x2:
4120 case Iop_Avg64Ux2:
4121 case Iop_Avg64Sx2:
4122 case Iop_Max64Sx2:
4123 case Iop_Max64Ux2:
4124 case Iop_Min64Sx2:
4125 case Iop_Min64Ux2:
4126 case Iop_CmpEQ64x2:
4127 case Iop_QSal64x2:
4128 case Iop_QShl64x2:
4129 case Iop_QAdd64Ux2:
4130 case Iop_QAdd64Sx2:
4131 case Iop_QSub64Ux2:
4132 case Iop_QSub64Sx2:
4133 case Iop_QAddExtUSsatSS64x2:
4134 case Iop_QAddExtSUsatUU64x2:
4135 case Iop_PolynomialMulAdd64x2:
4136 case Iop_CipherV128:
4137 case Iop_CipherLV128:
4138 case Iop_NCipherV128:
4139 case Iop_NCipherLV128:
4140 case Iop_MulI128by10E:
4141 case Iop_MulI128by10ECarry:
4142 return binary64Ix2(mce, vatom1, vatom2);
4144 case Iop_Add128x1:
4145 case Iop_Sub128x1:
4146 case Iop_CmpNEZ128x1:
4147 return binary128Ix1(mce, vatom1, vatom2);
4149 case Iop_DivU128:
4150 case Iop_DivS128:
4151 case Iop_DivU128E:
4152 case Iop_DivS128E:
4153 case Iop_ModU128:
4154 case Iop_ModS128:
4155 /* I128 x I128 -> I128 */
4156 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4158 case Iop_QNarrowBin64Sto32Sx4:
4159 case Iop_QNarrowBin64Uto32Ux4:
4160 case Iop_QNarrowBin32Sto16Sx8:
4161 case Iop_QNarrowBin32Uto16Ux8:
4162 case Iop_QNarrowBin32Sto16Ux8:
4163 case Iop_QNarrowBin16Sto8Sx16:
4164 case Iop_QNarrowBin16Uto8Ux16:
4165 case Iop_QNarrowBin16Sto8Ux16:
4166 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4168 case Iop_Min64Fx2:
4169 case Iop_Max64Fx2:
4170 case Iop_CmpLT64Fx2:
4171 case Iop_CmpLE64Fx2:
4172 case Iop_CmpEQ64Fx2:
4173 case Iop_CmpUN64Fx2:
4174 case Iop_RecipStep64Fx2:
4175 case Iop_RSqrtStep64Fx2:
4176 return binary64Fx2(mce, vatom1, vatom2);
4178 case Iop_CmpLT16Fx8:
4179 case Iop_CmpLE16Fx8:
4180 case Iop_CmpEQ16Fx8:
4181 return binary16Fx8(mce, vatom1, vatom2);
4183 case Iop_Sub64F0x2:
4184 case Iop_Mul64F0x2:
4185 case Iop_Min64F0x2:
4186 case Iop_Max64F0x2:
4187 case Iop_Div64F0x2:
4188 case Iop_CmpLT64F0x2:
4189 case Iop_CmpLE64F0x2:
4190 case Iop_CmpEQ64F0x2:
4191 case Iop_CmpUN64F0x2:
4192 case Iop_Add64F0x2:
4193 return binary64F0x2(mce, vatom1, vatom2);
4195 case Iop_Min32Fx4:
4196 case Iop_Max32Fx4:
4197 case Iop_CmpLT32Fx4:
4198 case Iop_CmpLE32Fx4:
4199 case Iop_CmpEQ32Fx4:
4200 case Iop_CmpUN32Fx4:
4201 case Iop_CmpGT32Fx4:
4202 case Iop_CmpGE32Fx4:
4203 case Iop_RecipStep32Fx4:
4204 case Iop_RSqrtStep32Fx4:
4205 return binary32Fx4(mce, vatom1, vatom2);
4207 case Iop_Sub32Fx2:
4208 case Iop_Mul32Fx2:
4209 case Iop_Min32Fx2:
4210 case Iop_Max32Fx2:
4211 case Iop_CmpEQ32Fx2:
4212 case Iop_CmpGT32Fx2:
4213 case Iop_CmpGE32Fx2:
4214 case Iop_Add32Fx2:
4215 case Iop_RecipStep32Fx2:
4216 case Iop_RSqrtStep32Fx2:
4217 return binary32Fx2(mce, vatom1, vatom2);
4219 case Iop_Sub32F0x4:
4220 case Iop_Mul32F0x4:
4221 case Iop_Min32F0x4:
4222 case Iop_Max32F0x4:
4223 case Iop_Div32F0x4:
4224 case Iop_CmpLT32F0x4:
4225 case Iop_CmpLE32F0x4:
4226 case Iop_CmpEQ32F0x4:
4227 case Iop_CmpUN32F0x4:
4228 case Iop_Add32F0x4:
4229 return binary32F0x4(mce, vatom1, vatom2);
4231 case Iop_QShlNsatSU8x16:
4232 case Iop_QShlNsatUU8x16:
4233 case Iop_QShlNsatSS8x16:
4234 complainIfUndefined(mce, atom2, NULL);
4235 return mkPCast8x16(mce, vatom1);
4237 case Iop_QShlNsatSU16x8:
4238 case Iop_QShlNsatUU16x8:
4239 case Iop_QShlNsatSS16x8:
4240 complainIfUndefined(mce, atom2, NULL);
4241 return mkPCast16x8(mce, vatom1);
4243 case Iop_QShlNsatSU32x4:
4244 case Iop_QShlNsatUU32x4:
4245 case Iop_QShlNsatSS32x4:
4246 complainIfUndefined(mce, atom2, NULL);
4247 return mkPCast32x4(mce, vatom1);
4249 case Iop_QShlNsatSU64x2:
4250 case Iop_QShlNsatUU64x2:
4251 case Iop_QShlNsatSS64x2:
4252 complainIfUndefined(mce, atom2, NULL);
4253 return mkPCast32x4(mce, vatom1);
4255 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4256 To make this simpler, do the following:
4257 * complain if the shift amount (the I8) is undefined
4258 * pcast each lane at the wide width
4259 * truncate each lane to half width
4260 * pcast the resulting 64-bit value to a single bit and use
4261 that as the least significant bit of the upper half of the
4262 result. */
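/* The same recipe as a stand-alone model, for the 64-to-32 variants
   (illustration only; the code below builds it in IR, and the V128
   halves are modelled here as two ULongs):

      // 1-bits mean "undefined".
      static void qshrnarrow_shadow ( ULong vLane0, ULong vLane1,
                                      /*OUT*/ULong* vNarrowed,
                                      /*OUT*/ULong* vQ )
      {
         // pcast each lane at the wide (64-bit) width
         ULong p0 = (vLane0 == 0) ? 0 : ~0ULL;
         ULong p1 = (vLane1 == 0) ? 0 : ~0ULL;
         // truncate each lane to half width and repack (NarrowUn64to32x2)
         *vNarrowed = ((p1 & 0xFFFFFFFFULL) << 32) | (p0 & 0xFFFFFFFFULL);
         // pcast the narrowed 64-bit value down to its lsb
         *vQ = (*vNarrowed == 0) ? 0 : 1;
      }

   The final shadow is then 64HLtoV128(*vQ, *vNarrowed): the saturation
   half carries one possibly-undefined bit in its lsb and defined
   zeroes everywhere above it. */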
4263 case Iop_QandQShrNnarrow64Uto32Ux2:
4264 case Iop_QandQSarNnarrow64Sto32Sx2:
4265 case Iop_QandQSarNnarrow64Sto32Ux2:
4266 case Iop_QandQRShrNnarrow64Uto32Ux2:
4267 case Iop_QandQRSarNnarrow64Sto32Sx2:
4268 case Iop_QandQRSarNnarrow64Sto32Ux2:
4269 case Iop_QandQShrNnarrow32Uto16Ux4:
4270 case Iop_QandQSarNnarrow32Sto16Sx4:
4271 case Iop_QandQSarNnarrow32Sto16Ux4:
4272 case Iop_QandQRShrNnarrow32Uto16Ux4:
4273 case Iop_QandQRSarNnarrow32Sto16Sx4:
4274 case Iop_QandQRSarNnarrow32Sto16Ux4:
4275 case Iop_QandQShrNnarrow16Uto8Ux8:
4276 case Iop_QandQSarNnarrow16Sto8Sx8:
4277 case Iop_QandQSarNnarrow16Sto8Ux8:
4278 case Iop_QandQRShrNnarrow16Uto8Ux8:
4279 case Iop_QandQRSarNnarrow16Sto8Sx8:
4280 case Iop_QandQRSarNnarrow16Sto8Ux8:
4281 {
4282 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4283 IROp opNarrow = Iop_INVALID;
4284 switch (op) {
4285 case Iop_QandQShrNnarrow64Uto32Ux2:
4286 case Iop_QandQSarNnarrow64Sto32Sx2:
4287 case Iop_QandQSarNnarrow64Sto32Ux2:
4288 case Iop_QandQRShrNnarrow64Uto32Ux2:
4289 case Iop_QandQRSarNnarrow64Sto32Sx2:
4290 case Iop_QandQRSarNnarrow64Sto32Ux2:
4291 fnPessim = mkPCast64x2;
4292 opNarrow = Iop_NarrowUn64to32x2;
4293 break;
4294 case Iop_QandQShrNnarrow32Uto16Ux4:
4295 case Iop_QandQSarNnarrow32Sto16Sx4:
4296 case Iop_QandQSarNnarrow32Sto16Ux4:
4297 case Iop_QandQRShrNnarrow32Uto16Ux4:
4298 case Iop_QandQRSarNnarrow32Sto16Sx4:
4299 case Iop_QandQRSarNnarrow32Sto16Ux4:
4300 fnPessim = mkPCast32x4;
4301 opNarrow = Iop_NarrowUn32to16x4;
4302 break;
4303 case Iop_QandQShrNnarrow16Uto8Ux8:
4304 case Iop_QandQSarNnarrow16Sto8Sx8:
4305 case Iop_QandQSarNnarrow16Sto8Ux8:
4306 case Iop_QandQRShrNnarrow16Uto8Ux8:
4307 case Iop_QandQRSarNnarrow16Sto8Sx8:
4308 case Iop_QandQRSarNnarrow16Sto8Ux8:
4309 fnPessim = mkPCast16x8;
4310 opNarrow = Iop_NarrowUn16to8x8;
4311 break;
4312 default:
4313 tl_assert(0);
4314 }
4315 complainIfUndefined(mce, atom2, NULL);
4316 // Pessimised shift result
4317 IRAtom* shV
4318 = fnPessim(mce, vatom1);
4319 // Narrowed, pessimised shift result
4320 IRAtom* shVnarrowed
4321 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4322 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4323 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4324 // and assemble the result
4325 return assignNew('V', mce, Ity_V128,
4326 binop(Iop_64HLtoV128, qV, shVnarrowed));
4327 }
4329 case Iop_Mull32Sx2:
4330 case Iop_Mull32Ux2:
4331 case Iop_QDMull32Sx2:
4332 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4333 mkUifU64(mce, vatom1, vatom2));
4335 case Iop_Mull16Sx4:
4336 case Iop_Mull16Ux4:
4337 case Iop_QDMull16Sx4:
4338 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4339 mkUifU64(mce, vatom1, vatom2));
4341 case Iop_Mull8Sx8:
4342 case Iop_Mull8Ux8:
4343 case Iop_PolynomialMull8x8:
4344 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4345 mkUifU64(mce, vatom1, vatom2));
4347 case Iop_PwAdd32x4:
4348 return mkPCast32x4(mce,
4349 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4350 mkPCast32x4(mce, vatom2))));
4352 case Iop_PwAdd16x8:
4353 return mkPCast16x8(mce,
4354 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4355 mkPCast16x8(mce, vatom2))));
4357 case Iop_PwAdd8x16:
4358 return mkPCast8x16(mce,
4359 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4360 mkPCast8x16(mce, vatom2))));
4362 /* V128-bit data-steering */
4363 case Iop_SetV128lo32:
4364 case Iop_SetV128lo64:
4365 case Iop_64HLtoV128:
4366 case Iop_InterleaveLO64x2:
4367 case Iop_InterleaveLO32x4:
4368 case Iop_InterleaveLO16x8:
4369 case Iop_InterleaveLO8x16:
4370 case Iop_InterleaveHI64x2:
4371 case Iop_InterleaveHI32x4:
4372 case Iop_InterleaveHI16x8:
4373 case Iop_InterleaveHI8x16:
4374 case Iop_CatOddLanes8x16:
4375 case Iop_CatOddLanes16x8:
4376 case Iop_CatOddLanes32x4:
4377 case Iop_CatEvenLanes8x16:
4378 case Iop_CatEvenLanes16x8:
4379 case Iop_CatEvenLanes32x4:
4380 case Iop_InterleaveOddLanes8x16:
4381 case Iop_InterleaveOddLanes16x8:
4382 case Iop_InterleaveOddLanes32x4:
4383 case Iop_InterleaveEvenLanes8x16:
4384 case Iop_InterleaveEvenLanes16x8:
4385 case Iop_InterleaveEvenLanes32x4:
4386 case Iop_PackOddLanes8x16:
4387 case Iop_PackOddLanes16x8:
4388 case Iop_PackOddLanes32x4:
4389 case Iop_PackEvenLanes8x16:
4390 case Iop_PackEvenLanes16x8:
4391 case Iop_PackEvenLanes32x4:
4392 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4394 case Iop_GetElem8x16:
4395 complainIfUndefined(mce, atom2, NULL);
4396 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4397 case Iop_GetElem16x8:
4398 complainIfUndefined(mce, atom2, NULL);
4399 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4400 case Iop_GetElem32x4:
4401 complainIfUndefined(mce, atom2, NULL);
4402 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4403 case Iop_GetElem64x2:
4404 complainIfUndefined(mce, atom2, NULL);
4405 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4407 /* Perm8x16: rearrange values in left arg using steering values
4408 from right arg. So rearrange the vbits in the same way but
4409 pessimise wrt steering values. Perm32x4 ditto. */
4410 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4411 case Iop_Perm8x16:
4412 case Iop_PermOrZero8x16:
4413 return mkUifUV128(
4414 mce,
4415 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4416 mkPCast8x16(mce, vatom2)
4417 );
4418 case Iop_Perm32x4:
4419 return mkUifUV128(
4420 mce,
4421 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4422 mkPCast32x4(mce, vatom2)
4423 );
4425 /* These two take the lower 16-bit half of each 32-bit lane, sign/zero
4426 extend it to 32, and multiply together, producing a 32x4
4427 result (and implicitly ignoring half the operand bits). So
4428 treat it as a bunch of independent 16x8 operations, but then
4429 do 32-bit shifts left-right to copy the lower half results
4430 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4431 into the upper half of each result lane. */
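/* Worked example for one 32-bit output lane (V bits, 1 = undefined).
   After binary16Ix8 each 16-bit half is 0x0000 or 0xFFFF, but only
   the lower half actually feeds this output lane:

      lane = 0xFFFF0000   (only the unused upper half was in doubt)
             <<  16  ->  0x00000000
             >>s 16  ->  0x00000000   -- correctly ignored

      lane = 0x0000FFFF   (the half that is used was in doubt)
             <<  16  ->  0xFFFF0000
             >>s 16  ->  0xFFFFFFFF   -- whole result lane poisoned
*/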
4432 case Iop_MullEven16Ux8:
4433 case Iop_MullEven16Sx8: {
4434 IRAtom* at;
4435 at = binary16Ix8(mce,vatom1,vatom2);
4436 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4437 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4438 return at;
4441 /* Same deal as Iop_MullEven16{S,U}x8 */
4442 case Iop_MullEven8Ux16:
4443 case Iop_MullEven8Sx16: {
4444 IRAtom* at;
4445 at = binary8Ix16(mce,vatom1,vatom2);
4446 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4447 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4448 return at;
4451 /* Same deal as Iop_MullEven16{S,U}x8 */
4452 case Iop_MullEven32Ux4:
4453 case Iop_MullEven32Sx4: {
4454 IRAtom* at;
4455 at = binary32Ix4(mce,vatom1,vatom2);
4456 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4457 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4458 return at;
4461 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4462 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4463 Simply apply the same op to the V bits, since this is really no more
4464 than a data steering operation. */
4465 case Iop_NarrowBin32to16x8:
4466 case Iop_NarrowBin16to8x16:
4467 case Iop_NarrowBin64to32x4:
4468 return assignNew('V', mce, Ity_V128,
4469 binop(op, vatom1, vatom2));
4471 case Iop_ShrV128:
4472 case Iop_SarV128:
4473 case Iop_ShlV128:
4474 case Iop_I128StoBCD128:
4475 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4476 this is wrong now, scalar shifts are done properly lazily.
4477 Vector shifts should be fixed too. */
4478 complainIfUndefined(mce, atom2, NULL);
4479 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4481 case Iop_I128UtoF128: /* I128 -> F128 */
4482 case Iop_I128StoF128: /* I128 -> F128 */
4483 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4485 case Iop_BCDAdd:
4486 case Iop_BCDSub:
4487 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4489 /* SHA Iops */
4490 case Iop_SHA256:
4491 case Iop_SHA512:
4492 complainIfUndefined(mce, atom2, NULL);
4493 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4495 /* I128-bit data-steering */
4496 case Iop_64HLto128:
4497 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4499 /* V256-bit SIMD */
4501 case Iop_Max64Fx4:
4502 case Iop_Min64Fx4:
4503 return binary64Fx4(mce, vatom1, vatom2);
4505 case Iop_Max32Fx8:
4506 case Iop_Min32Fx8:
4507 return binary32Fx8(mce, vatom1, vatom2);
4509 /* V256-bit data-steering */
4510 case Iop_V128HLtoV256:
4511 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4513 /* Scalar floating point */
4515 case Iop_F32toI64S:
4516 case Iop_F32toI64U:
4517 /* I32(rm) x F32 -> I64 */
4518 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4520 case Iop_I64StoF32:
4521 /* I32(rm) x I64 -> F32 */
4522 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4524 case Iop_RoundF64toInt:
4525 case Iop_RoundF64toF32:
4526 case Iop_F64toI64S:
4527 case Iop_F64toI64U:
4528 case Iop_I64StoF64:
4529 case Iop_I64UtoF64:
4530 case Iop_SinF64:
4531 case Iop_CosF64:
4532 case Iop_TanF64:
4533 case Iop_2xm1F64:
4534 case Iop_SqrtF64:
4535 case Iop_RecpExpF64:
4536 /* I32(rm) x I64/F64 -> I64/F64 */
4537 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4539 case Iop_ShlD64:
4540 case Iop_ShrD64:
4541 case Iop_RoundD64toInt:
4542 /* I32(rm) x D64 -> D64 */
4543 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4545 case Iop_ShlD128:
4546 case Iop_ShrD128:
4547 case Iop_RoundD128toInt:
4548 /* I32(rm) x D128 -> D128 */
4549 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4551 case Iop_RoundF128toInt:
4552 /* I32(rm) x F128 -> F128 */
4553 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4555 case Iop_D64toI64S:
4556 case Iop_D64toI64U:
4557 case Iop_I64StoD64:
4558 case Iop_I64UtoD64:
4559 /* I32(rm) x I64/D64 -> D64/I64 */
4560 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4562 case Iop_F32toD32:
4563 case Iop_F64toD32:
4564 case Iop_F128toD32:
4565 case Iop_D32toF32:
4566 case Iop_D64toF32:
4567 case Iop_D128toF32:
4568 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4569 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4571 case Iop_F32toD64:
4572 case Iop_F64toD64:
4573 case Iop_F128toD64:
4574 case Iop_D32toF64:
4575 case Iop_D64toF64:
4576 case Iop_D128toF64:
4577 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4578 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4580 case Iop_F32toD128:
4581 case Iop_F64toD128:
4582 case Iop_F128toD128:
4583 case Iop_D32toF128:
4584 case Iop_D64toF128:
4585 case Iop_D128toF128:
4586 case Iop_I128StoD128:
4587 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4588 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4590 case Iop_SqrtF16:
4591 /* I32(rm) x F16 -> F16 */
4592 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4594 case Iop_RoundF32toInt:
4595 case Iop_SqrtF32:
4596 case Iop_RecpExpF32:
4597 /* I32(rm) x I32/F32 -> I32/F32 */
4598 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4600 case Iop_SqrtF128:
4601 /* I32(rm) x F128 -> F128 */
4602 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4604 case Iop_I32StoF32:
4605 case Iop_I32UtoF32:
4606 case Iop_F32toI32S:
4607 case Iop_F32toI32U:
4608 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4609 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4611 case Iop_F64toF16:
4612 case Iop_F32toF16:
4613 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4614 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4616 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4617 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4618 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4619 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4620 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4621 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4623 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4624 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4625 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4626 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4628 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4629 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4630 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4631 case Iop_D128toD64: /* IRRoundingMode(I64) x D128 -> D64 */
4632 case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64 */
4633 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4634 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4636 case Iop_F64HLtoF128:
4637 case Iop_D64HLtoD128:
4638 return assignNew('V', mce, Ity_I128,
4639 binop(Iop_64HLto128, vatom1, vatom2));
4641 case Iop_F64toI32U:
4642 case Iop_F64toI32S:
4643 case Iop_F64toF32:
4644 case Iop_I64UtoF32:
4645 case Iop_D64toI32U:
4646 case Iop_D64toI32S:
4647 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4648 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4650 case Iop_D64toD32:
4651 /* First arg is I32 (rounding mode), second is D64 (data). */
4652 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4654 case Iop_F64toI16S:
4655 /* First arg is I32 (rounding mode), second is F64 (data). */
4656 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4658 case Iop_InsertExpD64:
4659 /* I64 x I64 -> D64 */
4660 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4662 case Iop_InsertExpD128:
4663 /* I64 x I128 -> D128 */
4664 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4666 case Iop_CmpF16:
4667 case Iop_CmpF32:
4668 case Iop_CmpF64:
4669 case Iop_CmpF128:
4670 case Iop_CmpD64:
4671 case Iop_CmpD128:
4672 case Iop_CmpExpD64:
4673 case Iop_CmpExpD128:
4674 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4676 case Iop_MaxNumF32:
4677 case Iop_MinNumF32:
4678 /* F32 x F32 -> F32 */
4679 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4681 case Iop_MaxNumF64:
4682 case Iop_MinNumF64:
4683 /* F64 x F64 -> F64 */
4684 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4686 /* non-FP after here */
4688 case Iop_DivModU64to32:
4689 case Iop_DivModS64to32:
4690 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4692 case Iop_DivModU128to64:
4693 case Iop_DivModS128to64:
4694 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4696 case Iop_8HLto16:
4697 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4698 case Iop_16HLto32:
4699 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4700 case Iop_32HLto64:
4701 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4703 case Iop_DivModU64to64:
4704 case Iop_DivModS64to64: {
4705 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4706 return assignNew('V', mce, Ity_I128,
4707 binop(Iop_64HLto128, vTmp64, vTmp64));
4710 case Iop_MullS64:
4711 case Iop_MullU64: {
4712 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4713 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4714 return assignNew('V', mce, Ity_I128,
4715 binop(Iop_64HLto128, vHi64, vLo64));
4718 case Iop_DivModU32to32:
4719 case Iop_DivModS32to32: {
4720 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4721 return assignNew('V', mce, Ity_I64,
4722 binop(Iop_32HLto64, vTmp32, vTmp32));
4725 case Iop_MullS32:
4726 case Iop_MullU32: {
4727 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4728 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4729 return assignNew('V', mce, Ity_I64,
4730 binop(Iop_32HLto64, vHi32, vLo32));
4733 case Iop_MullS16:
4734 case Iop_MullU16: {
4735 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4736 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4737 return assignNew('V', mce, Ity_I32,
4738 binop(Iop_16HLto32, vHi16, vLo16));
4741 case Iop_MullS8:
4742 case Iop_MullU8: {
4743 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4744 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4745 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4748 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4749 case Iop_DivS32:
4750 case Iop_DivU32:
4751 case Iop_DivU32E:
4752 case Iop_DivS32E:
4753 case Iop_QAdd32S: /* could probably do better */
4754 case Iop_QSub32S: /* could probably do better */
4755 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4757 case Iop_DivS64:
4758 case Iop_DivU64:
4759 case Iop_DivS64E:
4760 case Iop_DivU64E:
4761 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4763 case Iop_Add32:
4764 if (mce->dlbo.dl_Add32 == DLexpensive
4765 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4766 return expensiveAddSub(mce,True,Ity_I32,
4767 vatom1,vatom2, atom1,atom2);
4768 } else {
4769 goto cheap_AddSub32;
4771 case Iop_Sub32:
4772 if (mce->dlbo.dl_Sub32 == DLexpensive
4773 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4774 return expensiveAddSub(mce,False,Ity_I32,
4775 vatom1,vatom2, atom1,atom2);
4776 } else {
4777 goto cheap_AddSub32;
4780 cheap_AddSub32:
4781 case Iop_Mul32:
4782 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
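/* The cheap scheme models carry propagation by "left-smearing" the
   combined undefinedness: once some bit position is in doubt, every
   bit above it in the sum is in doubt too.  Stand-alone model (not
   the mkLeft32 helper itself, which builds IR):

      // 1-bits mean "undefined".
      static UInt cheapAdd32_shadow ( UInt v1, UInt v2 )
      {
         UInt u = v1 | v2;          // UifU
         return u | (0u - u);       // Left32: u | -u smears upwards
      }

   e.g. v1|v2 == 0x00000100 gives 0xFFFFFF00 -- bits below the lowest
   undefined bit stay defined, everything at and above it is
   pessimised.  The expensiveAddSub path above is more precise about
   which result bits are actually affected, at the cost of extra IR. */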
4784 case Iop_CmpORD32S:
4785 case Iop_CmpORD32U:
4786 case Iop_CmpORD64S:
4787 case Iop_CmpORD64U:
4788 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4790 case Iop_Add64:
4791 if (mce->dlbo.dl_Add64 == DLexpensive
4792 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4793 return expensiveAddSub(mce,True,Ity_I64,
4794 vatom1,vatom2, atom1,atom2);
4795 } else {
4796 goto cheap_AddSub64;
4798 case Iop_Sub64:
4799 if (mce->dlbo.dl_Sub64 == DLexpensive
4800 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4801 return expensiveAddSub(mce,False,Ity_I64,
4802 vatom1,vatom2, atom1,atom2);
4803 } else {
4804 goto cheap_AddSub64;
4807 cheap_AddSub64:
4808 case Iop_Mul64:
4809 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4811 case Iop_Mul16:
4812 case Iop_Add16:
4813 case Iop_Sub16:
4814 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4816 case Iop_Mul8:
4817 case Iop_Sub8:
4818 case Iop_Add8:
4819 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4821 ////---- CmpXX64
4822 case Iop_CmpEQ64: case Iop_CmpNE64:
4823 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4824 goto expensive_cmp64;
4825 else
4826 goto cheap_cmp64;
4828 expensive_cmp64:
4829 case Iop_ExpCmpNE64:
4830 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4832 cheap_cmp64:
4833 case Iop_CmpLE64S: case Iop_CmpLE64U:
4834 case Iop_CmpLT64U: case Iop_CmpLT64S:
4835 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
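/* Why equality gets an "expensive" option: the cheap scheme just
   above flags the comparison as undefined as soon as either operand
   has any undefined bit, but if the operands already differ in a bit
   that is defined on both sides, the CmpEQ/CmpNE outcome is known
   regardless of the doubtful bits.  Sketch of that refinement (a
   model only; expensiveCmpEQorNE builds something along these lines
   in IR):

      // 1-bits in v1/v2 mean "undefined"; a1/a2 are the actual values.
      static Bool cmpEQ64_shadow_is_defined ( ULong a1, ULong v1,
                                              ULong a2, ULong v2 )
      {
         ULong bothDefined      = ~(v1 | v2);
         ULong definedAndDiffer = (a1 ^ a2) & bothDefined;
         return definedAndDiffer != 0     // outcome forced, hence defined
                || (v1 | v2) == 0;        // or nothing was in doubt anyway
      }
*/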
4837 ////---- CmpXX32
4838 case Iop_CmpEQ32: case Iop_CmpNE32:
4839 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4840 goto expensive_cmp32;
4841 else
4842 goto cheap_cmp32;
4844 expensive_cmp32:
4845 case Iop_ExpCmpNE32:
4846 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4848 cheap_cmp32:
4849 case Iop_CmpLE32S: case Iop_CmpLE32U:
4850 case Iop_CmpLT32U: case Iop_CmpLT32S:
4851 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4853 ////---- CmpXX16
4854 case Iop_CmpEQ16: case Iop_CmpNE16:
4855 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4856 goto expensive_cmp16;
4857 else
4858 goto cheap_cmp16;
4860 expensive_cmp16:
4861 case Iop_ExpCmpNE16:
4862 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4864 cheap_cmp16:
4865 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4867 ////---- CmpXX8
4868 case Iop_CmpEQ8: case Iop_CmpNE8:
4869 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4870 goto expensive_cmp8;
4871 else
4872 goto cheap_cmp8;
4874 expensive_cmp8:
4875 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4877 cheap_cmp8:
4878 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4880 ////---- end CmpXX{64,32,16,8}
4882 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4883 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4884 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4885 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4886 /* Just say these all produce a defined result, regardless
4887 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4888 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4890 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4891 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4893 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4894 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4896 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4897 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4899 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4900 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4902 case Iop_AndV256:
4903 uifu = mkUifUV256; difd = mkDifDV256;
4904 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4905 case Iop_AndV128:
4906 uifu = mkUifUV128; difd = mkDifDV128;
4907 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4908 case Iop_And64:
4909 uifu = mkUifU64; difd = mkDifD64;
4910 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4911 case Iop_And32:
4912 uifu = mkUifU32; difd = mkDifD32;
4913 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4914 case Iop_And16:
4915 uifu = mkUifU16; difd = mkDifD16;
4916 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4917 case Iop_And8:
4918 uifu = mkUifU8; difd = mkDifD8;
4919 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4920 case Iop_And1:
4921 uifu = mkUifU1; difd = mkDifD1;
4922 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4924 case Iop_OrV256:
4925 uifu = mkUifUV256; difd = mkDifDV256;
4926 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4927 case Iop_OrV128:
4928 uifu = mkUifUV128; difd = mkDifDV128;
4929 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4930 case Iop_Or64:
4931 uifu = mkUifU64; difd = mkDifD64;
4932 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4933 case Iop_Or32:
4934 uifu = mkUifU32; difd = mkDifD32;
4935 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4936 case Iop_Or16:
4937 uifu = mkUifU16; difd = mkDifD16;
4938 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4939 case Iop_Or8:
4940 uifu = mkUifU8; difd = mkDifD8;
4941 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4942 case Iop_Or1:
4943 uifu = mkUifU1; difd = mkDifD1;
4944 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4946 do_And_Or:
4947 return assignNew('V', mce, and_or_ty,
4948 difd(mce, uifu(mce, vatom1, vatom2),
4949 difd(mce, improve(mce, atom1, vatom1),
4950 improve(mce, atom2, vatom2) ) ) );
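/* The And/Or improvement, spelled out bitwise for And32 (a model
   only; the uifu/difd/improve callbacks above build it in IR):

      // 1-bits in v1/v2 mean "undefined"; a1/a2 are the actual values.
      static UInt and32_shadow ( UInt a1, UInt v1, UInt a2, UInt v2 )
      {
         UInt naive   = v1 | v2;                     // UifU
         UInt forced0 = (~a1 & ~v1) | (~a2 & ~v2);   // a defined 0 on either
                                                     // side forces a defined
                                                     // 0 in the output
         return naive & ~forced0;                    // DifD with the
                                                     // improvement terms
      }

   For Or the improvement term is instead a defined 1 on either side;
   Xor gets no such improvement, hence the plain UifU cases just
   below. */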
4952 case Iop_Xor8:
4953 return mkUifU8(mce, vatom1, vatom2);
4954 case Iop_Xor16:
4955 return mkUifU16(mce, vatom1, vatom2);
4956 case Iop_Xor32:
4957 return mkUifU32(mce, vatom1, vatom2);
4958 case Iop_Xor64:
4959 return mkUifU64(mce, vatom1, vatom2);
4960 case Iop_XorV128:
4961 return mkUifUV128(mce, vatom1, vatom2);
4962 case Iop_XorV256:
4963 return mkUifUV256(mce, vatom1, vatom2);
4965 /* V256-bit SIMD */
4967 case Iop_ShrN16x16:
4968 case Iop_ShrN32x8:
4969 case Iop_ShrN64x4:
4970 case Iop_SarN16x16:
4971 case Iop_SarN32x8:
4972 case Iop_ShlN16x16:
4973 case Iop_ShlN32x8:
4974 case Iop_ShlN64x4:
4975 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4976 this is wrong now, scalar shifts are done properly lazily.
4977 Vector shifts should be fixed too. */
4978 complainIfUndefined(mce, atom2, NULL);
4979 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4981 case Iop_QSub8Ux32:
4982 case Iop_QSub8Sx32:
4983 case Iop_Sub8x32:
4984 case Iop_Min8Ux32:
4985 case Iop_Min8Sx32:
4986 case Iop_Max8Ux32:
4987 case Iop_Max8Sx32:
4988 case Iop_CmpGT8Sx32:
4989 case Iop_CmpEQ8x32:
4990 case Iop_Avg8Ux32:
4991 case Iop_QAdd8Ux32:
4992 case Iop_QAdd8Sx32:
4993 case Iop_Add8x32:
4994 return binary8Ix32(mce, vatom1, vatom2);
4996 case Iop_QSub16Ux16:
4997 case Iop_QSub16Sx16:
4998 case Iop_Sub16x16:
4999 case Iop_Mul16x16:
5000 case Iop_MulHi16Sx16:
5001 case Iop_MulHi16Ux16:
5002 case Iop_Min16Sx16:
5003 case Iop_Min16Ux16:
5004 case Iop_Max16Sx16:
5005 case Iop_Max16Ux16:
5006 case Iop_CmpGT16Sx16:
5007 case Iop_CmpEQ16x16:
5008 case Iop_Avg16Ux16:
5009 case Iop_QAdd16Ux16:
5010 case Iop_QAdd16Sx16:
5011 case Iop_Add16x16:
5012 return binary16Ix16(mce, vatom1, vatom2);
5014 case Iop_Sub32x8:
5015 case Iop_CmpGT32Sx8:
5016 case Iop_CmpEQ32x8:
5017 case Iop_Add32x8:
5018 case Iop_Max32Ux8:
5019 case Iop_Max32Sx8:
5020 case Iop_Min32Ux8:
5021 case Iop_Min32Sx8:
5022 case Iop_Mul32x8:
5023 return binary32Ix8(mce, vatom1, vatom2);
5025 case Iop_Sub64x4:
5026 case Iop_Add64x4:
5027 case Iop_CmpEQ64x4:
5028 case Iop_CmpGT64Sx4:
5029 return binary64Ix4(mce, vatom1, vatom2);
5031 case Iop_I32StoF32x8:
5032 case Iop_F32toI32Sx8:
5033 return unary32Fx8_w_rm(mce, vatom1, vatom2);
5035 /* Perm32x8: rearrange values in left arg using steering values
5036 from right arg. So rearrange the vbits in the same way but
5037 pessimise wrt steering values. */
5038 case Iop_Perm32x8:
5039 return mkUifUV256(
5040 mce,
5041 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
5042 mkPCast32x8(mce, vatom2)
5043 );
5045 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
5046 Handle the shifted results in the same way that other
5047 binary Q ops are handled, eg QSub: UifU the two args,
5048 then pessimise -- which is binaryNIxM. But for the upper
5049 V128, we need to generate just 1 bit, which is the
5050 pessimised shift result, with 127 defined zeroes above it.
5052 Note that this is overly pessimistic in that in fact only the
5053 bottom 8 bits of each lane of the second arg determine the shift
5054 amount. Really we ought to ignore any undefinedness in the
5055 rest of the lanes of the second arg. */
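/* As a stand-alone sketch for the 64x2 variants (illustration only;
   the code below builds this in IR, with the V128s modelled here as
   pairs of ULongs; 1-bits mean "undefined"):

      static void qshByVector_shadow ( ULong v1Lo, ULong v1Hi,  // arg1 vbits
                                       ULong v2Lo, ULong v2Hi,  // arg2 vbits
                                       /*OUT*/ULong* vShLo, /*OUT*/ULong* vShHi,
                                       /*OUT*/ULong* vQLo,  /*OUT*/ULong* vQHi )
      {
         // binary64Ix2: UifU then per-lane PCast
         *vShLo = ((v1Lo | v2Lo) == 0) ? 0 : ~0ULL;
         *vShHi = ((v1Hi | v2Hi) == 0) ? 0 : ~0ULL;
         // upper V128 of the V256 result: 127 defined zeroes plus one
         // bit, the whole pessimised shift shadow squashed to its lsb
         *vQLo  = ((*vShLo | *vShHi) == 0) ? 0 : 1;
         *vQHi  = 0;
      }
*/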
5056 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
5057 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
5058 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
5059 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
5060 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
5061 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
5062 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
5063 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
5064 {
5065 // The function to generate the pessimised shift result
5066 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
5067 switch (op) {
5068 case Iop_QandSQsh64x2:
5069 case Iop_QandUQsh64x2:
5070 case Iop_QandSQRsh64x2:
5071 case Iop_QandUQRsh64x2:
5072 binaryNIxM = binary64Ix2;
5073 break;
5074 case Iop_QandSQsh32x4:
5075 case Iop_QandUQsh32x4:
5076 case Iop_QandSQRsh32x4:
5077 case Iop_QandUQRsh32x4:
5078 binaryNIxM = binary32Ix4;
5079 break;
5080 case Iop_QandSQsh16x8:
5081 case Iop_QandUQsh16x8:
5082 case Iop_QandSQRsh16x8:
5083 case Iop_QandUQRsh16x8:
5084 binaryNIxM = binary16Ix8;
5085 break;
5086 case Iop_QandSQsh8x16:
5087 case Iop_QandUQsh8x16:
5088 case Iop_QandSQRsh8x16:
5089 case Iop_QandUQRsh8x16:
5090 binaryNIxM = binary8Ix16;
5091 break;
5092 default:
5093 tl_assert(0);
5094 }
5095 tl_assert(binaryNIxM);
5096 // Pessimised shift result, shV[127:0]
5097 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
5098 // Generates: Def--(127)--Def PCast-to-I1(shV)
5099 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
5100 // and assemble the result
5101 return assignNew('V', mce, Ity_V256,
5102 binop(Iop_V128HLtoV256, qV, shV));
5103 }
5105 case Iop_F32toF16x4: {
5106 // First, PCast the input vector, retaining the 32x4 format.
5107 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
5108 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
5109 // the input, we're not going to lose any information.
5110 IRAtom* pcHI64
5111 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
5112 IRAtom* pcLO64
5113 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
5114 IRAtom* narrowed
5115 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5116 pcHI64, pcLO64)); // 16x4
5117 // Finally, roll in any badness from the rounding mode.
5118 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5119 return mkUifU64(mce, narrowed, rmPCasted);
5122 case Iop_F32toF16x8: {
5123 // Same scheme as for Iop_F32toF16x4.
5124 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5125 IRAtom* pcHI128
5126 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5127 pcasted)); // 32x4
5128 IRAtom* pcLO128
5129 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5130 pcasted)); // 32x4
5131 IRAtom* narrowed
5132 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5133 pcHI128, pcLO128)); // 16x8
5134 // Finally, roll in any badness from the rounding mode.
5135 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5136 return mkUifUV128(mce, narrowed, rmPCasted);
5139 default:
5140 ppIROp(op);
5141 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5146 static
5147 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5149 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5150 selection of shadow operation implicitly duplicates the logic in
5151 do_shadow_LoadG and should be kept in sync (in the very unlikely
5152 event that the interpretation of such widening ops changes in
5153 future). See comment in do_shadow_LoadG. */
5154 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5155 tl_assert(isOriginalAtom(mce,atom));
5156 switch (op) {
5158 case Iop_Abs64Fx2:
5159 case Iop_Neg64Fx2:
5160 case Iop_RSqrtEst64Fx2:
5161 case Iop_RecipEst64Fx2:
5162 case Iop_Log2_64Fx2:
5163 return unary64Fx2(mce, vatom);
5165 case Iop_Sqrt64F0x2:
5166 return unary64F0x2(mce, vatom);
5168 case Iop_Sqrt32Fx8:
5169 case Iop_RSqrtEst32Fx8:
5170 case Iop_RecipEst32Fx8:
5171 return unary32Fx8(mce, vatom);
5173 case Iop_Sqrt64Fx4:
5174 return unary64Fx4(mce, vatom);
5176 case Iop_RecipEst32Fx4:
5177 case Iop_I32UtoF32x4_DEP:
5178 case Iop_I32StoF32x4_DEP:
5179 case Iop_QF32toI32Ux4_RZ:
5180 case Iop_QF32toI32Sx4_RZ:
5181 case Iop_RoundF32x4_RM:
5182 case Iop_RoundF32x4_RP:
5183 case Iop_RoundF32x4_RN:
5184 case Iop_RoundF32x4_RZ:
5185 case Iop_RecipEst32Ux4:
5186 case Iop_Abs32Fx4:
5187 case Iop_Neg32Fx4:
5188 case Iop_RSqrtEst32Fx4:
5189 case Iop_Log2_32Fx4:
5190 case Iop_Exp2_32Fx4:
5191 return unary32Fx4(mce, vatom);
5193 case Iop_I32UtoF32x2_DEP:
5194 case Iop_I32StoF32x2_DEP:
5195 case Iop_RecipEst32Fx2:
5196 case Iop_RecipEst32Ux2:
5197 case Iop_Abs32Fx2:
5198 case Iop_Neg32Fx2:
5199 case Iop_RSqrtEst32Fx2:
5200 return unary32Fx2(mce, vatom);
5202 case Iop_Sqrt32F0x4:
5203 case Iop_RSqrtEst32F0x4:
5204 case Iop_RecipEst32F0x4:
5205 return unary32F0x4(mce, vatom);
5207 case Iop_Abs16Fx8:
5208 case Iop_Neg16Fx8:
5209 return unary16Fx8(mce, vatom);
5211 // These are self-shadowing.
5212 case Iop_32UtoV128:
5213 case Iop_64UtoV128:
5214 case Iop_Dup8x16:
5215 case Iop_Dup16x8:
5216 case Iop_Dup32x4:
5217 case Iop_Reverse1sIn8_x16:
5218 case Iop_Reverse8sIn16_x8:
5219 case Iop_Reverse8sIn32_x4:
5220 case Iop_Reverse16sIn32_x4:
5221 case Iop_Reverse8sIn64_x2:
5222 case Iop_Reverse16sIn64_x2:
5223 case Iop_Reverse32sIn64_x2:
5224 case Iop_V256toV128_1: case Iop_V256toV128_0:
5225 case Iop_ZeroHI64ofV128:
5226 case Iop_ZeroHI96ofV128:
5227 case Iop_ZeroHI112ofV128:
5228 case Iop_ZeroHI120ofV128:
5229 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5230 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5232 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5233 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5234 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5236 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5237 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5238 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5240 case Iop_NegF128:
5241 case Iop_AbsF128:
5242 case Iop_RndF128:
5243 case Iop_TruncF128toI128S: /* F128 -> I128S */
5244 case Iop_TruncF128toI128U: /* F128 -> I128U */
5245 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5246 case Iop_ReinterpI128asF128:
5247 case Iop_ReinterpF128asI128:
5248 return mkPCastTo(mce, Ity_I128, vatom);
5250 case Iop_BCD128toI128S:
5251 case Iop_MulI128by10:
5252 case Iop_MulI128by10Carry:
5253 case Iop_F16toF64x2:
5254 case Iop_F64toF16x2_DEP:
5255 // FIXME JRS 2018-Nov-15. This is surely not correct!
5256 return vatom;
5258 case Iop_ReinterpI32asF32:
5259 case Iop_ReinterpF32asI32:
5260 return assignNew('V', mce, Ity_I32, vatom);
5262 case Iop_ReinterpF64asI64:
5263 case Iop_ReinterpI64asF64:
5264 case Iop_ReinterpI64asD64:
5265 case Iop_ReinterpD64asI64:
5266 return assignNew('V', mce, Ity_I64, vatom);
5268 case Iop_I32StoF128: /* signed I32 -> F128 */
5269 case Iop_I64StoF128: /* signed I64 -> F128 */
5270 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5271 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5272 case Iop_F32toF128: /* F32 -> F128 */
5273 case Iop_F64toF128: /* F64 -> F128 */
5274 case Iop_I32StoD128: /* signed I32 -> D128 */
5275 case Iop_I64StoD128: /* signed I64 -> D128 */
5276 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5277 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5278 return mkPCastTo(mce, Ity_I128, vatom);
5280 case Iop_F16toF64:
5281 case Iop_F32toF64:
5282 case Iop_I32StoF64:
5283 case Iop_I32UtoF64:
5284 case Iop_NegF64:
5285 case Iop_AbsF64:
5286 case Iop_RSqrtEst5GoodF64:
5287 case Iop_RoundF64toF64_NEAREST:
5288 case Iop_RoundF64toF64_NegINF:
5289 case Iop_RoundF64toF64_PosINF:
5290 case Iop_RoundF64toF64_ZERO:
5291 case Iop_RoundF64toIntA0:
5292 case Iop_RoundF64toIntE:
5293 case Iop_D32toD64:
5294 case Iop_I32StoD64:
5295 case Iop_I32UtoD64:
5296 case Iop_ExtractExpD64: /* D64 -> I64 */
5297 case Iop_ExtractExpD128: /* D128 -> I64 */
5298 case Iop_ExtractSigD64: /* D64 -> I64 */
5299 case Iop_ExtractSigD128: /* D128 -> I64 */
5300 case Iop_DPBtoBCD:
5301 case Iop_BCDtoDPB:
5302 return mkPCastTo(mce, Ity_I64, vatom);
5304 case Iop_D64toD128:
5305 return mkPCastTo(mce, Ity_I128, vatom);
5307 case Iop_TruncF64asF32:
5308 case Iop_NegF32:
5309 case Iop_AbsF32:
5310 case Iop_F16toF32:
5311 case Iop_RoundF32toIntA0:
5312 case Iop_RoundF32toIntE:
5313 return mkPCastTo(mce, Ity_I32, vatom);
5315 case Iop_AbsF16:
5316 case Iop_NegF16:
5317 return mkPCastTo(mce, Ity_I16, vatom);
5319 case Iop_Ctz32: case Iop_CtzNat32:
5320 case Iop_Ctz64: case Iop_CtzNat64:
5321 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5323 case Iop_Clz32: case Iop_ClzNat32:
5324 case Iop_Clz64: case Iop_ClzNat64:
5325 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5327 // PopCount32: this is slightly pessimistic. It is true that the
5328 // result depends on all input bits, so that aspect of the PCast is
5329 // correct. However, regardless of the input, only the lowest 6 bits
5330 // of the output can ever be undefined. So we could actually
5331 // "improve" the results here by marking the top 26 bits of output as
5332 // defined. A similar comment applies for PopCount64.
5333 case Iop_PopCount32:
5334 return mkPCastTo(mce, Ity_I32, vatom);
5335 case Iop_PopCount64:
5336 return mkPCastTo(mce, Ity_I64, vatom);
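      // A hedged sketch (not compiled in) of the improvement suggested above,
      // assuming the usual Memcheck convention that a V bit of 0 means
      // "defined": after the PCast we could force the high bits of the result
      // to defined by ANDing with a small mask, e.g. for PopCount32:
      //
      //    IRAtom* pc = mkPCastTo(mce, Ity_I32, vatom);
      //    return assignNew('V', mce, Ity_I32,
      //                     binop(Iop_And32, pc, mkU32(0x3F)));
      //
      // The 0x3F mask is illustrative only; it leaves just the low 6 V bits
      // potentially undefined, matching the 0..32 result range.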
5338 // These are self-shadowing.
5339 case Iop_1Uto64:
5340 case Iop_1Sto64:
5341 case Iop_8Uto64:
5342 case Iop_8Sto64:
5343 case Iop_16Uto64:
5344 case Iop_16Sto64:
5345 case Iop_32Sto64:
5346 case Iop_32Uto64:
5347 case Iop_V128to64:
5348 case Iop_V128HIto64:
5349 case Iop_128HIto64:
5350 case Iop_128to64:
5351 case Iop_Dup8x8:
5352 case Iop_Dup16x4:
5353 case Iop_Dup32x2:
5354 case Iop_Reverse8sIn16_x4:
5355 case Iop_Reverse8sIn32_x2:
5356 case Iop_Reverse16sIn32_x2:
5357 case Iop_Reverse8sIn64_x1:
5358 case Iop_Reverse16sIn64_x1:
5359 case Iop_Reverse32sIn64_x1:
5360 case Iop_V256to64_0: case Iop_V256to64_1:
5361 case Iop_V256to64_2: case Iop_V256to64_3:
5362 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5364 // These are self-shadowing.
5365 case Iop_64to32:
5366 case Iop_64HIto32:
5367 case Iop_1Uto32:
5368 case Iop_1Sto32:
5369 case Iop_8Uto32:
5370 case Iop_16Uto32:
5371 case Iop_16Sto32:
5372 case Iop_8Sto32:
5373 case Iop_V128to32:
5374 case Iop_Reverse8sIn32_x1:
5375 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5377 // These are self-shadowing.
5378 case Iop_1Sto16:
5379 case Iop_8Sto16:
5380 case Iop_8Uto16:
5381 case Iop_32to16:
5382 case Iop_32HIto16:
5383 case Iop_64to16:
5384 case Iop_GetMSBs8x16:
5385 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5387 // These are self-shadowing.
5388 case Iop_1Uto8:
5389 case Iop_1Sto8:
5390 case Iop_16to8:
5391 case Iop_16HIto8:
5392 case Iop_32to8:
5393 case Iop_64to8:
5394 case Iop_GetMSBs8x8:
5395 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5397 case Iop_32to1:
5398 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5400 case Iop_64to1:
5401 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5403 case Iop_NotV256:
5404 case Iop_NotV128:
5405 case Iop_Not64:
5406 case Iop_Not32:
5407 case Iop_Not16:
5408 case Iop_Not8:
5409 case Iop_Not1:
5410 // FIXME JRS 2018-Nov-15. This is surely not correct!
5411 return vatom;
5413 case Iop_CmpNEZ8x8:
5414 case Iop_Cnt8x8:
5415 case Iop_Clz8x8:
5416 case Iop_Cls8x8:
5417 case Iop_Abs8x8:
5418 return mkPCast8x8(mce, vatom);
5420 case Iop_CmpNEZ8x16:
5421 case Iop_Cnt8x16:
5422 case Iop_Clz8x16:
5423 case Iop_Cls8x16:
5424 case Iop_Abs8x16:
5425 case Iop_Ctz8x16:
5426 return mkPCast8x16(mce, vatom);
5428 case Iop_CmpNEZ16x4:
5429 case Iop_Clz16x4:
5430 case Iop_Cls16x4:
5431 case Iop_Abs16x4:
5432 return mkPCast16x4(mce, vatom);
5434 case Iop_CmpNEZ16x8:
5435 case Iop_Clz16x8:
5436 case Iop_Cls16x8:
5437 case Iop_Abs16x8:
5438 case Iop_Ctz16x8:
5439 return mkPCast16x8(mce, vatom);
5441 case Iop_CmpNEZ32x2:
5442 case Iop_Clz32x2:
5443 case Iop_Cls32x2:
5444 case Iop_F32toI32Ux2_RZ:
5445 case Iop_F32toI32Sx2_RZ:
5446 case Iop_Abs32x2:
5447 return mkPCast32x2(mce, vatom);
5449 case Iop_CmpNEZ32x4:
5450 case Iop_Clz32x4:
5451 case Iop_Cls32x4:
5452 case Iop_F32toI32Ux4_RZ:
5453 case Iop_F32toI32Sx4_RZ:
5454 case Iop_Abs32x4:
5455 case Iop_RSqrtEst32Ux4:
5456 case Iop_Ctz32x4:
5457 return mkPCast32x4(mce, vatom);
5459 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5460 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5461 case Iop_CmpwNEZ32:
5462 return mkPCastTo(mce, Ity_I32, vatom);
5464 case Iop_TruncF128toI64S: /* F128 -> I64S */
5465 case Iop_TruncF128toI64U: /* F128 -> I64U */
5466 case Iop_CmpwNEZ64:
5467 return mkPCastTo(mce, Ity_I64, vatom);
5469 case Iop_CmpNEZ64x2:
5470 case Iop_CipherSV128:
5471 case Iop_Clz64x2:
5472 case Iop_Abs64x2:
5473 case Iop_Ctz64x2:
5474 return mkPCast64x2(mce, vatom);
5476 // This is self-shadowing.
5477 case Iop_PwBitMtxXpose64x2:
5478 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5480 case Iop_NarrowUn16to8x8:
5481 case Iop_NarrowUn32to16x4:
5482 case Iop_NarrowUn64to32x2:
5483 case Iop_QNarrowUn16Sto8Sx8:
5484 case Iop_QNarrowUn16Sto8Ux8:
5485 case Iop_QNarrowUn16Uto8Ux8:
5486 case Iop_QNarrowUn32Sto16Sx4:
5487 case Iop_QNarrowUn32Sto16Ux4:
5488 case Iop_QNarrowUn32Uto16Ux4:
5489 case Iop_QNarrowUn64Sto32Sx2:
5490 case Iop_QNarrowUn64Sto32Ux2:
5491 case Iop_QNarrowUn64Uto32Ux2:
5492 return vectorNarrowUnV128(mce, op, vatom);
5494 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5495 // right.
5496 case Iop_F32toF16x4_DEP:
5497 return vectorNarrowUnV128(mce, op, vatom);
5499 case Iop_Widen8Sto16x8:
5500 case Iop_Widen8Uto16x8:
5501 case Iop_Widen16Sto32x4:
5502 case Iop_Widen16Uto32x4:
5503 case Iop_Widen32Sto64x2:
5504 case Iop_Widen32Uto64x2:
5505 return vectorWidenI64(mce, op, vatom);
5507 case Iop_F16toF32x4:
5508 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5509 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5510 // generates output 32 bits with at least one 1 bit set whenever there
5511 // are one or more 1 bits set in the input 16 bits.  More
5512 // correct code for this is just below, but commented out, so as to
5513 // avoid short-term backend failures on targets that can't do
5514 // Iop_Interleave{LO,HI}16x4.
5515 return vectorWidenI64(mce, op, vatom);
5517 case Iop_F16toF32x8: {
5518 // PCast the input at 16x8. This makes each lane hold either all
5519 // zeroes or all ones.
5520 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5521 // Now double the width of each lane to 32 bits. Because the lanes are
5522 // all zeroes or all ones, we can just copy each lane twice into
5523 // the result. Here's the low half:
5524 IRAtom* widenedLO // :: I32x4
5525 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5526 pcasted, pcasted));
5527 // And the high half:
5528 IRAtom* widenedHI // :: I32x4
5529 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5530 pcasted, pcasted));
5531 // Glue them back together:
5532 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5533 widenedHI, widenedLO));
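      // Worked example for a single lane of the scheme above (a reasoning
      // aid, not a change of behaviour): if any of the 16 V bits of an input
      // F16 lane are 1 (undefined), mkPCast16x8 turns that lane into 0xFFFF;
      // interleaving the lane with itself then yields 0xFFFFFFFF in the
      // corresponding 32-bit output lane, i.e. the whole converted F32 lane
      // is marked undefined.  A fully defined input lane stays 0x0000 and
      // hence becomes 0x00000000.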
5536 // See comment just above, for Iop_F16toF32x4
5537 //case Iop_F16toF32x4: {
5538 //   // Same scheme as F16toF32x8
5539 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5540 // IRAtom* widenedLO // :: I32x2
5541 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5542 // pcasted, pcasted));
5543 //   IRAtom* widenedHI // :: I32x2
5544 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5545 // pcasted, pcasted));
5546 // // Glue them back together:
5547 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5548 // widenedHI, widenedLO));
5551 case Iop_PwAddL32Ux2:
5552 case Iop_PwAddL32Sx2:
5553 return mkPCastTo(mce, Ity_I64,
5554 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5556 case Iop_PwAddL16Ux4:
5557 case Iop_PwAddL16Sx4:
5558 return mkPCast32x2(mce,
5559 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5561 case Iop_PwAddL8Ux8:
5562 case Iop_PwAddL8Sx8:
5563 return mkPCast16x4(mce,
5564 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5566 case Iop_PwAddL32Ux4:
5567 case Iop_PwAddL32Sx4:
5568 return mkPCast64x2(mce,
5569 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5571 case Iop_PwAddL64Ux2:
5572 return mkPCast128x1(mce,
5573 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5575 case Iop_PwAddL16Ux8:
5576 case Iop_PwAddL16Sx8:
5577 return mkPCast32x4(mce,
5578 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5580 case Iop_PwAddL8Ux16:
5581 case Iop_PwAddL8Sx16:
5582 return mkPCast16x8(mce,
5583 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
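      // Note on the PwAddL cases above (a summary of the existing scheme,
      // not new behaviour): each one pessimises the source lanes to
      // all-zeroes/all-ones, runs the real pairwise-add op on those
      // pessimised lanes so that adjacent source lanes get combined, and
      // then pessimises again at the (twice as wide) destination lane size.
      // The net effect is that a result lane is marked undefined exactly
      // when either of the two source lanes feeding it contains any
      // undefined bits.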
5585 case Iop_I64UtoF32:
5586 default:
5587 ppIROp(op);
5588 VG_(tool_panic)("memcheck:expr2vbits_Unop");
5593 /* Worker function -- do not call directly. See comments on
5594 expr2vbits_Load for the meaning of |guard|.
5596 Generates IR to (1) perform a definedness test of |addr|, (2)
5597 perform a validity test of |addr|, and (3) return the Vbits for the
5598 location indicated by |addr|. All of this only happens when
5599 |guard| is NULL or |guard| evaluates to True at run time.
5601 If |guard| evaluates to False at run time, the returned value is
5602 the IR-mandated 0x55..55 value, and neither checks nor shadow loads are
5603 performed.
5605 The definedness of |guard| itself is not checked. That is assumed
5606 to have been done before this point, by the caller. */
5607 static
5608 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5609 IREndness end, IRType ty,
5610 IRAtom* addr, UInt bias, IRAtom* guard )
5612 tl_assert(isOriginalAtom(mce,addr));
5613 tl_assert(end == Iend_LE || end == Iend_BE);
5615 /* First, emit a definedness test for the address. This also sets
5616 the address (shadow) to 'defined' following the test. */
5617 complainIfUndefined( mce, addr, guard );
5619 /* Now cook up a call to the relevant helper function, to read the data V
5620 bits from shadow memory. Note that I128 loads are done by pretending
5621 we're doing a V128 load, and then converting the resulting V128 vbits
5622 word to an I128, right at the end of this function -- see `castedToI128`
5623 below. (It's only a minor hack :-) This pertains to bug 444399. */
5624 ty = shadowTypeV(ty);
5626 void* helper = NULL;
5627 const HChar* hname = NULL;
5628 Bool ret_via_outparam = False;
5630 if (end == Iend_LE) {
5631 switch (ty) {
5632 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5633 hname = "MC_(helperc_LOADV256le)";
5634 ret_via_outparam = True;
5635 break;
5636 case Ity_I128: // fallthrough. See comment above.
5637 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5638 hname = "MC_(helperc_LOADV128le)";
5639 ret_via_outparam = True;
5640 break;
5641 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5642 hname = "MC_(helperc_LOADV64le)";
5643 break;
5644 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5645 hname = "MC_(helperc_LOADV32le)";
5646 break;
5647 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5648 hname = "MC_(helperc_LOADV16le)";
5649 break;
5650 case Ity_I8: helper = &MC_(helperc_LOADV8);
5651 hname = "MC_(helperc_LOADV8)";
5652 break;
5653 default: ppIRType(ty);
5654 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5656 } else {
5657 switch (ty) {
5658 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5659 hname = "MC_(helperc_LOADV256be)";
5660 ret_via_outparam = True;
5661 break;
5662 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5663 hname = "MC_(helperc_LOADV128be)";
5664 ret_via_outparam = True;
5665 break;
5666 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5667 hname = "MC_(helperc_LOADV64be)";
5668 break;
5669 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5670 hname = "MC_(helperc_LOADV32be)";
5671 break;
5672 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5673 hname = "MC_(helperc_LOADV16be)";
5674 break;
5675 case Ity_I8: helper = &MC_(helperc_LOADV8);
5676 hname = "MC_(helperc_LOADV8)";
5677 break;
5678 default: ppIRType(ty);
5679 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5683 tl_assert(helper);
5684 tl_assert(hname);
5686 /* Generate the actual address into addrAct. */
5687 IRAtom* addrAct;
5688 if (bias == 0) {
5689 addrAct = addr;
5690 } else {
5691 IROp mkAdd;
5692 IRAtom* eBias;
5693 IRType tyAddr = mce->hWordTy;
5694 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5695 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5696 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5697 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5700 /* We need to have a place to park the V bits we're just about to
5701 read. */
5702 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5704 /* Here's the call. */
5705 IRDirty* di;
5706 if (ret_via_outparam) {
5707 di = unsafeIRDirty_1_N( datavbits,
5708 2/*regparms*/,
5709 hname, VG_(fnptr_to_fnentry)( helper ),
5710 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5711 } else {
5712 di = unsafeIRDirty_1_N( datavbits,
5713 1/*regparms*/,
5714 hname, VG_(fnptr_to_fnentry)( helper ),
5715 mkIRExprVec_1( addrAct ) );
5718 setHelperAnns( mce, di );
5719 if (guard) {
5720 di->guard = guard;
5721 /* Ideally the didn't-happen return value here would be all-ones
5722 (all-undefined), so it'd be obvious if it got used
5723 inadvertently. We can get by with the IR-mandated default
5724 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5725 undefined if it ever leaks out. */
5727 stmt( 'V', mce, IRStmt_Dirty(di) );
5729 if (ty == Ity_I128) {
5730 IRAtom* castedToI128
5731 = assignNew('V', mce, Ity_I128,
5732 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5733 return castedToI128;
5734 } else {
5735 return mkexpr(datavbits);
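/* Rough illustration of what the above produces (temp names are
   hypothetical): for a little-endian Ity_I32 load from address t5, with
   zero bias and no guard, we first get the definedness/validity check on
   t5 from complainIfUndefined, followed by a dirty call

      t_v = MC_(helperc_LOADV32le)(t5)

   and t_v is returned as the V bits of the loaded word.  For V128/V256
   (and I128, via the V128 trick described above) the helper instead
   returns its result through an IRExpr_VECRET() out-parameter. */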
5740 /* Generate IR to do a shadow load. The helper is expected to check
5741 the validity of the address and return the V bits for that address.
5742 This can optionally be controlled by a guard, which is assumed to
5743 be True if NULL. In the case where the guard is False at runtime,
5744 the helper will return the didn't-do-the-call value of 0x55..55.
5745 Since that means "completely undefined result", the caller of
5746 this function will need to fix up the result somehow in that
5747 case.
5749 Caller of this function is also expected to have checked the
5750 definedness of |guard| before this point.
5752 static
5753 IRAtom* expr2vbits_Load ( MCEnv* mce,
5754 IREndness end, IRType ty,
5755 IRAtom* addr, UInt bias,
5756 IRAtom* guard )
5758 tl_assert(end == Iend_LE || end == Iend_BE);
5759 switch (shadowTypeV(ty)) {
5760 case Ity_I8:
5761 case Ity_I16:
5762 case Ity_I32:
5763 case Ity_I64:
5764 case Ity_I128:
5765 case Ity_V128:
5766 case Ity_V256:
5767 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5768 default:
5769 VG_(tool_panic)("expr2vbits_Load");
5774 /* The most general handler for guarded loads. Assumes the
5775 definedness of GUARD has already been checked by the caller. A
5776 GUARD of NULL is assumed to mean "always True". Generates code to
5777 check the definedness and validity of ADDR.
5779 Generate IR to do a shadow load from ADDR and return the V bits.
5780 The loaded type is TY. The loaded data is then (shadow) widened by
5781 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5782 evaluates to False at run time then the returned Vbits are simply
5783 VALT instead. Note therefore that the argument type of VWIDEN must
5784 be TY and the result type of VWIDEN must equal the type of VALT.
5786 static
5787 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5788 IREndness end, IRType ty,
5789 IRAtom* addr, UInt bias,
5790 IRAtom* guard,
5791 IROp vwiden, IRAtom* valt )
5793 /* Sanity check the conversion operation, and also set TYWIDE. */
5794 IRType tyWide = Ity_INVALID;
5795 switch (vwiden) {
5796 case Iop_INVALID:
5797 tyWide = ty;
5798 break;
5799 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5800 tyWide = Ity_I32;
5801 break;
5802 default:
5803 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5806 /* If the guard evaluates to True, this will hold the loaded V bits
5807 at TY. If the guard evaluates to False, this will be all
5808 ones, meaning "all undefined", in which case we will have to
5809 replace it using an ITE below. */
5810 IRAtom* iftrue1
5811 = assignNew('V', mce, ty,
5812 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5813 /* Now (shadow-) widen the loaded V bits to the desired width. In
5814 the guard-is-False case, the allowable widening operators will
5815 in the worst case (unsigned widening) at least leave the
5816 pre-widened part as being marked all-undefined, and in the best
5817 case (signed widening) mark the whole widened result as
5818 undefined. Anyway, it doesn't matter really, since in this case
5819 we will replace said value with the default value |valt| using an
5820 ITE. */
5821 IRAtom* iftrue2
5822 = vwiden == Iop_INVALID
5823 ? iftrue1
5824 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5825 /* These are the V bits we will return if the load doesn't take
5826 place. */
5827 IRAtom* iffalse
5828 = valt;
5829 /* Prepare the cond for the ITE. Convert a NULL cond into
5830 something that iropt knows how to fold out later. */
5831 IRAtom* cond
5832 = guard == NULL ? mkU1(1) : guard;
5833 /* And assemble the final result. */
5834 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
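/* Shape of the result (a sketch, not verbatim generated IR): for a guarded
   8-bit load widened with Iop_8Uto32 and alternative V bits |valt|, the
   returned expression is essentially

      ITE(guard, 8Uto32(<shadow load of the byte>), valt)

   where the shadow load itself is already gated on |guard|, so when the
   guard is false no checks are made and no shadow memory is read. */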
5838 /* A simpler handler for guarded loads, in which there is no
5839 conversion operation, and the default V bit return (when the guard
5840 evaluates to False at runtime) is "all defined". If there is no
5841 guard expression or the guard is always TRUE this function behaves
5842 like expr2vbits_Load. It is assumed that definedness of GUARD has
5843 already been checked at the call site. */
5844 static
5845 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5846 IREndness end, IRType ty,
5847 IRAtom* addr, UInt bias,
5848 IRAtom *guard )
5850 return expr2vbits_Load_guarded_General(
5851 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5856 static
5857 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5858 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5860 IRAtom *vbitsC, *vbits0, *vbits1;
5861 IRType ty;
5862 /* Given ITE(cond, iftrue, iffalse), generate
5863 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5864 That is, steer the V bits like the originals, but trash the
5865 result if the steering value is undefined. This gives
5866 lazy propagation. */
5867 tl_assert(isOriginalAtom(mce, cond));
5868 tl_assert(isOriginalAtom(mce, iftrue));
5869 tl_assert(isOriginalAtom(mce, iffalse));
5871 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5872 vbits1 = expr2vbits(mce, iftrue, HuOth);
5873 vbits0 = expr2vbits(mce, iffalse, HuOth);
5874 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5876 return
5877 mkUifU(mce, ty, assignNew('V', mce, ty,
5878 IRExpr_ITE(cond, vbits1, vbits0)),
5879 mkPCastTo(mce, ty, vbitsC) );
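/* Concrete consequence of this scheme: if the condition's single V bit is
   1 (undefined), PCast widens it to an all-ones value of the result type
   and the UifU then forces every bit of the result to undefined, whichever
   arm was selected.  If the condition is defined, the PCast contributes
   all-zeroes and the result is simply the selected arm's V bits. */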
5882 /* --------- This is the main expression-handling function. --------- */
5884 static
5885 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5886 HowUsed hu/*use HuOth if unknown*/ )
5888 switch (e->tag) {
5890 case Iex_Get:
5891 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5893 case Iex_GetI:
5894 return shadow_GETI( mce, e->Iex.GetI.descr,
5895 e->Iex.GetI.ix, e->Iex.GetI.bias );
5897 case Iex_RdTmp:
5898 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5900 case Iex_Const:
5901 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5903 case Iex_Qop:
5904 return expr2vbits_Qop(
5905 mce,
5906 e->Iex.Qop.details->op,
5907 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5908 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5911 case Iex_Triop:
5912 return expr2vbits_Triop(
5913 mce,
5914 e->Iex.Triop.details->op,
5915 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5916 e->Iex.Triop.details->arg3
5919 case Iex_Binop:
5920 return expr2vbits_Binop(
5921 mce,
5922 e->Iex.Binop.op,
5923 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5927 case Iex_Unop:
5928 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5930 case Iex_Load:
5931 return expr2vbits_Load( mce, e->Iex.Load.end,
5932 e->Iex.Load.ty,
5933 e->Iex.Load.addr, 0/*addr bias*/,
5934 NULL/* guard == "always True"*/ );
5936 case Iex_CCall:
5937 return mkLazyN( mce, e->Iex.CCall.args,
5938 e->Iex.CCall.retty,
5939 e->Iex.CCall.cee );
5941 case Iex_ITE:
5942 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5943 e->Iex.ITE.iffalse);
5945 default:
5946 VG_(printf)("\n");
5947 ppIRExpr(e);
5948 VG_(printf)("\n");
5949 VG_(tool_panic)("memcheck: expr2vbits");
5954 /*------------------------------------------------------------*/
5955 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5956 /*------------------------------------------------------------*/
5958 /* Widen a value to the host word size. */
5960 static
5961 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5963 IRType ty, tyH;
5965 /* vatom is vbits-value and as such can only have a shadow type. */
5966 tl_assert(isShadowAtom(mce,vatom));
5968 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5969 tyH = mce->hWordTy;
5971 if (tyH == Ity_I32) {
5972 switch (ty) {
5973 case Ity_I32:
5974 return vatom;
5975 case Ity_I16:
5976 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5977 case Ity_I8:
5978 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5979 default:
5980 goto unhandled;
5982 } else
5983 if (tyH == Ity_I64) {
5984 switch (ty) {
5985 case Ity_I32:
5986 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5987 case Ity_I16:
5988 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5989 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5990 case Ity_I8:
5991 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5992 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5993 default:
5994 goto unhandled;
5996 } else {
5997 goto unhandled;
5999 unhandled:
6000 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
6001 VG_(tool_panic)("zwidenToHostWord");
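/* For instance, on a 64-bit host an Ity_I16 vbits value goes through
   Iop_16Uto32 and then Iop_32Uto64; the zero-extension marks the newly
   created high bits as defined, which is harmless because the 16-bit
   STOREV helper only consumes the low 16 bits of the widened word. */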
6005 /* Generate a shadow store. |addr| is always the original address
6006 atom. You can pass in either originals or V-bits for the data
6007 atom, but obviously not both. This function generates a check for
6008 the definedness and (indirectly) the validity of |addr|, but only
6009 when |guard| evaluates to True at run time (or is NULL).
6011 |guard| :: Ity_I1 controls whether the store really happens; NULL
6012 means it unconditionally does. Note that |guard| itself is not
6013 checked for definedness; the caller of this function must do that
6014 if necessary.
6016 static
6017 void do_shadow_Store ( MCEnv* mce,
6018 IREndness end,
6019 IRAtom* addr, UInt bias,
6020 IRAtom* data, IRAtom* vdata,
6021 IRAtom* guard )
6023 IROp mkAdd;
6024 IRType ty, tyAddr;
6025 void* helper = NULL;
6026 const HChar* hname = NULL;
6027 IRConst* c;
6029 tyAddr = mce->hWordTy;
6030 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
6031 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
6032 tl_assert( end == Iend_LE || end == Iend_BE );
6034 if (data) {
6035 tl_assert(!vdata);
6036 tl_assert(isOriginalAtom(mce, data));
6037 tl_assert(bias == 0);
6038 vdata = expr2vbits( mce, data, HuOth );
6039 } else {
6040 tl_assert(vdata);
6043 tl_assert(isOriginalAtom(mce,addr));
6044 tl_assert(isShadowAtom(mce,vdata));
6046 if (guard) {
6047 tl_assert(isOriginalAtom(mce, guard));
6048 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6051 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
6053 // If we're not doing undefined value checking, pretend that this value
6054 // is "all valid". That lets Vex's optimiser remove some of the V bit
6055 // shadow computation ops that precede it.
6056 if (MC_(clo_mc_level) == 1) {
6057 switch (ty) {
6058 case Ity_V256: // V256 weirdness -- used four times
6059 c = IRConst_V256(V_BITS32_DEFINED); break;
6060 case Ity_V128: // V128 weirdness -- used twice
6061 c = IRConst_V128(V_BITS16_DEFINED); break;
6062 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
6063 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
6064 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
6065 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
6066 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
6067 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6069 vdata = IRExpr_Const( c );
6072 /* First, emit a definedness test for the address. This also sets
6073 the address (shadow) to 'defined' following the test. Both of
6074 those actions are gated on |guard|. */
6075 complainIfUndefined( mce, addr, guard );
6077 /* Now decide which helper function to call to write the data V
6078 bits into shadow memory. */
6079 if (end == Iend_LE) {
6080 switch (ty) {
6081 case Ity_V256: /* we'll use the helper four times */
6082 case Ity_V128: /* we'll use the helper twice */
6083 case Ity_I128: /* we'll use the helper twice */
6084 case Ity_I64: helper = &MC_(helperc_STOREV64le);
6085 hname = "MC_(helperc_STOREV64le)";
6086 break;
6087 case Ity_I32: helper = &MC_(helperc_STOREV32le);
6088 hname = "MC_(helperc_STOREV32le)";
6089 break;
6090 case Ity_I16: helper = &MC_(helperc_STOREV16le);
6091 hname = "MC_(helperc_STOREV16le)";
6092 break;
6093 case Ity_I8: helper = &MC_(helperc_STOREV8);
6094 hname = "MC_(helperc_STOREV8)";
6095 break;
6096 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6098 } else {
6099 switch (ty) {
6100 case Ity_V128: /* we'll use the helper twice */
6101 case Ity_I64: helper = &MC_(helperc_STOREV64be);
6102 hname = "MC_(helperc_STOREV64be)";
6103 break;
6104 case Ity_I32: helper = &MC_(helperc_STOREV32be);
6105 hname = "MC_(helperc_STOREV32be)";
6106 break;
6107 case Ity_I16: helper = &MC_(helperc_STOREV16be);
6108 hname = "MC_(helperc_STOREV16be)";
6109 break;
6110 case Ity_I8: helper = &MC_(helperc_STOREV8);
6111 hname = "MC_(helperc_STOREV8)";
6112 break;
6113 /* Note, no V256 case here, because no big-endian target that
6114 we support has 256-bit vectors. */
6115 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6119 if (UNLIKELY(ty == Ity_V256)) {
6121 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6122 Q3 being the most significant lane. */
6123 /* These are the offsets of the Qs in memory. */
6124 Int offQ0, offQ1, offQ2, offQ3;
6126 /* Various bits for constructing the 4 lane helper calls */
6127 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6128 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6129 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6130 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6132 if (end == Iend_LE) {
6133 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6134 } else {
6135 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6138 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6139 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6140 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6141 diQ0 = unsafeIRDirty_0_N(
6142 1/*regparms*/,
6143 hname, VG_(fnptr_to_fnentry)( helper ),
6144 mkIRExprVec_2( addrQ0, vdataQ0 )
6147 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6148 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6149 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6150 diQ1 = unsafeIRDirty_0_N(
6151 1/*regparms*/,
6152 hname, VG_(fnptr_to_fnentry)( helper ),
6153 mkIRExprVec_2( addrQ1, vdataQ1 )
6156 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6157 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6158 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6159 diQ2 = unsafeIRDirty_0_N(
6160 1/*regparms*/,
6161 hname, VG_(fnptr_to_fnentry)( helper ),
6162 mkIRExprVec_2( addrQ2, vdataQ2 )
6165 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6166 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6167 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6168 diQ3 = unsafeIRDirty_0_N(
6169 1/*regparms*/,
6170 hname, VG_(fnptr_to_fnentry)( helper ),
6171 mkIRExprVec_2( addrQ3, vdataQ3 )
6174 if (guard)
6175 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6177 setHelperAnns( mce, diQ0 );
6178 setHelperAnns( mce, diQ1 );
6179 setHelperAnns( mce, diQ2 );
6180 setHelperAnns( mce, diQ3 );
6181 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6182 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6183 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6184 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6187 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6189 /* V128/I128-bit case */
6190 /* See comment in next clause re 64-bit regparms */
6191 /* also, need to be careful about endianness */
6193 Int offLo64, offHi64;
6194 IRDirty *diLo64, *diHi64;
6195 IRAtom *addrLo64, *addrHi64;
6196 IRAtom *vdataLo64, *vdataHi64;
6197 IRAtom *eBiasLo64, *eBiasHi64;
6198 IROp opGetLO64, opGetHI64;
6200 if (end == Iend_LE) {
6201 offLo64 = 0;
6202 offHi64 = 8;
6203 } else {
6204 offLo64 = 8;
6205 offHi64 = 0;
6208 if (ty == Ity_V128) {
6209 opGetLO64 = Iop_V128to64;
6210 opGetHI64 = Iop_V128HIto64;
6211 } else {
6212 opGetLO64 = Iop_128to64;
6213 opGetHI64 = Iop_128HIto64;
6216 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6217 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6218 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6219 diLo64 = unsafeIRDirty_0_N(
6220 1/*regparms*/,
6221 hname, VG_(fnptr_to_fnentry)( helper ),
6222 mkIRExprVec_2( addrLo64, vdataLo64 )
6224 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6225 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6226 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6227 diHi64 = unsafeIRDirty_0_N(
6228 1/*regparms*/,
6229 hname, VG_(fnptr_to_fnentry)( helper ),
6230 mkIRExprVec_2( addrHi64, vdataHi64 )
6232 if (guard) diLo64->guard = guard;
6233 if (guard) diHi64->guard = guard;
6234 setHelperAnns( mce, diLo64 );
6235 setHelperAnns( mce, diHi64 );
6236 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6237 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6239 } else {
6241 IRDirty *di;
6242 IRAtom *addrAct;
6244 /* 8/16/32/64-bit cases */
6245 /* Generate the actual address into addrAct. */
6246 if (bias == 0) {
6247 addrAct = addr;
6248 } else {
6249 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6250 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6253 if (ty == Ity_I64) {
6254 /* We can't do this with regparm 2 on 32-bit platforms, since
6255 the back ends aren't clever enough to handle 64-bit
6256 regparm args. Therefore be different. */
6257 di = unsafeIRDirty_0_N(
6258 1/*regparms*/,
6259 hname, VG_(fnptr_to_fnentry)( helper ),
6260 mkIRExprVec_2( addrAct, vdata )
6262 } else {
6263 di = unsafeIRDirty_0_N(
6264 2/*regparms*/,
6265 hname, VG_(fnptr_to_fnentry)( helper ),
6266 mkIRExprVec_2( addrAct,
6267 zwidenToHostWord( mce, vdata ))
6270 if (guard) di->guard = guard;
6271 setHelperAnns( mce, di );
6272 stmt( 'V', mce, IRStmt_Dirty(di) );
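/* Illustration (temp names hypothetical): a 32-bit little-endian store of
   data t7 to address t5 therefore turns into, in shadow-land,

      complainIfUndefined(t5)                       -- check the address
      MC_(helperc_STOREV32le)(t5, zwiden(t7#))      -- write t7's V bits

   with the helper call guarded iff the original store was guarded, and
   with V128/V256/I128 stores split into 64-bit pieces as shown above. */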
6278 /* Do lazy pessimistic propagation through a dirty helper call, by
6279 looking at the annotations on it. This is the most complex part of
6280 Memcheck. */
6282 static IRType szToITy ( Int n )
6284 switch (n) {
6285 case 1: return Ity_I8;
6286 case 2: return Ity_I16;
6287 case 4: return Ity_I32;
6288 case 8: return Ity_I64;
6289 default: VG_(tool_panic)("szToITy(memcheck)");
6293 static
6294 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6296 Int i, k, n, toDo, gSz, gOff;
6297 IRAtom *src, *here, *curr;
6298 IRType tySrc, tyDst;
6299 IRTemp dst;
6300 IREndness end;
6302 /* What's the native endianness? We need to know this. */
6303 # if defined(VG_BIGENDIAN)
6304 end = Iend_BE;
6305 # elif defined(VG_LITTLEENDIAN)
6306 end = Iend_LE;
6307 # else
6308 # error "Unknown endianness"
6309 # endif
6311 /* First check the guard. */
6312 complainIfUndefined(mce, d->guard, NULL);
6314 /* Now round up all inputs and PCast over them. */
6315 curr = definedOfType(Ity_I32);
6317 /* Inputs: unmasked args
6318 Note: arguments are evaluated REGARDLESS of the guard expression */
6319 for (i = 0; d->args[i]; i++) {
6320 IRAtom* arg = d->args[i];
6321 if ( (d->cee->mcx_mask & (1<<i))
6322 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6323 /* ignore this arg */
6324 } else {
6325 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6326 curr = mkUifU32(mce, here, curr);
6330 /* Inputs: guest state that we read. */
6331 for (i = 0; i < d->nFxState; i++) {
6332 tl_assert(d->fxState[i].fx != Ifx_None);
6333 if (d->fxState[i].fx == Ifx_Write)
6334 continue;
6336 /* Enumerate the described state segments */
6337 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6338 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6339 gSz = d->fxState[i].size;
6341 /* Ignore any sections marked as 'always defined'. */
6342 if (isAlwaysDefd(mce, gOff, gSz)) {
6343 if (0)
6344 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6345 gOff, gSz);
6346 continue;
6349 /* This state element is read or modified. So we need to
6350 consider it. If larger than 8 bytes, deal with it in
6351 8-byte chunks. */
6352 while (True) {
6353 tl_assert(gSz >= 0);
6354 if (gSz == 0) break;
6355 n = gSz <= 8 ? gSz : 8;
6356 /* update 'curr' with UifU of the state slice
6357 gOff .. gOff+n-1 */
6358 tySrc = szToITy( n );
6360 /* Observe the guard expression. If it is false use an
6361 all-bits-defined bit pattern */
6362 IRAtom *cond, *iffalse, *iftrue;
6364 cond = assignNew('V', mce, Ity_I1, d->guard);
6365 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6366 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6367 src = assignNew('V', mce, tySrc,
6368 IRExpr_ITE(cond, iftrue, iffalse));
6370 here = mkPCastTo( mce, Ity_I32, src );
6371 curr = mkUifU32(mce, here, curr);
6372 gSz -= n;
6373 gOff += n;
6378 /* Inputs: memory. First set up some info needed regardless of
6379 whether we're doing reads or writes. */
6381 if (d->mFx != Ifx_None) {
6382 /* Because we may do multiple shadow loads/stores from the same
6383 base address, it's best to do a single test of its
6384 definedness right now. Post-instrumentation optimisation
6385 should remove all but this test. */
6386 IRType tyAddr;
6387 tl_assert(d->mAddr);
6388 complainIfUndefined(mce, d->mAddr, d->guard);
6390 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6391 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6392 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6395 /* Deal with memory inputs (reads or modifies) */
6396 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6397 toDo = d->mSize;
6398 /* chew off 32-bit chunks. We don't care about the endianness
6399 since it's all going to be condensed down to a single bit,
6400 but nevertheless choose an endianness which is hopefully
6401 native to the platform. */
6402 while (toDo >= 4) {
6403 here = mkPCastTo(
6404 mce, Ity_I32,
6405 expr2vbits_Load_guarded_Simple(
6406 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6408 curr = mkUifU32(mce, here, curr);
6409 toDo -= 4;
6411 /* chew off 16-bit chunks */
6412 while (toDo >= 2) {
6413 here = mkPCastTo(
6414 mce, Ity_I32,
6415 expr2vbits_Load_guarded_Simple(
6416 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6418 curr = mkUifU32(mce, here, curr);
6419 toDo -= 2;
6421 /* chew off the remaining 8-bit chunk, if any */
6422 if (toDo == 1) {
6423 here = mkPCastTo(
6424 mce, Ity_I32,
6425 expr2vbits_Load_guarded_Simple(
6426 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6428 curr = mkUifU32(mce, here, curr);
6429 toDo -= 1;
6431 tl_assert(toDo == 0);
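      /* For example, a helper that reads 7 bytes at mAddr contributes one
         32-bit, one 16-bit and one 8-bit guarded shadow load (at offsets 0,
         4 and 6 respectively), each PCast to Ity_I32 and UifU'd into
         |curr|. */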
6434 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6435 all the inputs to the helper. Now we need to re-distribute the
6436 results to all destinations. */
6438 /* Outputs: the destination temporary, if there is one. */
6439 if (d->tmp != IRTemp_INVALID) {
6440 dst = findShadowTmpV(mce, d->tmp);
6441 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6442 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6445 /* Outputs: guest state that we write or modify. */
6446 for (i = 0; i < d->nFxState; i++) {
6447 tl_assert(d->fxState[i].fx != Ifx_None);
6448 if (d->fxState[i].fx == Ifx_Read)
6449 continue;
6451 /* Enumerate the described state segments */
6452 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6453 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6454 gSz = d->fxState[i].size;
6456 /* Ignore any sections marked as 'always defined'. */
6457 if (isAlwaysDefd(mce, gOff, gSz))
6458 continue;
6460 /* This state element is written or modified. So we need to
6461 consider it. If larger than 8 bytes, deal with it in
6462 8-byte chunks. */
6463 while (True) {
6464 tl_assert(gSz >= 0);
6465 if (gSz == 0) break;
6466 n = gSz <= 8 ? gSz : 8;
6467 /* Write suitably-casted 'curr' to the state slice
6468 gOff .. gOff+n-1 */
6469 tyDst = szToITy( n );
6470 do_shadow_PUT( mce, gOff,
6471 NULL, /* original atom */
6472 mkPCastTo( mce, tyDst, curr ), d->guard );
6473 gSz -= n;
6474 gOff += n;
6479 /* Outputs: memory that we write or modify. Same comments about
6480 endianness as above apply. */
6481 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6482 toDo = d->mSize;
6483 /* chew off 32-bit chunks */
6484 while (toDo >= 4) {
6485 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6486 NULL, /* original data */
6487 mkPCastTo( mce, Ity_I32, curr ),
6488 d->guard );
6489 toDo -= 4;
6491 /* chew off 16-bit chunks */
6492 while (toDo >= 2) {
6493 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6494 NULL, /* original data */
6495 mkPCastTo( mce, Ity_I16, curr ),
6496 d->guard );
6497 toDo -= 2;
6499 /* chew off the remaining 8-bit chunk, if any */
6500 if (toDo == 1) {
6501 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6502 NULL, /* original data */
6503 mkPCastTo( mce, Ity_I8, curr ),
6504 d->guard );
6505 toDo -= 1;
6507 tl_assert(toDo == 0);
6513 /* We have an ABI hint telling us that [base .. base+len-1] is to
6514 become undefined ("writable"). Generate code to call a helper to
6515 notify the A/V bit machinery of this fact.
6517 We call
6518 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6519 Addr nia );
6521 static
6522 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6524 IRDirty* di;
6526 if (MC_(clo_mc_level) == 3) {
6527 di = unsafeIRDirty_0_N(
6528 3/*regparms*/,
6529 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6530 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6531 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6533 } else {
6534 /* We ignore the supplied nia, since it is irrelevant. */
6535 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6536 /* Special-case the len==128 case, since that is for amd64-ELF,
6537 which is a very common target. */
6538 if (len == 128) {
6539 di = unsafeIRDirty_0_N(
6540 1/*regparms*/,
6541 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6542 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6543 mkIRExprVec_1( base )
6545 } else {
6546 di = unsafeIRDirty_0_N(
6547 2/*regparms*/,
6548 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6549 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6550 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6555 stmt( 'V', mce, IRStmt_Dirty(di) );
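/* So, for example, with origin tracking disabled on an amd64-ELF target,
   the common len==128 redzone hint becomes a single call to
   MC_(helperc_MAKE_STACK_UNINIT_128_no_o)(base), whereas at
   --track-origins=yes the three-argument _w_o variant is used and |nia|
   is passed through to it. */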
6559 /* ------ Dealing with IRCAS (big and complex) ------ */
6561 /* FWDS */
6562 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6563 IRAtom* baseaddr, Int offset );
6564 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6565 static void gen_store_b ( MCEnv* mce, Int szB,
6566 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6567 IRAtom* guard );
6569 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6570 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6573 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6574 IRExpr.Consts, else this asserts. If they are both Consts, it
6575 doesn't do anything. So that just leaves the RdTmp case.
6577 In which case: this assigns the shadow value SHADOW to the IR
6578 shadow temporary associated with ORIG. That is, ORIG, being an
6579 original temporary, will have a shadow temporary associated with
6580 it. However, in the case envisaged here, there will so far have
6581 been no IR emitted to actually write a shadow value into that
6582 temporary. What this routine does is to (emit IR to) copy the
6583 value in SHADOW into said temporary, so that after this call,
6584 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6585 value in SHADOW.
6587 Point is to allow callers to compute "by hand" a shadow value for
6588 ORIG, and force it to be associated with ORIG.
6590 How do we know that the shadow associated with ORIG has not so far
6591 been assigned to? Well, we don't per se know that, but supposing
6592 it had. Then this routine would create a second assignment to it,
6593 and later the IR sanity checker would barf. But that never
6594 happens. QED.
6596 static void bind_shadow_tmp_to_orig ( UChar how,
6597 MCEnv* mce,
6598 IRAtom* orig, IRAtom* shadow )
6600 tl_assert(isOriginalAtom(mce, orig));
6601 tl_assert(isShadowAtom(mce, shadow));
6602 switch (orig->tag) {
6603 case Iex_Const:
6604 tl_assert(shadow->tag == Iex_Const);
6605 break;
6606 case Iex_RdTmp:
6607 tl_assert(shadow->tag == Iex_RdTmp);
6608 if (how == 'V') {
6609 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6610 shadow);
6611 } else {
6612 tl_assert(how == 'B');
6613 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6614 shadow);
6616 break;
6617 default:
6618 tl_assert(0);
6623 static
6624 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6626 /* Scheme is (both single- and double- cases):
6628 1. fetch data#,dataB (the proposed new value)
6630 2. fetch expd#,expdB (what we expect to see at the address)
6632 3. check definedness of address
6634 4. load old#,oldB from shadow memory; this also checks
6635 addressability of the address
6637 5. the CAS itself
6639 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6641 7. if "expected == old" (as computed by (6))
6642 store data#,dataB to shadow memory
6644 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6645 'data' but 7 stores 'data#'. Hence it is possible for the
6646 shadow data to be incorrectly checked and/or updated:
6648 * 7 is at least gated correctly, since the 'expected == old'
6649 condition is derived from outputs of 5. However, the shadow
6650 write could happen too late: imagine after 5 we are
6651 descheduled, a different thread runs, writes a different
6652 (shadow) value at the address, and then we resume, hence
6653 overwriting the shadow value written by the other thread.
6655 Because the original memory access is atomic, there's no way to
6656 make both the original and shadow accesses into a single atomic
6657 thing, hence this is unavoidable.
6659 At least as Valgrind stands, I don't think it's a problem, since
6660 we're single threaded *and* we guarantee that there are no
6661 context switches during the execution of any specific superblock
6662 -- context switches can only happen at superblock boundaries.
6664 If Valgrind ever becomes MT in the future, then it might be more
6665 of a problem. A possible kludge would be to artificially
6666 associate with the location, a lock, which we must acquire and
6667 release around the transaction as a whole. Hmm, that probably
6668 wouldn't work properly since it only guards us against other
6669 threads doing CASs on the same location, not against other
6670 threads doing normal reads and writes.
6672 ------------------------------------------------------------
6674 COMMENT_ON_CasCmpEQ:
6676 Note two things. Firstly, in the sequence above, we compute
6677 "expected == old", but we don't check definedness of it. Why
6678 not? Also, the x86 and amd64 front ends use
6679 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6680 determination (expected == old ?) for themselves, and we also
6681 don't check definedness for those primops; we just say that the
6682 result is defined. Why? Details follow.
6684 x86/amd64 contains various forms of locked insns:
6685 * lock prefix before all basic arithmetic insn;
6686 eg lock xorl %reg1,(%reg2)
6687 * atomic exchange reg-mem
6688 * compare-and-swaps
6690 Rather than attempt to represent them all, which would be a
6691 royal PITA, I used a result from Maurice Herlihy
6692 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6693 demonstrates that compare-and-swap is a primitive more general
6694 than the other two, and so can be used to represent all of them.
6695 So the translation scheme for (eg) lock incl (%reg) is as
6696 follows:
6698 again:
6699 old = * %reg
6700 new = old + 1
6701 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6703 The "atomically" is the CAS bit. The scheme is always the same:
6704 get old value from memory, compute new value, atomically stuff
6705 new value back in memory iff the old value has not changed (iow,
6706 no other thread modified it in the meantime). If it has changed
6707 then we've been out-raced and we have to start over.
6709 Now that's all very neat, but it has the bad side effect of
6710 introducing an explicit equality test into the translation.
6711 Consider the behaviour of said code on a memory location which
6712 is uninitialised. We will wind up doing a comparison on
6713 uninitialised data, and mc duly complains.
6715 What's difficult about this is, the common case is that the
6716 location is uncontended, and so we're usually comparing the same
6717 value (* %reg) with itself. So we shouldn't complain even if it
6718 is undefined. But mc doesn't know that.
6720 My solution is to mark the == in the IR specially, so as to tell
6721 mc that it almost certainly compares a value with itself, and we
6722 should just regard the result as always defined. Rather than
6723 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6724 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6726 So there's always the question of, can this give a false
6727 negative? eg, imagine that initially, * %reg is defined; and we
6728 read that; but then in the gap between the read and the CAS, a
6729 different thread writes an undefined (and different) value at
6730 the location. Then the CAS in this thread will fail and we will
6731 go back to "again:", but without knowing that the trip back
6732 there was based on an undefined comparison. No matter; at least
6733 the other thread won the race and the location is correctly
6734 marked as undefined. What if it wrote an uninitialised version
6735 of the same value that was there originally, though?
6737 etc etc. Seems like there's a small corner case in which we
6738 might lose the fact that something's defined -- we're out-raced
6739 in between the "old = * reg" and the "atomically {", _and_ the
6740 other thread is writing in an undefined version of what's
6741 already there. Well, that seems pretty unlikely.
6745 If we ever need to reinstate it .. code which generates a
6746 definedness test for "expected == old" was removed at r10432 of
6747 this file.
6749 if (cas->oldHi == IRTemp_INVALID) {
6750 do_shadow_CAS_single( mce, cas );
6751 } else {
6752 do_shadow_CAS_double( mce, cas );
6757 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6759 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6760 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6761 IRAtom *voldLo = NULL, *boldLo = NULL;
6762 IRAtom *expd_eq_old = NULL;
6763 IROp opCasCmpEQ;
6764 Int elemSzB;
6765 IRType elemTy;
6766 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6768 /* single CAS */
6769 tl_assert(cas->oldHi == IRTemp_INVALID);
6770 tl_assert(cas->expdHi == NULL);
6771 tl_assert(cas->dataHi == NULL);
6773 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6774 switch (elemTy) {
6775 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6776 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6777 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6778 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6779 default: tl_assert(0); /* IR defn disallows any other types */
6782 /* 1. fetch data# (the proposed new value) */
6783 tl_assert(isOriginalAtom(mce, cas->dataLo));
6784 vdataLo
6785 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6786 tl_assert(isShadowAtom(mce, vdataLo));
6787 if (otrak) {
6788 bdataLo
6789 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6790 tl_assert(isShadowAtom(mce, bdataLo));
6793 /* 2. fetch expected# (what we expect to see at the address) */
6794 tl_assert(isOriginalAtom(mce, cas->expdLo));
6795 vexpdLo
6796 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6797 tl_assert(isShadowAtom(mce, vexpdLo));
6798 if (otrak) {
6799 bexpdLo
6800 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6801 tl_assert(isShadowAtom(mce, bexpdLo));
6804 /* 3. check definedness of address */
6805 /* 4. fetch old# from shadow memory; this also checks
6806 addressability of the address */
6807 voldLo
6808 = assignNew(
6809 'V', mce, elemTy,
6810 expr2vbits_Load(
6811 mce,
6812 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6813 NULL/*always happens*/
6815 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6816 if (otrak) {
6817 boldLo
6818 = assignNew('B', mce, Ity_I32,
6819 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6820 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6823 /* 5. the CAS itself */
6824 stmt( 'C', mce, IRStmt_CAS(cas) );
6826 /* 6. compute "expected == old" */
6827 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6828 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6829 tree, but it's not copied from the input block. */
6830 expd_eq_old
6831 = assignNew('C', mce, Ity_I1,
6832 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6834 /* 7. if "expected == old"
6835 store data# to shadow memory */
6836 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6837 NULL/*data*/, vdataLo/*vdata*/,
6838 expd_eq_old/*guard for store*/ );
6839 if (otrak) {
6840 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6841 bdataLo/*bdata*/,
6842 expd_eq_old/*guard for store*/ );
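/* Putting steps 1..7 together for, say, a 32-bit single CAS (a sketch;
   names as in the code above): we compute vdataLo and vexpdLo, check the
   address and shadow-load voldLo (binding it to oldLo's shadow temp), emit
   the original IRStmt_CAS unchanged, compute expd_eq_old with
   Iop_CasCmpEQ32 -- whose result is treated as always defined, per
   COMMENT_ON_CasCmpEQ -- and finally do a shadow store of vdataLo that is
   guarded on expd_eq_old. */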
6847 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6849 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6850 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6851 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6852 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6853 IRAtom *voldHi = NULL, *boldHi = NULL;
6854 IRAtom *voldLo = NULL, *boldLo = NULL;
6855 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6856 IRAtom *expd_eq_old = NULL, *zero = NULL;
6857 IROp opCasCmpEQ, opOr, opXor;
6858 Int elemSzB, memOffsLo, memOffsHi;
6859 IRType elemTy;
6860 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6862 /* double CAS */
6863 tl_assert(cas->oldHi != IRTemp_INVALID);
6864 tl_assert(cas->expdHi != NULL);
6865 tl_assert(cas->dataHi != NULL);
6867 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6868 switch (elemTy) {
6869 case Ity_I8:
6870 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6871 elemSzB = 1; zero = mkU8(0);
6872 break;
6873 case Ity_I16:
6874 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6875 elemSzB = 2; zero = mkU16(0);
6876 break;
6877 case Ity_I32:
6878 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6879 elemSzB = 4; zero = mkU32(0);
6880 break;
6881 case Ity_I64:
6882 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6883 elemSzB = 8; zero = mkU64(0);
6884 break;
6885 default:
6886 tl_assert(0); /* IR defn disallows any other types */
6889 /* 1. fetch data# (the proposed new value) */
6890 tl_assert(isOriginalAtom(mce, cas->dataHi));
6891 tl_assert(isOriginalAtom(mce, cas->dataLo));
6892 vdataHi
6893 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6894 vdataLo
6895 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6896 tl_assert(isShadowAtom(mce, vdataHi));
6897 tl_assert(isShadowAtom(mce, vdataLo));
6898 if (otrak) {
6899 bdataHi
6900 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6901 bdataLo
6902 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6903 tl_assert(isShadowAtom(mce, bdataHi));
6904 tl_assert(isShadowAtom(mce, bdataLo));
6907 /* 2. fetch expected# (what we expect to see at the address) */
6908 tl_assert(isOriginalAtom(mce, cas->expdHi));
6909 tl_assert(isOriginalAtom(mce, cas->expdLo));
6910 vexpdHi
6911 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6912 vexpdLo
6913 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6914 tl_assert(isShadowAtom(mce, vexpdHi));
6915 tl_assert(isShadowAtom(mce, vexpdLo));
6916 if (otrak) {
6917 bexpdHi
6918 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6919 bexpdLo
6920 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6921 tl_assert(isShadowAtom(mce, bexpdHi));
6922 tl_assert(isShadowAtom(mce, bexpdLo));
6925 /* 3. check definedness of address */
6926 /* 4. fetch old# from shadow memory; this also checks
6927 addressability of the address */
6928 if (cas->end == Iend_LE) {
6929 memOffsLo = 0;
6930 memOffsHi = elemSzB;
6931 } else {
6932 tl_assert(cas->end == Iend_BE);
6933 memOffsLo = elemSzB;
6934 memOffsHi = 0;
6936 voldHi
6937 = assignNew(
6938 'V', mce, elemTy,
6939 expr2vbits_Load(
6940 mce,
6941 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6942 NULL/*always happens*/
6944 voldLo
6945 = assignNew(
6946 'V', mce, elemTy,
6947 expr2vbits_Load(
6948 mce,
6949 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6950 NULL/*always happens*/
6952 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6953 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6954 if (otrak) {
6955 boldHi
6956 = assignNew('B', mce, Ity_I32,
6957 gen_load_b(mce, elemSzB, cas->addr,
6958 memOffsHi/*addr bias*/));
6959 boldLo
6960 = assignNew('B', mce, Ity_I32,
6961 gen_load_b(mce, elemSzB, cas->addr,
6962 memOffsLo/*addr bias*/));
6963 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6964 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6967 /* 5. the CAS itself */
6968 stmt( 'C', mce, IRStmt_CAS(cas) );
6970 /* 6. compute "expected == old" */
6971 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6972 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6973 tree, but it's not copied from the input block. */
6974 /*
6975 xHi = oldHi ^ expdHi;
6976 xLo = oldLo ^ expdLo;
6977 xHL = xHi | xLo;
6978 expd_eq_old = xHL == 0;
6979 */
6980 xHi = assignNew('C', mce, elemTy,
6981 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6982 xLo = assignNew('C', mce, elemTy,
6983 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6984 xHL = assignNew('C', mce, elemTy,
6985 binop(opOr, xHi, xLo));
6986 expd_eq_old
6987 = assignNew('C', mce, Ity_I1,
6988 binop(opCasCmpEQ, xHL, zero));
6990 /* 7. if "expected == old"
6991 store data# to shadow memory */
6992 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6993 NULL/*data*/, vdataHi/*vdata*/,
6994 expd_eq_old/*guard for store*/ );
6995 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6996 NULL/*data*/, vdataLo/*vdata*/,
6997 expd_eq_old/*guard for store*/ );
6998 if (otrak) {
6999 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
7000 bdataHi/*bdata*/,
7001 expd_eq_old/*guard for store*/ );
7002 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
7003 bdataLo/*bdata*/,
7004 expd_eq_old/*guard for store*/ );
7009 /* ------ Dealing with LL/SC (not difficult) ------ */
7011 static void do_shadow_LLSC ( MCEnv* mce,
7012 IREndness stEnd,
7013 IRTemp stResult,
7014 IRExpr* stAddr,
7015 IRExpr* stStoredata )
7017 /* In short: treat a load-linked like a normal load followed by an
7018 assignment of the loaded (shadow) data to the result temporary.
7019 Treat a store-conditional like a normal store, and mark the
7020 result temporary as defined. */
7021 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
7022 IRTemp resTmp = findShadowTmpV(mce, stResult);
7024 tl_assert(isIRAtom(stAddr));
7025 if (stStoredata)
7026 tl_assert(isIRAtom(stStoredata));
7028 if (stStoredata == NULL) {
7029 /* Load Linked */
7030 /* Just treat this as a normal load, followed by an assignment of
7031 the value to .result. */
7032 /* Stay sane */
7033 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7034 || resTy == Ity_I16 || resTy == Ity_I8);
7035 assign( 'V', mce, resTmp,
7036 expr2vbits_Load(
7037 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
7038 NULL/*always happens*/) );
7039 } else {
7040 /* Store Conditional */
7041 /* Stay sane */
7042 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
7043 stStoredata);
7044 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
7045 || dataTy == Ity_I16 || dataTy == Ity_I8);
7046 do_shadow_Store( mce, stEnd,
7047 stAddr, 0/* addr bias */,
7048 stStoredata,
7049 NULL /* shadow data */,
7050 NULL/*guard*/ );
7051 /* This is a store conditional, so it writes to .result a value
7052 indicating whether or not the store succeeded. Just claim
7053 this value is always defined. In the PowerPC interpretation
7054 of store-conditional, definedness of the success indication
7055 depends on whether the address of the store matches the
7056 reservation address. But we can't tell that here (and
7057 anyway, we're not being PowerPC-specific). At least we are
7058 guaranteed that the definedness of the store address, and its
7059 addressability, will be checked as per normal. So it seems
7060 pretty safe to just say that the success indication is always
7061 defined.
7063 In schemeS, for origin tracking, we must correspondingly set
7064 a no-origin value for the origin shadow of .result. */
7066 tl_assert(resTy == Ity_I1);
7067 assign( 'V', mce, resTmp, definedOfType(resTy) );
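/* A rough illustration of the scheme above, assuming a 32-bit LL/SC pair
   (the temporaries are illustrative only):

      t1 = LL(addr)        ==>  V(t1) = expr2vbits_Load(I32, addr)
      t2 = SC(addr, data)  ==>  do_shadow_Store of V(data) at addr;
                                V(t2) = defined   (t2 :: Ity_I1)

   so the definedness of the loaded value is tracked exactly as for a plain
   load, while the success/failure flag written by the SC is simply treated
   as always defined, for the reasons given above. */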
7072 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7074 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
7076 complainIfUndefined(mce, sg->guard, NULL);
7077 /* do_shadow_Store will generate code to check the definedness and
7078 validity of sg->addr, in the case where sg->guard evaluates to
7079 True at run-time. */
7080 do_shadow_Store( mce, sg->end,
7081 sg->addr, 0/* addr bias */,
7082 sg->data,
7083 NULL /* shadow data */,
7084 sg->guard );
7087 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
7089 complainIfUndefined(mce, lg->guard, NULL);
7090 /* expr2vbits_Load_guarded_General will generate code to check the
7091 definedness and validity of lg->addr, in the case where
7092 lg->guard evaluates to True at run-time. */
7094 /* Look at the LoadG's built-in conversion operation, to determine
7095 the source (actual loaded data) type, and the equivalent IROp.
7096 NOTE that implicitly we are taking a widening operation to be
7097 applied to original atoms and producing one that applies to V
7098 bits. Since signed and unsigned widening are self-shadowing,
7099 this is a straight copy of the op (modulo swapping from the
7100 IRLoadGOp form to the IROp form). Note also therefore that this
7101 implicitly duplicates the logic to do with said widening ops in
7102 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
7103 IROp vwiden = Iop_INVALID;
7104 IRType loadedTy = Ity_INVALID;
7105 switch (lg->cvt) {
7106 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
7107 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
7108 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
7109 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
7110 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
7111 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
7112 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
7113 default: VG_(tool_panic)("do_shadow_LoadG");
7116 IRAtom* vbits_alt
7117 = expr2vbits( mce, lg->alt, HuOth );
7118 IRAtom* vbits_final
7119 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7120 lg->addr, 0/*addr bias*/,
7121 lg->guard, vwiden, vbits_alt );
7122 /* And finally, bind the V bits to the destination temporary. */
7123 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
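/* For example (a sketch, not emitted verbatim): for an IRLoadG of the form

      t = if (guard) 16Uto32(LDle:I16(addr)) else alt

   the code above computes loadedTy = Ity_I16 and vwiden = Iop_16Uto32, and
   the V bits bound to t come out as, roughly,

      V(t) = ITE(guard, 16Uto32(V bits of the loaded I16), V(alt))

   with the usual address definedness/validity checks applied only under
   'guard'. */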
7127 /*------------------------------------------------------------*/
7128 /*--- Origin tracking stuff ---*/
7129 /*------------------------------------------------------------*/
7131 /* Almost identical to findShadowTmpV. */
7132 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7134 TempMapEnt* ent;
7135 /* VG_(indexXA) range-checks 'orig', hence no need to check
7136 here. */
7137 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7138 tl_assert(ent->kind == Orig);
7139 if (ent->shadowB == IRTemp_INVALID) {
7140 IRTemp tmpB
7141 = newTemp( mce, Ity_I32, BSh );
7142 /* newTemp may cause mce->tmpMap to resize, hence previous results
7143 from VG_(indexXA) are invalid. */
7144 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7145 tl_assert(ent->kind == Orig);
7146 tl_assert(ent->shadowB == IRTemp_INVALID);
7147 ent->shadowB = tmpB;
7149 return ent->shadowB;
7152 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7154 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
7158 /* Make a guarded origin load, with no special handling in the
7159 didn't-happen case. A GUARD of NULL is assumed to mean "always
7160 True".
7162 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7163 return the otag. The loaded size is SZB. If GUARD evaluates to
7164 False at run time then the returned otag is zero. */
7166 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7167 IRAtom* baseaddr,
7168 Int offset, IRExpr* guard )
7170 void* hFun;
7171 const HChar* hName;
7172 IRTemp bTmp;
7173 IRDirty* di;
7174 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7175 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7176 IRAtom* ea = baseaddr;
7177 if (offset != 0) {
7178 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7179 : mkU64( (Long)(Int)offset );
7180 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7182 bTmp = newTemp(mce, mce->hWordTy, BSh);
7184 switch (szB) {
7185 case 1: hFun = (void*)&MC_(helperc_b_load1);
7186 hName = "MC_(helperc_b_load1)";
7187 break;
7188 case 2: hFun = (void*)&MC_(helperc_b_load2);
7189 hName = "MC_(helperc_b_load2)";
7190 break;
7191 case 4: hFun = (void*)&MC_(helperc_b_load4);
7192 hName = "MC_(helperc_b_load4)";
7193 break;
7194 case 8: hFun = (void*)&MC_(helperc_b_load8);
7195 hName = "MC_(helperc_b_load8)";
7196 break;
7197 case 16: hFun = (void*)&MC_(helperc_b_load16);
7198 hName = "MC_(helperc_b_load16)";
7199 break;
7200 case 32: hFun = (void*)&MC_(helperc_b_load32);
7201 hName = "MC_(helperc_b_load32)";
7202 break;
7203 default:
7204 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7205 tl_assert(0);
7207 di = unsafeIRDirty_1_N(
7208 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7209 mkIRExprVec_1( ea )
7211 if (guard) {
7212 di->guard = guard;
7213 /* Ideally the didn't-happen return value here would be
7214 all-zeroes (unknown-origin), so it'd be harmless if it got
7215 used inadvertently. We slum it out with the IR-mandated
7216 default value (0b01 repeating, 0x55 etc) as that'll probably
7217 trump all legitimate otags via Max32, and it's pretty
7218 obviously bogus. */
7220 /* no need to mess with any annotations. This call accesses
7221 neither guest state nor guest memory. */
7222 stmt( 'B', mce, IRStmt_Dirty(di) );
7223 if (mce->hWordTy == Ity_I64) {
7224 /* 64-bit host */
7225 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7226 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7227 return mkexpr(bTmp32);
7228 } else {
7229 /* 32-bit host */
7230 return mkexpr(bTmp);
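/* As a concrete (hedged) example: on a 64-bit host, a 4-byte origin load
   from baseaddr+8 guarded by 'g' comes out roughly as

      ea    = Add64(baseaddr, 8)
      t:I64 = DIRTY g ::: MC_(helperc_b_load4)(ea)
      b:I32 = 64to32(t)

   that is, a guarded dirty helper call returning the otag in a host word,
   which is then narrowed back to the 32-bit otag type. */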
7235 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7236 loaded size is SZB. The load is regarded as unconditional (always
7237 happens). */
7239 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7240 Int offset )
7242 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7246 /* The most general handler for guarded origin loads. A GUARD of NULL
7247 is assumed to mean "always True".
7249 Generate IR to do a shadow origin load from ADDR+BIAS and return
7250 the B bits. The loaded type is TY. If GUARD evaluates to False at
7251 run time then the returned B bits are simply BALT instead. */
7253 static
7254 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7255 IRType ty,
7256 IRAtom* addr, UInt bias,
7257 IRAtom* guard, IRAtom* balt )
7259 /* If the guard evaluates to True, this will hold the loaded
7260 origin. If the guard evaluates to False, this will be zero,
7261 meaning "unknown origin", in which case we will have to replace
7262 it using an ITE below. */
7263 IRAtom* iftrue
7264 = assignNew('B', mce, Ity_I32,
7265 gen_guarded_load_b(mce, sizeofIRType(ty),
7266 addr, bias, guard));
7267 /* These are the bits we will return if the load doesn't take
7268 place. */
7269 IRAtom* iffalse
7270 = balt;
7271 /* Prepare the cond for the ITE. Convert a NULL cond into
7272 something that iropt knows how to fold out later. */
7273 IRAtom* cond
7274 = guard == NULL ? mkU1(1) : guard;
7275 /* And assemble the final result. */
7276 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
7280 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7281 the store really happens; NULL means it unconditionally does. */
7282 static void gen_store_b ( MCEnv* mce, Int szB,
7283 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7284 IRAtom* guard )
7286 void* hFun;
7287 const HChar* hName;
7288 IRDirty* di;
7289 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7290 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7291 IRAtom* ea = baseaddr;
7292 if (guard) {
7293 tl_assert(isOriginalAtom(mce, guard));
7294 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7296 if (offset != 0) {
7297 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7298 : mkU64( (Long)(Int)offset );
7299 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7301 if (mce->hWordTy == Ity_I64)
7302 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7304 switch (szB) {
7305 case 1: hFun = (void*)&MC_(helperc_b_store1);
7306 hName = "MC_(helperc_b_store1)";
7307 break;
7308 case 2: hFun = (void*)&MC_(helperc_b_store2);
7309 hName = "MC_(helperc_b_store2)";
7310 break;
7311 case 4: hFun = (void*)&MC_(helperc_b_store4);
7312 hName = "MC_(helperc_b_store4)";
7313 break;
7314 case 8: hFun = (void*)&MC_(helperc_b_store8);
7315 hName = "MC_(helperc_b_store8)";
7316 break;
7317 case 16: hFun = (void*)&MC_(helperc_b_store16);
7318 hName = "MC_(helperc_b_store16)";
7319 break;
7320 case 32: hFun = (void*)&MC_(helperc_b_store32);
7321 hName = "MC_(helperc_b_store32)";
7322 break;
7323 default:
7324 tl_assert(0);
7326 di = unsafeIRDirty_0_N( 2/*regparms*/,
7327 hName, VG_(fnptr_to_fnentry)( hFun ),
7328 mkIRExprVec_2( ea, dataB )
7330 /* no need to mess with any annotations. This call accesses
7331 neither guest state nor guest memory. */
7332 if (guard) di->guard = guard;
7333 stmt( 'B', mce, IRStmt_Dirty(di) );
7336 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7337 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7338 if (eTy == Ity_I64)
7339 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7340 if (eTy == Ity_I32)
7341 return e;
7342 tl_assert(0);
7345 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7346 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7347 tl_assert(eTy == Ity_I32);
7348 if (dstTy == Ity_I64)
7349 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7350 tl_assert(0);
7354 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7356 tl_assert(MC_(clo_mc_level) == 3);
7358 switch (e->tag) {
7360 case Iex_GetI: {
7361 IRRegArray* descr_b;
7362 IRAtom *t1, *t2, *t3, *t4;
7363 IRRegArray* descr = e->Iex.GetI.descr;
7364 IRType equivIntTy
7365 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7366 /* If this array is unshadowable for whatever reason, use the
7367 usual approximation. */
7368 if (equivIntTy == Ity_INVALID)
7369 return mkU32(0);
7370 tl_assert(sizeofIRType(equivIntTy) >= 4);
7371 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7372 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7373 equivIntTy, descr->nElems );
7374 /* Do a shadow indexed get of the same size, giving t1. Take
7375 the bottom 32 bits of it, giving t2. Compute into t3 the
7376 origin for the index (almost certainly zero, but there's
7377 no harm in being completely general here, since iropt will
7378 remove any useless code), and fold it in, giving a final
7379 value t4. */
7380 t1 = assignNew( 'B', mce, equivIntTy,
7381 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7382 e->Iex.GetI.bias ));
7383 t2 = narrowTo32( mce, t1 );
7384 t3 = schemeE( mce, e->Iex.GetI.ix );
7385 t4 = gen_maxU32( mce, t2, t3 );
7386 return t4;
7388 case Iex_CCall: {
7389 Int i;
7390 IRAtom* here;
7391 IRExpr** args = e->Iex.CCall.args;
7392 IRAtom* curr = mkU32(0);
7393 for (i = 0; args[i]; i++) {
7394 tl_assert(i < 32);
7395 tl_assert(isOriginalAtom(mce, args[i]));
7396 /* Only take notice of this arg if the callee's
7397 mc-exclusion mask does not say it is to be excluded. */
7398 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7399 /* the arg is to be excluded from definedness checking.
7400 Do nothing. */
7401 if (0) VG_(printf)("excluding %s(%d)\n",
7402 e->Iex.CCall.cee->name, i);
7403 } else {
7404 /* calculate the arg's definedness, and pessimistically
7405 merge it in. */
7406 here = schemeE( mce, args[i] );
7407 curr = gen_maxU32( mce, curr, here );
7410 return curr;
7412 case Iex_Load: {
7413 Int dszB;
7414 dszB = sizeofIRType(e->Iex.Load.ty);
7415 /* assert that the B value for the address is already
7416 available (somewhere) */
7417 tl_assert(isIRAtom(e->Iex.Load.addr));
7418 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7419 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7421 case Iex_ITE: {
7422 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7423 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7424 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7425 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7427 case Iex_Qop: {
7428 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7429 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7430 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7431 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7432 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7433 gen_maxU32( mce, b3, b4 ) );
7435 case Iex_Triop: {
7436 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7437 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7438 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7439 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7441 case Iex_Binop: {
7442 switch (e->Iex.Binop.op) {
7443 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7444 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7445 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7446 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7447 /* Just say these all produce a defined result,
7448 regardless of their arguments. See
7449 COMMENT_ON_CasCmpEQ in this file. */
7450 return mkU32(0);
7451 default: {
7452 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7453 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7454 return gen_maxU32( mce, b1, b2 );
7457 tl_assert(0);
7458 /*NOTREACHED*/
7460 case Iex_Unop: {
7461 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7462 return b1;
7464 case Iex_Const:
7465 return mkU32(0);
7466 case Iex_RdTmp:
7467 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7468 case Iex_Get: {
7469 Int b_offset = MC_(get_otrack_shadow_offset)(
7470 e->Iex.Get.offset,
7471 sizeofIRType(e->Iex.Get.ty)
7473 tl_assert(b_offset >= -1
7474 && b_offset <= mce->layout->total_sizeB -4);
7475 if (b_offset >= 0) {
7476 /* FIXME: this isn't an atom! */
7477 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7478 Ity_I32 );
7480 return mkU32(0);
7482 default:
7483 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7484 ppIRExpr(e);
7485 VG_(tool_panic)("memcheck:schemeE");
7490 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7492 // This is a hacked version of do_shadow_Dirty
7493 Int i, k, n, toDo, gSz, gOff;
7494 IRAtom *here, *curr;
7495 IRTemp dst;
7497 /* First check the guard. */
7498 curr = schemeE( mce, d->guard );
7500 /* Now round up all inputs and maxU32 over them. */
7502 /* Inputs: unmasked args
7503 Note: arguments are evaluated REGARDLESS of the guard expression */
7504 for (i = 0; d->args[i]; i++) {
7505 IRAtom* arg = d->args[i];
7506 if ( (d->cee->mcx_mask & (1<<i))
7507 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7508 /* ignore this arg */
7509 } else {
7510 here = schemeE( mce, arg );
7511 curr = gen_maxU32( mce, curr, here );
7515 /* Inputs: guest state that we read. */
7516 for (i = 0; i < d->nFxState; i++) {
7517 tl_assert(d->fxState[i].fx != Ifx_None);
7518 if (d->fxState[i].fx == Ifx_Write)
7519 continue;
7521 /* Enumerate the described state segments */
7522 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7523 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7524 gSz = d->fxState[i].size;
7526 /* Ignore any sections marked as 'always defined'. */
7527 if (isAlwaysDefd(mce, gOff, gSz)) {
7528 if (0)
7529 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7530 gOff, gSz);
7531 continue;
7534 /* This state element is read or modified. So we need to
7535 consider it. If larger than 4 bytes, deal with it in
7536 4-byte chunks. */
7537 while (True) {
7538 Int b_offset;
7539 tl_assert(gSz >= 0);
7540 if (gSz == 0) break;
7541 n = gSz <= 4 ? gSz : 4;
7542 /* update 'curr' with maxU32 of the state slice
7543 gOff .. gOff+n-1 */
7544 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7545 if (b_offset != -1) {
7546 /* Observe the guard expression. If it is false use 0, i.e.
7547 nothing is known about the origin */
7548 IRAtom *cond, *iffalse, *iftrue;
7550 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7551 iffalse = mkU32(0);
7552 iftrue = assignNew( 'B', mce, Ity_I32,
7553 IRExpr_Get(b_offset
7554 + 2*mce->layout->total_sizeB,
7555 Ity_I32));
7556 here = assignNew( 'B', mce, Ity_I32,
7557 IRExpr_ITE(cond, iftrue, iffalse));
7558 curr = gen_maxU32( mce, curr, here );
7560 gSz -= n;
7561 gOff += n;
7566 /* Inputs: memory */
7568 if (d->mFx != Ifx_None) {
7569 /* Because we may do multiple shadow loads/stores from the same
7570 base address, it's best to do a single test of its
7571 definedness right now. Post-instrumentation optimisation
7572 should remove all but this test. */
7573 tl_assert(d->mAddr);
7574 here = schemeE( mce, d->mAddr );
7575 curr = gen_maxU32( mce, curr, here );
7578 /* Deal with memory inputs (reads or modifies) */
7579 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7580 toDo = d->mSize;
7581 /* chew off 32-bit chunks. We don't care about the endianness
7582 since it's all going to be condensed down to a single origin tag,
7583 but nevertheless choose an endianness which is hopefully
7584 native to the platform. */
7585 while (toDo >= 4) {
7586 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7587 d->guard );
7588 curr = gen_maxU32( mce, curr, here );
7589 toDo -= 4;
7591 /* handle possible 16-bit excess */
7592 while (toDo >= 2) {
7593 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7594 d->guard );
7595 curr = gen_maxU32( mce, curr, here );
7596 toDo -= 2;
7598 /* chew off the remaining 8-bit chunk, if any */
7599 if (toDo == 1) {
7600 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7601 d->guard );
7602 curr = gen_maxU32( mce, curr, here );
7603 toDo -= 1;
7605 tl_assert(toDo == 0);
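/* Worked example: if d->mSize == 7, the loops above issue guarded origin
   loads of 4 bytes at offset 0, 2 bytes at offset 4 and 1 byte at offset 6,
   max-ing each otag into 'curr'; toDo goes 7 -> 3 -> 1 -> 0. */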
7608 /* Whew! So curr is a 32-bit B-value which should give an origin
7609 of some use if any of the inputs to the helper are undefined.
7610 Now we need to re-distribute the results to all destinations. */
7612 /* Outputs: the destination temporary, if there is one. */
7613 if (d->tmp != IRTemp_INVALID) {
7614 dst = findShadowTmpB(mce, d->tmp);
7615 assign( 'V', mce, dst, curr );
7618 /* Outputs: guest state that we write or modify. */
7619 for (i = 0; i < d->nFxState; i++) {
7620 tl_assert(d->fxState[i].fx != Ifx_None);
7621 if (d->fxState[i].fx == Ifx_Read)
7622 continue;
7624 /* Enumerate the described state segments */
7625 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7626 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7627 gSz = d->fxState[i].size;
7629 /* Ignore any sections marked as 'always defined'. */
7630 if (isAlwaysDefd(mce, gOff, gSz))
7631 continue;
7633 /* This state element is written or modified. So we need to
7634 consider it. If larger than 4 bytes, deal with it in
7635 4-byte chunks. */
7636 while (True) {
7637 Int b_offset;
7638 tl_assert(gSz >= 0);
7639 if (gSz == 0) break;
7640 n = gSz <= 4 ? gSz : 4;
7641 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7642 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7643 if (b_offset != -1) {
7645 /* If the guard expression evaluates to false we simply Put
7646 the value that is already stored in the guest state slot */
7647 IRAtom *cond, *iffalse;
7649 cond = assignNew('B', mce, Ity_I1,
7650 d->guard);
7651 iffalse = assignNew('B', mce, Ity_I32,
7652 IRExpr_Get(b_offset +
7653 2*mce->layout->total_sizeB,
7654 Ity_I32));
7655 curr = assignNew('V', mce, Ity_I32,
7656 IRExpr_ITE(cond, curr, iffalse));
7658 stmt( 'B', mce, IRStmt_Put(b_offset
7659 + 2*mce->layout->total_sizeB,
7660 curr ));
7662 gSz -= n;
7663 gOff += n;
7668 /* Outputs: memory that we write or modify. Same comments about
7669 endianness as above apply. */
7670 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7671 toDo = d->mSize;
7672 /* chew off 32-bit chunks */
7673 while (toDo >= 4) {
7674 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7675 d->guard );
7676 toDo -= 4;
7678 /* handle possible 16-bit excess */
7679 while (toDo >= 2) {
7680 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7681 d->guard );
7682 toDo -= 2;
7684 /* chew off the remaining 8-bit chunk, if any */
7685 if (toDo == 1) {
7686 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7687 d->guard );
7688 toDo -= 1;
7690 tl_assert(toDo == 0);
7695 /* Generate IR for origin shadowing for a general guarded store. */
7696 static void do_origins_Store_guarded ( MCEnv* mce,
7697 IREndness stEnd,
7698 IRExpr* stAddr,
7699 IRExpr* stData,
7700 IRExpr* guard )
7702 Int dszB;
7703 IRAtom* dataB;
7704 /* assert that the B value for the address is already available
7705 (somewhere), since the call to schemeE will want to see it.
7706 XXXX how does this actually ensure that?? */
7707 tl_assert(isIRAtom(stAddr));
7708 tl_assert(isIRAtom(stData));
7709 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7710 dataB = schemeE( mce, stData );
7711 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7715 /* Generate IR for origin shadowing for a plain store. */
7716 static void do_origins_Store_plain ( MCEnv* mce,
7717 IREndness stEnd,
7718 IRExpr* stAddr,
7719 IRExpr* stData )
7721 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7722 NULL/*guard*/ );
7726 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7728 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7730 do_origins_Store_guarded( mce, sg->end, sg->addr,
7731 sg->data, sg->guard );
7734 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7736 IRType loadedTy = Ity_INVALID;
7737 switch (lg->cvt) {
7738 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7739 case ILGop_Ident64: loadedTy = Ity_I64; break;
7740 case ILGop_Ident32: loadedTy = Ity_I32; break;
7741 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7742 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7743 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7744 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7745 default: VG_(tool_panic)("schemeS.IRLoadG");
7747 IRAtom* ori_alt
7748 = schemeE( mce,lg->alt );
7749 IRAtom* ori_final
7750 = expr2ori_Load_guarded_General(mce, loadedTy,
7751 lg->addr, 0/*addr bias*/,
7752 lg->guard, ori_alt );
7753 /* And finally, bind the origin to the destination temporary. */
7754 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7758 static void schemeS ( MCEnv* mce, IRStmt* st )
7760 tl_assert(MC_(clo_mc_level) == 3);
7762 switch (st->tag) {
7764 case Ist_AbiHint:
7765 /* The value-check instrumenter handles this - by arranging
7766 to pass the address of the next instruction to
7767 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7768 happen for origin tracking w.r.t. AbiHints. So there is
7769 nothing to do here. */
7770 break;
7772 case Ist_PutI: {
7773 IRPutI *puti = st->Ist.PutI.details;
7774 IRRegArray* descr_b;
7775 IRAtom *t1, *t2, *t3, *t4;
7776 IRRegArray* descr = puti->descr;
7777 IRType equivIntTy
7778 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7779 /* If this array is unshadowable for whatever reason,
7780 generate no code. */
7781 if (equivIntTy == Ity_INVALID)
7782 break;
7783 tl_assert(sizeofIRType(equivIntTy) >= 4);
7784 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7785 descr_b
7786 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7787 equivIntTy, descr->nElems );
7788 /* Compute a value to Put - the conjoinment of the origin for
7789 the data to be Put-ted (obviously) and of the index value
7790 (not so obviously). */
7791 t1 = schemeE( mce, puti->data );
7792 t2 = schemeE( mce, puti->ix );
7793 t3 = gen_maxU32( mce, t1, t2 );
7794 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7795 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7796 puti->bias, t4) ));
7797 break;
7800 case Ist_Dirty:
7801 do_origins_Dirty( mce, st->Ist.Dirty.details );
7802 break;
7804 case Ist_Store:
7805 do_origins_Store_plain( mce, st->Ist.Store.end,
7806 st->Ist.Store.addr,
7807 st->Ist.Store.data );
7808 break;
7810 case Ist_StoreG:
7811 do_origins_StoreG( mce, st->Ist.StoreG.details );
7812 break;
7814 case Ist_LoadG:
7815 do_origins_LoadG( mce, st->Ist.LoadG.details );
7816 break;
7818 case Ist_LLSC: {
7819 /* In short: treat a load-linked like a normal load followed
7820 by an assignment of the loaded (shadow) data to the result
7821 temporary. Treat a store-conditional like a normal store,
7822 and mark the result temporary as defined. */
7823 if (st->Ist.LLSC.storedata == NULL) {
7824 /* Load Linked */
7825 IRType resTy
7826 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7827 IRExpr* vanillaLoad
7828 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7829 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7830 || resTy == Ity_I16 || resTy == Ity_I8);
7831 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7832 schemeE(mce, vanillaLoad));
7833 } else {
7834 /* Store conditional */
7835 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7836 st->Ist.LLSC.addr,
7837 st->Ist.LLSC.storedata );
7838 /* For the rationale behind this, see comments at the
7839 place where the V-shadow for .result is constructed, in
7840 do_shadow_LLSC. In short, we regard .result as
7841 always-defined. */
7842 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7843 mkU32(0) );
7845 break;
7848 case Ist_Put: {
7849 Int b_offset
7850 = MC_(get_otrack_shadow_offset)(
7851 st->Ist.Put.offset,
7852 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7854 if (b_offset >= 0) {
7855 /* FIXME: this isn't an atom! */
7856 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7857 schemeE( mce, st->Ist.Put.data )) );
7859 break;
7862 case Ist_WrTmp:
7863 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7864 schemeE(mce, st->Ist.WrTmp.data) );
7865 break;
7867 case Ist_MBE:
7868 case Ist_NoOp:
7869 case Ist_Exit:
7870 case Ist_IMark:
7871 break;
7873 default:
7874 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7875 ppIRStmt(st);
7876 VG_(tool_panic)("memcheck:schemeS");
7881 /*------------------------------------------------------------*/
7882 /*--- Post-tree-build final tidying ---*/
7883 /*------------------------------------------------------------*/
7885 /* This exploits the observation that Memcheck often produces
7886 repeated conditional calls of the form
7888 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7890 with the same guard expression G guarding the same helper call.
7891 The second and subsequent calls are redundant. This usually
7892 results from instrumentation of guest code containing multiple
7893 memory references at different constant offsets from the same base
7894 register. After optimisation of the instrumentation, you get a
7895 test for the definedness of the base register for each memory
7896 reference, which is kinda pointless. MC_(final_tidy) therefore
7897 looks for such repeated calls and removes all but the first. */
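/* For instance (a sketch only), an instrumented block containing

      DIRTY t5 ::: MC_(helperc_value_check4_fail_no_o)()
      ...
      DIRTY t5 ::: MC_(helperc_value_check4_fail_no_o)()

   where both calls are guarded by the same t5 has the second call turned
   into an IR NoOp: if the first one didn't fire, the second can't either,
   so nothing is lost by dropping it. */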
7900 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7901 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7902 get almost all the benefits of this transformation whilst causing
7903 the slide-back case to happen just often enough to be verifiably
7904 correct. For posterity, the numbers are:
7906 bz2-32
7908 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7909 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7910 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7911 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7912 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7913 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7914 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7915 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7916 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7917 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7918 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7919 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7920 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7922 bz2-64
7924 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7925 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7926 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7927 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7928 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7929 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7930 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7931 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7932 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7933 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7934 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7935 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7936 inf 4,113 (107,329 -> 1,800,827; ratio 16.8) */
7939 /* Structs for recording which (helper, guard) pairs we have already
7940 seen. */
7942 #define N_TIDYING_PAIRS 16
7944 typedef
7945 struct { void* entry; IRExpr* guard; }
7946 Pair;
7948 typedef
7949 struct {
7950 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7951 UInt pairsUsed;
7953 Pairs;
7956 /* Return True if e1 and e2 definitely denote the same value (used to
7957 compare guards). Return False if unknown; False is the safe
7958 answer. Since guest registers and guest memory do not have the
7959 SSA property we must return False if any Gets or Loads appear in
7960 the expression. This implicitly assumes that e1 and e2 have the
7961 same IR type, which is always true here -- the type is Ity_I1. */
7963 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7965 if (e1->tag != e2->tag)
7966 return False;
7967 switch (e1->tag) {
7968 case Iex_Const:
7969 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7970 case Iex_Binop:
7971 return e1->Iex.Binop.op == e2->Iex.Binop.op
7972 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7973 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7974 case Iex_Unop:
7975 return e1->Iex.Unop.op == e2->Iex.Unop.op
7976 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7977 case Iex_RdTmp:
7978 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7979 case Iex_ITE:
7980 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7981 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7982 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7983 case Iex_Qop:
7984 case Iex_Triop:
7985 case Iex_CCall:
7986 /* be lazy. Could define equality for these, but they never
7987 appear to be used. */
7988 return False;
7989 case Iex_Get:
7990 case Iex_GetI:
7991 case Iex_Load:
7992 /* be conservative - these may not give the same value each
7993 time */
7994 return False;
7995 case Iex_Binder:
7996 /* should never see this */
7997 /* fallthrough */
7998 default:
7999 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
8000 ppIRExpr(e1);
8001 VG_(tool_panic)("memcheck:sameIRValue");
8002 return False;
8006 /* See if 'pairs' already has an entry for (entry, guard). Return
8007 True if so. If not, add an entry. */
8009 static
8010 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
8012 UInt i, n = tidyingEnv->pairsUsed;
8013 tl_assert(n <= N_TIDYING_PAIRS);
8014 for (i = 0; i < n; i++) {
8015 if (tidyingEnv->pairs[i].entry == entry
8016 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
8017 return True;
8019 /* (guard, entry) wasn't found in the array. Add it at the end.
8020 If the array is already full, slide the entries one slot
8021 backwards. This means we will lose the ability to detect
8022 duplicates from the pair in slot zero, but that happens so
8023 rarely that it's unlikely to have much effect on overall code
8024 quality. Also, the entry this strategy discards is the oldest
8025 tracked one (memory reference, basically), which is (I'd
8026 guess) the one least likely to be re-used after this point. */
8027 tl_assert(i == n);
8028 if (n == N_TIDYING_PAIRS) {
8029 for (i = 1; i < N_TIDYING_PAIRS; i++) {
8030 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
8032 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
8033 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
8034 } else {
8035 tl_assert(n < N_TIDYING_PAIRS);
8036 tidyingEnv->pairs[n].entry = entry;
8037 tidyingEnv->pairs[n].guard = guard;
8038 n++;
8039 tidyingEnv->pairsUsed = n;
8041 return False;
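/* E.g. with N_TIDYING_PAIRS == 4 and a full array [P0,P1,P2,P3], adding a
   new pair P4 slides everything down one slot, giving [P1,P2,P3,P4]: the
   oldest pair, P0, is forgotten, as per the comment above. */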
8044 static Bool is_helperc_value_checkN_fail ( const HChar* name )
8046 /* This is expensive because it happens a lot. We are checking to
8047 see whether |name| is one of the following 8 strings:
8049 MC_(helperc_value_check8_fail_no_o)
8050 MC_(helperc_value_check4_fail_no_o)
8051 MC_(helperc_value_check0_fail_no_o)
8052 MC_(helperc_value_check1_fail_no_o)
8053 MC_(helperc_value_check8_fail_w_o)
8054 MC_(helperc_value_check0_fail_w_o)
8055 MC_(helperc_value_check1_fail_w_o)
8056 MC_(helperc_value_check4_fail_w_o)
8058 To speed it up, check the common prefix just once, rather than
8059 all 8 times. */
8061 const HChar* prefix = "MC_(helperc_value_check";
8063 HChar n, p;
8064 while (True) {
8065 n = *name;
8066 p = *prefix;
8067 if (p == 0) break; /* ran off the end of the prefix */
8068 /* We still have some prefix to use */
8069 if (n == 0) return False; /* have prefix, but name ran out */
8070 if (n != p) return False; /* have both pfx and name, but no match */
8071 name++;
8072 prefix++;
8075 /* Check the part after the prefix. */
8076 tl_assert(*prefix == 0 && *name != 0);
8077 return 0==VG_(strcmp)(name, "8_fail_no_o)")
8078 || 0==VG_(strcmp)(name, "4_fail_no_o)")
8079 || 0==VG_(strcmp)(name, "0_fail_no_o)")
8080 || 0==VG_(strcmp)(name, "1_fail_no_o)")
8081 || 0==VG_(strcmp)(name, "8_fail_w_o)")
8082 || 0==VG_(strcmp)(name, "4_fail_w_o)")
8083 || 0==VG_(strcmp)(name, "0_fail_w_o)")
8084 || 0==VG_(strcmp)(name, "1_fail_w_o)");
8087 IRSB* MC_(final_tidy) ( IRSB* sb_in )
8089 Int i;
8090 IRStmt* st;
8091 IRDirty* di;
8092 IRExpr* guard;
8093 IRCallee* cee;
8094 Bool alreadyPresent;
8095 Pairs pairs;
8097 pairs.pairsUsed = 0;
8099 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
8100 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
8102 /* Scan forwards through the statements. Each time a call to one
8103 of the relevant helpers is seen, check if we have made a
8104 previous call to the same helper using the same guard
8105 expression, and if so, delete the call. */
8106 for (i = 0; i < sb_in->stmts_used; i++) {
8107 st = sb_in->stmts[i];
8108 tl_assert(st);
8109 if (st->tag != Ist_Dirty)
8110 continue;
8111 di = st->Ist.Dirty.details;
8112 guard = di->guard;
8113 tl_assert(guard);
8114 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
8115 cee = di->cee;
8116 if (!is_helperc_value_checkN_fail( cee->name ))
8117 continue;
8118 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8119 guard 'guard'. Check if we have already seen a call to this
8120 function with the same guard. If so, delete it. If not,
8121 add it to the set of calls we do know about. */
8122 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8123 if (alreadyPresent) {
8124 sb_in->stmts[i] = IRStmt_NoOp();
8125 if (0) VG_(printf)("XX\n");
8129 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8130 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8132 return sb_in;
8135 #undef N_TIDYING_PAIRS
8138 /*------------------------------------------------------------*/
8139 /*--- Startup assertion checking ---*/
8140 /*------------------------------------------------------------*/
8142 void MC_(do_instrumentation_startup_checks)( void )
8144 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8145 is working as we expect. */
8147 # define CHECK(_expected, _string) \
8148 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8150 /* It should identify these 8, and no others, as targets. */
8151 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8152 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8153 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8154 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8155 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8156 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8157 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8158 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8160 /* Ad-hoc selection of other strings gathered via a quick test. */
8161 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8162 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8163 CHECK(False, "MC_(helperc_b_load1)");
8164 CHECK(False, "MC_(helperc_b_load2)");
8165 CHECK(False, "MC_(helperc_b_load4)");
8166 CHECK(False, "MC_(helperc_b_load8)");
8167 CHECK(False, "MC_(helperc_b_load16)");
8168 CHECK(False, "MC_(helperc_b_load32)");
8169 CHECK(False, "MC_(helperc_b_store1)");
8170 CHECK(False, "MC_(helperc_b_store2)");
8171 CHECK(False, "MC_(helperc_b_store4)");
8172 CHECK(False, "MC_(helperc_b_store8)");
8173 CHECK(False, "MC_(helperc_b_store16)");
8174 CHECK(False, "MC_(helperc_b_store32)");
8175 CHECK(False, "MC_(helperc_LOADV8)");
8176 CHECK(False, "MC_(helperc_LOADV16le)");
8177 CHECK(False, "MC_(helperc_LOADV32le)");
8178 CHECK(False, "MC_(helperc_LOADV64le)");
8179 CHECK(False, "MC_(helperc_LOADV128le)");
8180 CHECK(False, "MC_(helperc_LOADV256le)");
8181 CHECK(False, "MC_(helperc_STOREV16le)");
8182 CHECK(False, "MC_(helperc_STOREV32le)");
8183 CHECK(False, "MC_(helperc_STOREV64le)");
8184 CHECK(False, "MC_(helperc_STOREV8)");
8185 CHECK(False, "track_die_mem_stack_8");
8186 CHECK(False, "track_new_mem_stack_8_w_ECU");
8187 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8188 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8190 # undef CHECK
8194 /*------------------------------------------------------------*/
8195 /*--- Memcheck main ---*/
8196 /*------------------------------------------------------------*/
8198 static Bool isBogusAtom ( IRAtom* at )
8200 if (at->tag == Iex_RdTmp)
8201 return False;
8202 tl_assert(at->tag == Iex_Const);
8204 ULong n = 0;
8205 IRConst* con = at->Iex.Const.con;
8206 switch (con->tag) {
8207 case Ico_U1: return False;
8208 case Ico_U8: n = (ULong)con->Ico.U8; break;
8209 case Ico_U16: n = (ULong)con->Ico.U16; break;
8210 case Ico_U32: n = (ULong)con->Ico.U32; break;
8211 case Ico_U64: n = (ULong)con->Ico.U64; break;
8212 case Ico_F32: return False;
8213 case Ico_F64: return False;
8214 case Ico_F32i: return False;
8215 case Ico_F64i: return False;
8216 case Ico_V128: return False;
8217 case Ico_V256: return False;
8218 default: ppIRExpr(at); tl_assert(0);
8220 /* VG_(printf)("%llx\n", n); */
8221 /* Shortcuts */
8222 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8223 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8224 /* The list of bogus atoms is: */
8225 return (/*32*/ n == 0xFEFEFEFFULL
8226 /*32*/ || n == 0x80808080ULL
8227 /*32*/ || n == 0x7F7F7F7FULL
8228 /*32*/ || n == 0x7EFEFEFFULL
8229 /*32*/ || n == 0x81010100ULL
8230 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8231 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8232 /*64*/ || n == 0x0000000000008080ULL
8233 /*64*/ || n == 0x8080808080808080ULL
8234 /*64*/ || n == 0x0101010101010101ULL
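/* Examples of the intended classification (illustrative only):

      isBogusAtom(mkU32(42))                      --> False (via the shortcuts)
      isBogusAtom(mkU32(0x80808080))              --> True
      isBogusAtom(mkU64(0x8080808080808080ULL))   --> True

   Under the default --expensive-definedness-checks=auto setting, a True
   result for any atom in the block pushes the whole block over to the
   expensive add/sub instrumentation (see containsBogusLiterals and its use
   in MC_(instrument) below). */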
8239 /* Does 'st' mention any of the literals identified/listed in
8240 isBogusAtom()? */
8241 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8243 Int i;
8244 IRExpr* e;
8245 IRDirty* d;
8246 IRCAS* cas;
8247 switch (st->tag) {
8248 case Ist_WrTmp:
8249 e = st->Ist.WrTmp.data;
8250 switch (e->tag) {
8251 case Iex_Get:
8252 case Iex_RdTmp:
8253 return False;
8254 case Iex_Const:
8255 return isBogusAtom(e);
8256 case Iex_Unop:
8257 return isBogusAtom(e->Iex.Unop.arg)
8258 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8259 case Iex_GetI:
8260 return isBogusAtom(e->Iex.GetI.ix);
8261 case Iex_Binop:
8262 return isBogusAtom(e->Iex.Binop.arg1)
8263 || isBogusAtom(e->Iex.Binop.arg2);
8264 case Iex_Triop:
8265 return isBogusAtom(e->Iex.Triop.details->arg1)
8266 || isBogusAtom(e->Iex.Triop.details->arg2)
8267 || isBogusAtom(e->Iex.Triop.details->arg3);
8268 case Iex_Qop:
8269 return isBogusAtom(e->Iex.Qop.details->arg1)
8270 || isBogusAtom(e->Iex.Qop.details->arg2)
8271 || isBogusAtom(e->Iex.Qop.details->arg3)
8272 || isBogusAtom(e->Iex.Qop.details->arg4);
8273 case Iex_ITE:
8274 return isBogusAtom(e->Iex.ITE.cond)
8275 || isBogusAtom(e->Iex.ITE.iftrue)
8276 || isBogusAtom(e->Iex.ITE.iffalse);
8277 case Iex_Load:
8278 return isBogusAtom(e->Iex.Load.addr);
8279 case Iex_CCall:
8280 for (i = 0; e->Iex.CCall.args[i]; i++)
8281 if (isBogusAtom(e->Iex.CCall.args[i]))
8282 return True;
8283 return False;
8284 default:
8285 goto unhandled;
8287 case Ist_Dirty:
8288 d = st->Ist.Dirty.details;
8289 for (i = 0; d->args[i]; i++) {
8290 IRAtom* atom = d->args[i];
8291 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8292 if (isBogusAtom(atom))
8293 return True;
8296 if (isBogusAtom(d->guard))
8297 return True;
8298 if (d->mAddr && isBogusAtom(d->mAddr))
8299 return True;
8300 return False;
8301 case Ist_Put:
8302 return isBogusAtom(st->Ist.Put.data);
8303 case Ist_PutI:
8304 return isBogusAtom(st->Ist.PutI.details->ix)
8305 || isBogusAtom(st->Ist.PutI.details->data);
8306 case Ist_Store:
8307 return isBogusAtom(st->Ist.Store.addr)
8308 || isBogusAtom(st->Ist.Store.data);
8309 case Ist_StoreG: {
8310 IRStoreG* sg = st->Ist.StoreG.details;
8311 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8312 || isBogusAtom(sg->guard);
8314 case Ist_LoadG: {
8315 IRLoadG* lg = st->Ist.LoadG.details;
8316 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8317 || isBogusAtom(lg->guard);
8319 case Ist_Exit:
8320 return isBogusAtom(st->Ist.Exit.guard);
8321 case Ist_AbiHint:
8322 return isBogusAtom(st->Ist.AbiHint.base)
8323 || isBogusAtom(st->Ist.AbiHint.nia);
8324 case Ist_NoOp:
8325 case Ist_IMark:
8326 case Ist_MBE:
8327 return False;
8328 case Ist_CAS:
8329 cas = st->Ist.CAS.details;
8330 return isBogusAtom(cas->addr)
8331 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8332 || isBogusAtom(cas->expdLo)
8333 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8334 || isBogusAtom(cas->dataLo);
8335 case Ist_LLSC:
8336 return isBogusAtom(st->Ist.LLSC.addr)
8337 || (st->Ist.LLSC.storedata
8338 ? isBogusAtom(st->Ist.LLSC.storedata)
8339 : False);
8340 default:
8341 unhandled:
8342 ppIRStmt(st);
8343 VG_(tool_panic)("hasBogusLiterals");
8348 /* This is the pre-instrumentation analysis. It does a backwards pass over
8349 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8350 the block.
8352 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8353 as a positive result from that is a strong indication that we need to
8354 expensively instrument add/sub in the block. We do both analyses in one
8355 pass, even though they are independent, so as to avoid the overhead of
8356 having to traverse the whole block twice.
8358 The usage pass proceeds as follows. Let max= be the max operation in the
8359 HowUsed lattice, hence
8361 X max= Y means X = max(X, Y)
8363 then
8365 for t in original tmps . useEnv[t] = HuUnU
8367 for t used in the block's . next field
8368 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8370 for st iterating *backwards* in the block
8372 match st
8374 case "t1 = load(t2)" // case 1
8375 useEnv[t2] max= HuPCa
8377 case "t1 = add(t2, t3)" // case 2
8378 useEnv[t2] max= useEnv[t1]
8379 useEnv[t3] max= useEnv[t1]
8381 other
8382 for t in st.usedTmps // case 3
8383 useEnv[t] max= HuOth
8384 // same as useEnv[t] = HuOth
8386 The general idea is that we accumulate, in useEnv[], information about
8387 how each tmp is used. That can be updated as we work further back
8388 through the block and find more uses of it, but its HowUsed value can
8389 only ascend the lattice, not descend.
8391 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8392 be used as a memory address, then its use is at least HuPCa. The point
8393 is that for a memory address we will add instrumentation to check if any
8394 bit of the address is undefined, which means that we won't need expensive
8395 V-bit propagation through an add expression that computed the address --
8396 cheap add instrumentation will be equivalent.
8398 Note in case (1) that if we have previously seen a non-memory-address use
8399 of the tmp, then its use will already be HuOth and will be unchanged by
8400 the max= operation. And if it turns out that the source of the tmp was
8401 an add, then we'll have to expensively instrument the add, because we
8402 can't prove that, for the previous non-memory-address use of the tmp,
8403 cheap and expensive instrumentation will be equivalent.
8405 In case 2, we propagate the usage-mode of the result of an add back
8406 through to its operands. Again, we use max= so as to take account of the
8407 fact that t2 or t3 might later in the block (viz, earlier in the
8408 iteration) have been used in a way that requires expensive add
8409 instrumentation.
8411 In case 3, we deal with all other tmp uses. We assume that we'll need a
8412 result that is as accurate as possible, so we max= HuOth into its use
8413 mode. Since HuOth is the top of the lattice, that's equivalent to just
8414 setting its use to HuOth.
8416 The net result of all this is that:
8418 tmps that are used either
8419 - only as a memory address, or
8420 - only as part of a tree of adds that computes a memory address,
8421 and has no other use
8422 are marked as HuPCa, and so we can instrument their generating Add
8423 nodes cheaply, which is the whole point of this analysis
8425 tmps that are used any other way at all are marked as HuOth
8427 tmps that are unused are marked as HuUnU. We don't expect to see any
8428 since we expect that the incoming IR has had all dead assignments
8429 removed by previous optimisation passes. Nevertheless the analysis is
8430 correct even in the presence of dead tmps.
8432 A final comment on dead tmps. In case 1 and case 2, we could actually
8433 conditionalise the updates thusly:
8435 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8437 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8438 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8440 In other words, if the assigned-to tmp |t1| is never used, then there's
8441 no point in propagating any use through to its operands. That won't
8442 change the final HuPCa-vs-HuOth results, which is what we care about.
8443 Given that we expect to get dead-code-free inputs, there's no point in
8444 adding this extra refinement. */
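/* A small worked example of the backwards pass, on the (flat, sketched)
   block

      t3 = Add64(t1, t2)
      t4 = Add64(t3, t9)
      STle(t4) = t7
      PUT(128) = t3

   Working backwards: the PUT marks t3 as HuOth; the store marks t4 as HuPCa
   (address) and t7 as HuOth (data); t4's Add then propagates HuPCa into t3
   and t9 -- but t3 is already HuOth, so it stays HuOth; finally t3's Add
   propagates HuOth into t1 and t2. Net result: t4 and t9 are HuPCa (cheap
   add instrumentation suffices for them), everything else is HuOth. */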
8447 /* Helper for |preInstrumentationAnalysis|. */
8448 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8449 UInt tyenvUsed,
8450 HowUsed newUse, IRAtom* at )
8452 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8453 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8454 use info. */
8455 switch (at->tag) {
8456 case Iex_GSPTR:
8457 case Iex_VECRET:
8458 case Iex_Const:
8459 return;
8460 case Iex_RdTmp: {
8461 IRTemp t = at->Iex.RdTmp.tmp;
8462 tl_assert(t < tyenvUsed); // "is an original tmp"
8463 // The "max" operation in the lattice
8464 if (newUse > useEnv[t]) useEnv[t] = newUse;
8465 return;
8467 default:
8468 // We should never get here -- it implies non-flat IR
8469 ppIRExpr(at);
8470 VG_(tool_panic)("noteTmpUsesIn");
8472 /*NOTREACHED*/
8473 tl_assert(0);
8477 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8478 /*OUT*/Bool* hasBogusLiteralsP,
8479 const IRSB* sb_in )
8481 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8483 // We've seen no bogus literals so far.
8484 Bool bogus = False;
8486 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8487 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8488 nOrigTmps, sizeof(HowUsed));
8490 // Firstly, roll in contributions from the final dst address.
8491 bogus = isBogusAtom(sb_in->next);
8492 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8494 // Now work backwards through the stmts.
8495 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8496 IRStmt* st = sb_in->stmts[i];
8498 // Deal with literals.
8499 if (LIKELY(!bogus)) {
8500 bogus = containsBogusLiterals(st);
8503 // Deal with tmp uses.
8504 switch (st->tag) {
8505 case Ist_WrTmp: {
8506 IRTemp dst = st->Ist.WrTmp.tmp;
8507 IRExpr* rhs = st->Ist.WrTmp.data;
8508 // This is the one place where we have to consider all possible
8509 // tags for |rhs|, and can't just assume it is a tmp or a const.
8510 switch (rhs->tag) {
8511 case Iex_RdTmp:
8512 // just propagate demand for |dst| into this tmp use.
8513 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8514 break;
8515 case Iex_Unop:
8516 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8517 break;
8518 case Iex_Binop:
8519 if (rhs->Iex.Binop.op == Iop_Add64
8520 || rhs->Iex.Binop.op == Iop_Add32) {
8521 // propagate demand for |dst| through to the operands.
8522 noteTmpUsesIn(useEnv, nOrigTmps,
8523 useEnv[dst], rhs->Iex.Binop.arg1);
8524 noteTmpUsesIn(useEnv, nOrigTmps,
8525 useEnv[dst], rhs->Iex.Binop.arg2);
8526 } else {
8527 // just say that the operands are used in some unknown way.
8528 noteTmpUsesIn(useEnv, nOrigTmps,
8529 HuOth, rhs->Iex.Binop.arg1);
8530 noteTmpUsesIn(useEnv, nOrigTmps,
8531 HuOth, rhs->Iex.Binop.arg2);
8533 break;
8534 case Iex_Triop: {
8535 // All operands are used in some unknown way.
8536 IRTriop* tri = rhs->Iex.Triop.details;
8537 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8538 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8539 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8540 break;
8542 case Iex_Qop: {
8543 // All operands are used in some unknown way.
8544 IRQop* qop = rhs->Iex.Qop.details;
8545 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8546 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8547 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8548 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8549 break;
8551 case Iex_Load:
8552 // The address will be checked (== PCasted).
8553 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8554 break;
8555 case Iex_ITE:
8556 // The condition is PCasted, the then- and else-values
8557 // aren't.
8558 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8559 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8560 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8561 break;
8562 case Iex_CCall:
8563 // The args are used in unknown ways.
8564 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8565 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8567 break;
8568 case Iex_GetI: {
8569 // The index will be checked/PCasted (see do_shadow_GETI)
8570 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8571 break;
8573 case Iex_Const:
8574 case Iex_Get:
8575 break;
8576 default:
8577 ppIRExpr(rhs);
8578 VG_(tool_panic)("preInstrumentationAnalysis:"
8579 " unhandled IRExpr");
8581 break;
8583 case Ist_Store:
8584 // The address will be checked (== PCasted). The data will be
8585 // used in some unknown way.
8586 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8587 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8588 break;
8589 case Ist_Exit:
8590 // The guard will be checked (== PCasted)
8591 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8592 break;
8593 case Ist_Put:
8594 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8595 break;
8596 case Ist_PutI: {
8597 IRPutI* putI = st->Ist.PutI.details;
8598 // The index will be checked/PCasted (see do_shadow_PUTI). The
8599 // data will be used in an unknown way.
8600 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8601 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8602 break;
8604 case Ist_Dirty: {
8605 IRDirty* d = st->Ist.Dirty.details;
8606 // The guard will be checked (== PCasted)
8607 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8608 // The args will be used in unknown ways.
8609 for (IRExpr** args = d->args; *args; args++) {
8610 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8612 break;
8614 case Ist_CAS: {
8615 IRCAS* cas = st->Ist.CAS.details;
8616 // Address will be pcasted, everything else used as unknown
8617 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8618 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8619 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8620 if (cas->expdHi)
8621 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8622 if (cas->dataHi)
8623 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8624 break;
8626 case Ist_AbiHint:
8627 // Both exprs are used in unknown ways. TODO: can we safely
8628 // just ignore AbiHints?
8629 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8630 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8631 break;
8632 case Ist_StoreG: {
8633 // We might be able to do better, and use HuPCa for the addr.
8634 // It's not immediately obvious that we can, because the address
8635 // is regarded as "used" only when the guard is true.
8636 IRStoreG* sg = st->Ist.StoreG.details;
8637 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8638 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8639 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8640 break;
8642 case Ist_LoadG: {
8643 // Per similar comments to Ist_StoreG .. not sure whether this
8644 // is really optimal.
8645 IRLoadG* lg = st->Ist.LoadG.details;
8646 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8647 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8648 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8649 break;
8651 case Ist_LLSC: {
8652 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8653 if (st->Ist.LLSC.storedata)
8654 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8655 break;
8657 case Ist_MBE:
8658 case Ist_IMark:
8659 case Ist_NoOp:
8660 break;
8661 default: {
8662 ppIRStmt(st);
8663 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8666 } // Now work backwards through the stmts.
8668 // Return the computed use env and the bogus-atom flag.
8669 tl_assert(*useEnvP == NULL);
8670 *useEnvP = useEnv;
8672 tl_assert(*hasBogusLiteralsP == False);
8673 *hasBogusLiteralsP = bogus;
8677 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8678 IRSB* sb_in,
8679 const VexGuestLayout* layout,
8680 const VexGuestExtents* vge,
8681 const VexArchInfo* archinfo_host,
8682 IRType gWordTy, IRType hWordTy )
8684 Bool verboze = 0||False;
8685 Int i, j, first_stmt;
8686 IRStmt* st;
8687 MCEnv mce;
8688 IRSB* sb_out;
8690 if (gWordTy != hWordTy) {
8691 /* We don't currently support this case. */
8692 VG_(tool_panic)("host/guest word size mismatch");
8695 /* Check we're not completely nuts */
8696 tl_assert(sizeof(UWord) == sizeof(void*));
8697 tl_assert(sizeof(Word) == sizeof(void*));
8698 tl_assert(sizeof(Addr) == sizeof(void*));
8699 tl_assert(sizeof(ULong) == 8);
8700 tl_assert(sizeof(Long) == 8);
8701 tl_assert(sizeof(UInt) == 4);
8702 tl_assert(sizeof(Int) == 4);
8704 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8706 /* Set up SB */
8707 sb_out = deepCopyIRSBExceptStmts(sb_in);
8709 /* Set up the running environment. Both .sb and .tmpMap are
8710 modified as we go along. Note that tmps are added to both
8711 .sb->tyenv and .tmpMap together, so the valid index-set for
8712 those two arrays should always be identical. */
8713 VG_(memset)(&mce, 0, sizeof(mce));
8714 mce.sb = sb_out;
8715 mce.trace = verboze;
8716 mce.layout = layout;
8717 mce.hWordTy = hWordTy;
8718 mce.tmpHowUsed = NULL;

   /* BEGIN decide on expense levels for instrumentation. */

   /* Initially, select the cheap version of everything for which we have an
      option. */
   DetailLevelByOp__set_all( &mce.dlbo, DLcheap );

   /* Take account of the --expensive-definedness-checks= flag. */
   if (MC_(clo_expensive_definedness_checks) == EdcNO) {
      /* We just selected 'cheap for everything', so we don't need to do
         anything here.  mce.tmpHowUsed remains NULL. */
   }
   else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
      /* Select 'expensive for everything'.  mce.tmpHowUsed remains NULL. */
      DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
   }
   else {
      tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
      /* We'll make our own selection, based on known per-target constraints
         and also on analysis of the block to be instrumented.  First, set
         up default values for detail levels.

         On x86 and amd64, we'll routinely encounter code optimised by LLVM
         5 and above.  Enable accurate interpretation of the following.
         LLVM uses adds for some bitfield inserts, and we get a lot of false
         errors if the cheap interpretation is used, alas.  Could solve this
         much better if we knew which of such adds came from x86/amd64 LEA
         instructions, since these are the only ones really needing the
         expensive interpretation, but that would require some way to tag
         them in the _toIR.c front ends, which is a lot of faffing around.
         So for now we use preInstrumentationAnalysis() to detect adds which
         are used only to construct memory addresses, which is an
         approximation to the above, and is self-contained. */
#     if defined(VGA_x86)
      mce.dlbo.dl_Add32           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     elif defined(VGA_amd64)
      mce.dlbo.dl_Add32           = DLexpensive;
      mce.dlbo.dl_Add64           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_ppc64le)
      // Needed by (at least) set_AV_CR6() in the front end.
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm64)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     endif
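
      /* For the ops left at DLauto above, the final cheap-vs-expensive
         choice is made per result tmp: the analysis below marks a tmp as
         HuPCa when, as far as it can tell, the tmp is only used in
         PCast-able contexts (in practice, adds whose results only feed
         load/store addresses).  Those tmps can keep the cheap
         interpretation, while the rest fall back to the expensive one. */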

      /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
         fill it in. */
      Bool hasBogusLiterals = False;
      preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );

      if (hasBogusLiterals) {
         /* This happens very rarely.  In this case just select expensive
            for everything, and throw away the tmp-use analysis results. */
         DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
         VG_(free)( mce.tmpHowUsed );
         mce.tmpHowUsed = NULL;
      } else {
         /* Nothing.  mce.tmpHowUsed contains tmp-use analysis results,
            which will be used for some subset of Iop_{Add,Sub}{32,64},
            based on which ones are set to DLauto for this target. */
      }
   }

   DetailLevelByOp__check_sanity( &mce.dlbo );

   if (0) {
      // Debug printing: which tmps have been identified as PCast-only use
      if (mce.tmpHowUsed) {
         VG_(printf)("Cheapies: ");
         for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
            if (mce.tmpHowUsed[q] == HuPCa) {
               VG_(printf)("t%u ", q);
            }
         }
         VG_(printf)("\n");
      }

      // Debug printing: number of ops by detail level
      UChar nCheap     = DetailLevelByOp__count( &mce.dlbo, DLcheap );
      UChar nAuto      = DetailLevelByOp__count( &mce.dlbo, DLauto );
      UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
      tl_assert(nCheap + nAuto + nExpensive == 8);

      VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
   }

   /* END decide on expense levels for instrumentation. */

   /* Initialise the running tmp environment. */

   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
                            sizeof(TempMapEnt));
   VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   for (i = 0; i < sb_in->tyenv->types_used; i++) {
      TempMapEnt ent;
      ent.kind    = Orig;
      ent.shadowV = IRTemp_INVALID;
      ent.shadowB = IRTemp_INVALID;
      VG_(addToXA)( mce.tmpMap, &ent );
   }
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
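
   /* Note: only the original tmps are entered here.  Their V and B shadow
      slots start out as IRTemp_INVALID; the shadow tmps themselves are
      expected to be created on demand, the first time findShadowTmpV /
      findShadowTmpB is asked for them. */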

   /* Finally, begin instrumentation. */

   /* Copy verbatim any IR preamble preceding the first IMark */

   tl_assert(mce.sb == sb_out);
   tl_assert(mce.sb != sb_in);

   i = 0;
   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {

      st = sb_in->stmts[i];
      tl_assert(st);
      tl_assert(isFlatIRStmt(st));

      stmt( 'C', &mce, sb_in->stmts[i] );
      i++;
   }

   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
      cause the IR following the preamble to contain references to IR
      temporaries defined in the preamble.  Because the preamble isn't
      instrumented, these temporaries don't have any shadows.
      Nevertheless uses of them following the preamble will cause
      memcheck to generate references to their shadows.  End effect is
      to cause IR sanity check failures, due to references to
      non-existent shadows.  This is only evident for the complex
      preambles used for function wrapping on TOC-afflicted platforms
      (ppc64-linux).

      The following loop therefore scans the preamble looking for
      assignments to temporaries.  For each one found it creates an
      assignment to the corresponding (V) shadow temp, marking it as
      'defined'.  This is the same resulting IR as if the main
      instrumentation loop below had been applied to the statement
      'tmp = CONSTANT'.

      Similarly, if origin tracking is enabled, we must generate an
      assignment for the corresponding origin (B) shadow, claiming
      no-origin, as appropriate for a defined value.
   */
   for (j = 0; j < i; j++) {
      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
         /* findShadowTmpV checks its arg is an original tmp;
            no need to assert that here. */
         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
         if (MC_(clo_mc_level) == 3) {
            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
         }
         if (0) {
            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
            ppIRType( ty_v );
            VG_(printf)("\n");
         }
      }
   }
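
   /* For example, a preamble assignment 't5 = <expr>' where t5 has type
      Ity_I64 gets shadow assignments roughly like
         t5's V shadow = 0x0:I64   (all V bits zero, i.e. fully defined)
         t5's B shadow = 0x0:I32   (no/unknown origin, level 3 only)
      since definedOfType() supplies the all-defined constant for the
      tmp's type. */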

   /* Iterate over the remaining stmts to generate instrumentation. */

   tl_assert(sb_in->stmts_used > 0);
   tl_assert(i >= 0);
   tl_assert(i < sb_in->stmts_used);
   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);

   for (/* use current i*/; i < sb_in->stmts_used; i++) {

      st = sb_in->stmts[i];
      first_stmt = sb_out->stmts_used;

      if (verboze) {
         VG_(printf)("\n");
         ppIRStmt(st);
         VG_(printf)("\n");
      }

      if (MC_(clo_mc_level) == 3) {
         /* See comments on case Ist_CAS below. */
         if (st->tag != Ist_CAS)
            schemeS( &mce, st );
      }

      /* Generate instrumentation code for each stmt ... */

      switch (st->tag) {

         case Ist_WrTmp: {
            IRTemp  dst = st->Ist.WrTmp.tmp;
            tl_assert(dst < (UInt)sb_in->tyenv->types_used);
            HowUsed hu  = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
                                         : HuOth/*we don't know, so play safe*/;
            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
                               expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
            break;
         }
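
         /* The HowUsed value is passed down so that expr2vbits can pick
            the cheap or expensive interpretation for ops left at DLauto
            (see the Iop_{Add,Sub}{32,64} discussion above). */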

         case Ist_Put:
            do_shadow_PUT( &mce,
                           st->Ist.Put.offset,
                           st->Ist.Put.data,
                           NULL /* shadow atom */, NULL /* guard */ );
            break;

         case Ist_PutI:
            do_shadow_PUTI( &mce, st->Ist.PutI.details);
            break;

         case Ist_Store:
            do_shadow_Store( &mce, st->Ist.Store.end,
                                   st->Ist.Store.addr, 0/* addr bias */,
                                   st->Ist.Store.data,
                                   NULL /* shadow data */,
                                   NULL/*guard*/ );
            break;

         case Ist_StoreG:
            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
            break;

         case Ist_LoadG:
            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
            break;

         case Ist_Exit:
            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
            break;

         case Ist_IMark:
            break;

         case Ist_NoOp:
         case Ist_MBE:
            break;

         case Ist_Dirty:
            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
            break;

         case Ist_AbiHint:
            do_AbiHint( &mce, st->Ist.AbiHint.base,
                              st->Ist.AbiHint.len,
                              st->Ist.AbiHint.nia );
            break;

         case Ist_CAS:
            do_shadow_CAS( &mce, st->Ist.CAS.details );
            /* Note, do_shadow_CAS copies the CAS itself to the output
               block, because it needs to add instrumentation both
               before and after it.  Hence skip the copy below.  Also
               skip the origin-tracking stuff (call to schemeS) above,
               since that's all tangled up with it too; do_shadow_CAS
               does it all. */
            break;

         case Ist_LLSC:
            do_shadow_LLSC( &mce,
                            st->Ist.LLSC.end,
                            st->Ist.LLSC.result,
                            st->Ist.LLSC.addr,
                            st->Ist.LLSC.storedata );
            break;

         default:
            VG_(printf)("\n");
            ppIRStmt(st);
            VG_(printf)("\n");
            VG_(tool_panic)("memcheck: unhandled IRStmt");

      } /* switch (st->tag) */

      if (0 && verboze) {
         for (j = first_stmt; j < sb_out->stmts_used; j++) {
            VG_(printf)(" ");
            ppIRStmt(sb_out->stmts[j]);
            VG_(printf)("\n");
         }
         VG_(printf)("\n");
      }

      /* ... and finally copy the stmt itself to the output.  Except,
         skip the copy of IRCASs; see comments on case Ist_CAS
         above. */
      if (st->tag != Ist_CAS)
         stmt('C', &mce, st);
   }
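
   /* Note the ordering in the loop above: for each guest statement the
      checks and shadow updates are emitted first and the original
      statement is appended last, so any complaint about undefined inputs
      is raised before the statement's effects take place (Ist_CAS being
      the exception; see above). */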

   /* Now we need to complain if the jump target is undefined. */
   first_stmt = sb_out->stmts_used;

   if (verboze) {
      VG_(printf)("sb_in->next = ");
      ppIRExpr(sb_in->next);
      VG_(printf)("\n\n");
   }

   complainIfUndefined( &mce, sb_in->next, NULL );

   if (0 && verboze) {
      for (j = first_stmt; j < sb_out->stmts_used; j++) {
         VG_(printf)(" ");
         ppIRStmt(sb_out->stmts[j]);
         VG_(printf)("\n");
      }
      VG_(printf)("\n");
   }

   /* If this fails, there's been some serious snafu with tmp management,
      that should be investigated. */
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   VG_(deleteXA)( mce.tmpMap );

   if (mce.tmpHowUsed) {
      VG_(free)( mce.tmpHowUsed );
   }

   tl_assert(mce.sb == sb_out);
   return sb_out;
}


/*--------------------------------------------------------------------*/
/*--- end                                           mc_translate.c ---*/
/*--------------------------------------------------------------------*/