2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
139 // See below for comments explaining what this is for.
141 enum __attribute__((packed
)) { HuUnU
=0, HuPCa
=1, HuOth
=2 }
144 static IRType
shadowTypeV ( IRType ty
);
145 static IRExpr
* expr2vbits ( struct _MCEnv
* mce
, IRExpr
* e
,
146 HowUsed hu
/*use HuOth if unknown*/ );
147 static IRTemp
findShadowTmpB ( struct _MCEnv
* mce
, IRTemp orig
);
149 static IRExpr
*i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
161 // Use the cheaper, less-exact variant.
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
167 // Use the more expensive, more-exact variant.
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32
;
182 DetailLevel dl_Add64
;
183 DetailLevel dl_Sub32
;
184 DetailLevel dl_Sub64
;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
187 DetailLevel dl_CmpEQ64_CmpNE64
;
188 DetailLevel dl_CmpEQ32_CmpNE32
;
189 DetailLevel dl_CmpEQ16_CmpNE16
;
190 DetailLevel dl_CmpEQ8_CmpNE8
;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp
* dlbo
,
201 dlbo
->dl_CmpEQ64_CmpNE64
= dl
;
202 dlbo
->dl_CmpEQ32_CmpNE32
= dl
;
203 dlbo
->dl_CmpEQ16_CmpNE16
= dl
;
204 dlbo
->dl_CmpEQ8_CmpNE8
= dl
;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp
* dlbo
)
209 tl_assert(dlbo
->dl_Add32
>= DLcheap
&& dlbo
->dl_Add32
<= DLexpensive
);
210 tl_assert(dlbo
->dl_Add64
>= DLcheap
&& dlbo
->dl_Add64
<= DLexpensive
);
211 tl_assert(dlbo
->dl_Sub32
>= DLcheap
&& dlbo
->dl_Sub32
<= DLexpensive
);
212 tl_assert(dlbo
->dl_Sub64
>= DLcheap
&& dlbo
->dl_Sub64
<= DLexpensive
);
213 tl_assert(dlbo
->dl_CmpEQ64_CmpNE64
== DLcheap
214 || dlbo
->dl_CmpEQ64_CmpNE64
== DLexpensive
);
215 tl_assert(dlbo
->dl_CmpEQ32_CmpNE32
== DLcheap
216 || dlbo
->dl_CmpEQ32_CmpNE32
== DLexpensive
);
217 tl_assert(dlbo
->dl_CmpEQ16_CmpNE16
== DLcheap
218 || dlbo
->dl_CmpEQ16_CmpNE16
== DLexpensive
);
219 tl_assert(dlbo
->dl_CmpEQ8_CmpNE8
== DLcheap
220 || dlbo
->dl_CmpEQ8_CmpNE8
== DLexpensive
);
223 static UInt
DetailLevelByOp__count ( const DetailLevelByOp
* dlbo
,
227 n
+= (dlbo
->dl_Add32
== dl
? 1 : 0);
228 n
+= (dlbo
->dl_Add64
== dl
? 1 : 0);
229 n
+= (dlbo
->dl_Sub32
== dl
? 1 : 0);
230 n
+= (dlbo
->dl_Sub64
== dl
? 1 : 0);
231 n
+= (dlbo
->dl_CmpEQ64_CmpNE64
== dl
? 1 : 0);
232 n
+= (dlbo
->dl_CmpEQ32_CmpNE32
== dl
? 1 : 0);
233 n
+= (dlbo
->dl_CmpEQ16_CmpNE16
== dl
? 1 : 0);
234 n
+= (dlbo
->dl_CmpEQ8_CmpNE8
== dl
? 1 : 0);
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp is holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
254 enum { Orig
=1, VSh
=2, BSh
=3 }
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed
) == 1);
292 /* Carries around state during memcheck instrumentation. */
295 /* MODIFIED: the superblock being constructed. IRStmts are
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray
* /* of TempMapEnt */ tmpMap
;
316 /* READONLY: contains details of which ops should be expensively
318 DetailLevelByOp dlbo
;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout
* layout
;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to waste space in tables
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp
newTemp ( MCEnv
* mce
, IRType ty
, TempKind kind
)
369 IRTemp tmp
= newIRTemp(mce
->sb
->tyenv
, ty
);
371 ent
.shadowV
= IRTemp_INVALID
;
372 ent
.shadowB
= IRTemp_INVALID
;
373 newIx
= VG_(addToXA
)( mce
->tmpMap
, &ent
);
374 tl_assert(newIx
== (Word
)tmp
);
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp
findShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
386 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
387 tl_assert(ent
->kind
== Orig
);
388 if (ent
->shadowV
== IRTemp_INVALID
) {
390 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
394 tl_assert(ent
->kind
== Orig
);
395 tl_assert(ent
->shadowV
== IRTemp_INVALID
);
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
411 static void newShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
416 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
417 tl_assert(ent
->kind
== Orig
);
420 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
424 tl_assert(ent
->kind
== Orig
);
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
441 typedef IRExpr IRAtom
;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool
isOriginalAtom ( MCEnv
* mce
, IRAtom
* a1
)
447 if (a1
->tag
== Iex_Const
)
449 if (a1
->tag
== Iex_RdTmp
) {
450 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
451 return ent
->kind
== Orig
;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool
isShadowAtom ( MCEnv
* mce
, IRAtom
* a1
)
460 if (a1
->tag
== Iex_Const
)
462 if (a1
->tag
== Iex_RdTmp
) {
463 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
464 return ent
->kind
== VSh
|| ent
->kind
== BSh
;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool
sameKindedAtoms ( IRAtom
* a1
, IRAtom
* a2
)
473 if (a1
->tag
== Iex_RdTmp
&& a2
->tag
== Iex_RdTmp
)
475 if (a1
->tag
== Iex_Const
&& a2
->tag
== Iex_Const
)
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType
shadowTypeV ( IRType ty
)
498 case Ity_I128
: return ty
;
499 case Ity_F16
: return Ity_I16
;
500 case Ity_F32
: return Ity_I32
;
501 case Ity_D32
: return Ity_I32
;
502 case Ity_F64
: return Ity_I64
;
503 case Ity_D64
: return Ity_I64
;
504 case Ity_F128
: return Ity_I128
;
505 case Ity_D128
: return Ity_I128
;
506 case Ity_V128
: return Ity_V128
;
507 case Ity_V256
: return Ity_V256
;
508 default: ppIRType(ty
);
509 VG_(tool_panic
)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/UI64). */
515 static IRExpr
* definedOfType ( IRType ty
) {
517 case Ity_I1
: return IRExpr_Const(IRConst_U1(False
));
518 case Ity_I8
: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16
: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32
: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64
: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128
: return i128_const_zero();
523 case Ity_V128
: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256
: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic
)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat
, MCEnv
* mce
, IRStmt
* st
) {
537 VG_(printf
)(" %c: ", cat
);
541 addStmtToIRSB(mce
->sb
, st
);
544 /* assign value to tmp */
546 void assign ( HChar cat
, MCEnv
* mce
, IRTemp tmp
, IRExpr
* expr
) {
547 stmt(cat
, mce
, IRStmt_WrTmp(tmp
,expr
));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom
* assignNew ( HChar cat
, MCEnv
* mce
, IRType ty
, IRExpr
* e
)
575 IRType tyE
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
577 tl_assert(tyE
== ty
); /* so 'ty' is redundant (!) */
579 case 'V': k
= VSh
; break;
580 case 'B': k
= BSh
; break;
581 case 'C': k
= Orig
; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t
= newTemp(mce
, ty
, k
);
587 assign(cat
, mce
, t
, e
);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr
*i128_const_zero(void)
598 IRAtom
* z64
= IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128
, z64
, z64
);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom
* mkDifD1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
614 tl_assert(isShadowAtom(mce
,a1
));
615 tl_assert(isShadowAtom(mce
,a2
));
616 return assignNew('V', mce
, Ity_I1
, binop(Iop_And1
, a1
, a2
));
619 static IRAtom
* mkDifD8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
620 tl_assert(isShadowAtom(mce
,a1
));
621 tl_assert(isShadowAtom(mce
,a2
));
622 return assignNew('V', mce
, Ity_I8
, binop(Iop_And8
, a1
, a2
));
625 static IRAtom
* mkDifD16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
626 tl_assert(isShadowAtom(mce
,a1
));
627 tl_assert(isShadowAtom(mce
,a2
));
628 return assignNew('V', mce
, Ity_I16
, binop(Iop_And16
, a1
, a2
));
631 static IRAtom
* mkDifD32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
632 tl_assert(isShadowAtom(mce
,a1
));
633 tl_assert(isShadowAtom(mce
,a2
));
634 return assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, a1
, a2
));
637 static IRAtom
* mkDifD64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
638 tl_assert(isShadowAtom(mce
,a1
));
639 tl_assert(isShadowAtom(mce
,a2
));
640 return assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, a1
, a2
));
643 static IRAtom
* mkDifDV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
644 tl_assert(isShadowAtom(mce
,a1
));
645 tl_assert(isShadowAtom(mce
,a2
));
646 return assignNew('V', mce
, Ity_V128
, binop(Iop_AndV128
, a1
, a2
));
649 static IRAtom
* mkDifDV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
650 tl_assert(isShadowAtom(mce
,a1
));
651 tl_assert(isShadowAtom(mce
,a2
));
652 return assignNew('V', mce
, Ity_V256
, binop(Iop_AndV256
, a1
, a2
));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom
* mkUifU1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
658 tl_assert(isShadowAtom(mce
,a1
));
659 tl_assert(isShadowAtom(mce
,a2
));
660 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, a1
, a2
));
663 static IRAtom
* mkUifU8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
664 tl_assert(isShadowAtom(mce
,a1
));
665 tl_assert(isShadowAtom(mce
,a2
));
666 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, a1
, a2
));
669 static IRAtom
* mkUifU16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
670 tl_assert(isShadowAtom(mce
,a1
));
671 tl_assert(isShadowAtom(mce
,a2
));
672 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, a1
, a2
));
675 static IRAtom
* mkUifU32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
676 tl_assert(isShadowAtom(mce
,a1
));
677 tl_assert(isShadowAtom(mce
,a2
));
678 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, a2
));
681 static IRAtom
* mkUifU64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
682 tl_assert(isShadowAtom(mce
,a1
));
683 tl_assert(isShadowAtom(mce
,a2
));
684 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, a2
));
687 static IRAtom
* mkUifU128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
688 IRAtom
*tmp1
, *tmp2
, *tmp3
, *tmp4
, *tmp5
, *tmp6
;
689 tl_assert(isShadowAtom(mce
,a1
));
690 tl_assert(isShadowAtom(mce
,a2
));
691 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a1
));
692 tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a1
));
693 tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a2
));
694 tmp4
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a2
));
695 tmp5
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp1
, tmp3
));
696 tmp6
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp4
));
698 return assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp6
, tmp5
));
701 static IRAtom
* mkUifUV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
702 tl_assert(isShadowAtom(mce
,a1
));
703 tl_assert(isShadowAtom(mce
,a2
));
704 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, a1
, a2
));
707 static IRAtom
* mkUifUV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
708 tl_assert(isShadowAtom(mce
,a1
));
709 tl_assert(isShadowAtom(mce
,a2
));
710 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, a1
, a2
));
713 static IRAtom
* mkUifU ( MCEnv
* mce
, IRType vty
, IRAtom
* a1
, IRAtom
* a2
) {
715 case Ity_I8
: return mkUifU8(mce
, a1
, a2
);
716 case Ity_I16
: return mkUifU16(mce
, a1
, a2
);
717 case Ity_I32
: return mkUifU32(mce
, a1
, a2
);
718 case Ity_I64
: return mkUifU64(mce
, a1
, a2
);
719 case Ity_I128
: return mkUifU128(mce
, a1
, a2
);
720 case Ity_V128
: return mkUifUV128(mce
, a1
, a2
);
721 case Ity_V256
: return mkUifUV256(mce
, a1
, a2
);
723 VG_(printf
)("\n"); ppIRType(vty
); VG_(printf
)("\n");
724 VG_(tool_panic
)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom
* mkLeft8 ( MCEnv
* mce
, IRAtom
* a1
) {
731 tl_assert(isShadowAtom(mce
,a1
));
732 return assignNew('V', mce
, Ity_I8
, unop(Iop_Left8
, a1
));
735 static IRAtom
* mkLeft16 ( MCEnv
* mce
, IRAtom
* a1
) {
736 tl_assert(isShadowAtom(mce
,a1
));
737 return assignNew('V', mce
, Ity_I16
, unop(Iop_Left16
, a1
));
740 static IRAtom
* mkLeft32 ( MCEnv
* mce
, IRAtom
* a1
) {
741 tl_assert(isShadowAtom(mce
,a1
));
742 return assignNew('V', mce
, Ity_I32
, unop(Iop_Left32
, a1
));
745 static IRAtom
* mkLeft64 ( MCEnv
* mce
, IRAtom
* a1
) {
746 tl_assert(isShadowAtom(mce
,a1
));
747 return assignNew('V', mce
, Ity_I64
, unop(Iop_Left64
, a1
));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive then their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom
* mkRight32 ( MCEnv
* mce
, IRAtom
* a1
)
758 for (Int i
= 1; i
<= 16; i
*= 2) {
761 = assignNew('V', mce
, Ity_I32
, binop(Iop_Shr32
, a1
, mkU8(i
)));
762 a1
= assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, tmp
));
767 static IRAtom
* mkRight64 ( MCEnv
* mce
, IRAtom
* a1
)
769 for (Int i
= 1; i
<= 32; i
*= 2) {
772 = assignNew('V', mce
, Ity_I64
, binop(Iop_Shr64
, a1
, mkU8(i
)));
773 a1
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, tmp
));
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom
* mkImproveAND1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
785 tl_assert(isOriginalAtom(mce
, data
));
786 tl_assert(isShadowAtom(mce
, vbits
));
787 tl_assert(sameKindedAtoms(data
, vbits
));
788 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, data
, vbits
));
791 static IRAtom
* mkImproveAND8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
793 tl_assert(isOriginalAtom(mce
, data
));
794 tl_assert(isShadowAtom(mce
, vbits
));
795 tl_assert(sameKindedAtoms(data
, vbits
));
796 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, data
, vbits
));
799 static IRAtom
* mkImproveAND16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
801 tl_assert(isOriginalAtom(mce
, data
));
802 tl_assert(isShadowAtom(mce
, vbits
));
803 tl_assert(sameKindedAtoms(data
, vbits
));
804 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, data
, vbits
));
807 static IRAtom
* mkImproveAND32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
809 tl_assert(isOriginalAtom(mce
, data
));
810 tl_assert(isShadowAtom(mce
, vbits
));
811 tl_assert(sameKindedAtoms(data
, vbits
));
812 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, data
, vbits
));
815 static IRAtom
* mkImproveAND64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
817 tl_assert(isOriginalAtom(mce
, data
));
818 tl_assert(isShadowAtom(mce
, vbits
));
819 tl_assert(sameKindedAtoms(data
, vbits
));
820 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, data
, vbits
));
823 static IRAtom
* mkImproveANDV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
825 tl_assert(isOriginalAtom(mce
, data
));
826 tl_assert(isShadowAtom(mce
, vbits
));
827 tl_assert(sameKindedAtoms(data
, vbits
));
828 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, data
, vbits
));
831 static IRAtom
* mkImproveANDV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
833 tl_assert(isOriginalAtom(mce
, data
));
834 tl_assert(isShadowAtom(mce
, vbits
));
835 tl_assert(sameKindedAtoms(data
, vbits
));
836 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, data
, vbits
));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom
* mkImproveOR1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
844 tl_assert(isOriginalAtom(mce
, data
));
845 tl_assert(isShadowAtom(mce
, vbits
));
846 tl_assert(sameKindedAtoms(data
, vbits
));
850 assignNew('V', mce
, Ity_I1
, unop(Iop_Not1
, data
)),
854 static IRAtom
* mkImproveOR8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
856 tl_assert(isOriginalAtom(mce
, data
));
857 tl_assert(isShadowAtom(mce
, vbits
));
858 tl_assert(sameKindedAtoms(data
, vbits
));
862 assignNew('V', mce
, Ity_I8
, unop(Iop_Not8
, data
)),
866 static IRAtom
* mkImproveOR16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
868 tl_assert(isOriginalAtom(mce
, data
));
869 tl_assert(isShadowAtom(mce
, vbits
));
870 tl_assert(sameKindedAtoms(data
, vbits
));
874 assignNew('V', mce
, Ity_I16
, unop(Iop_Not16
, data
)),
878 static IRAtom
* mkImproveOR32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
880 tl_assert(isOriginalAtom(mce
, data
));
881 tl_assert(isShadowAtom(mce
, vbits
));
882 tl_assert(sameKindedAtoms(data
, vbits
));
886 assignNew('V', mce
, Ity_I32
, unop(Iop_Not32
, data
)),
890 static IRAtom
* mkImproveOR64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
892 tl_assert(isOriginalAtom(mce
, data
));
893 tl_assert(isShadowAtom(mce
, vbits
));
894 tl_assert(sameKindedAtoms(data
, vbits
));
898 assignNew('V', mce
, Ity_I64
, unop(Iop_Not64
, data
)),
902 static IRAtom
* mkImproveORV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
904 tl_assert(isOriginalAtom(mce
, data
));
905 tl_assert(isShadowAtom(mce
, vbits
));
906 tl_assert(sameKindedAtoms(data
, vbits
));
910 assignNew('V', mce
, Ity_V128
, unop(Iop_NotV128
, data
)),
914 static IRAtom
* mkImproveORV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
916 tl_assert(isOriginalAtom(mce
, data
));
917 tl_assert(isShadowAtom(mce
, vbits
));
918 tl_assert(sameKindedAtoms(data
, vbits
));
922 assignNew('V', mce
, Ity_V256
, unop(Iop_NotV256
, data
)),
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom
* mkPCastTo( MCEnv
* mce
, IRType dst_ty
, IRAtom
* vbits
)
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce
,vbits
));
939 src_ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits
);
941 /* Fast-track some common cases */
942 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I32
)
943 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
945 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I64
)
946 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
948 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I64
) {
949 /* PCast the arg, then clone it. */
950 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
951 return assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
954 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V128
) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
957 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
958 return assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
961 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V256
) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
964 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
965 tmp
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
966 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
, tmp
, tmp
));
969 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I32
) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
972 IRAtom
* tmp
= assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
973 return assignNew('V', mce
, Ity_I32
, unop(Iop_64to32
, tmp
));
976 if (src_ty
== Ity_V128
&& dst_ty
== Ity_I64
) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
981 // Generates vbits[127:64] : vbits[127:64]
983 = assignNew('V', mce
, Ity_V128
,
984 binop(Iop_InterleaveHI64x2
, vbits
, vbits
));
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
989 = mkUifUV128(mce
, hi64hi64
, vbits
);
990 // Generates UifU(vbits[127:64],vbits[63:0])
992 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, lohi64
));
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
997 = assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, lo64
));
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1009 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ8
, vbits
));
1012 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ16
, vbits
));
1015 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ32
, vbits
));
1018 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ64
, vbits
));
1021 /* Gah. Chop it in half, OR the halves together, and compare
1023 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vbits
));
1024 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vbits
));
1025 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1026 tmp1
= assignNew('V', mce
, Ity_I1
,
1027 unop(Iop_CmpNEZ64
, tmp4
));
1031 /* Chop it in half, OR the halves together, and compare that
1034 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, vbits
));
1035 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vbits
));
1036 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1037 tmp1
= assignNew('V', mce
, Ity_I1
,
1038 unop(Iop_CmpNEZ64
, tmp4
));
1043 VG_(tool_panic
)("mkPCastTo(1)");
1046 /* Now widen up to the dst type. */
1051 return assignNew('V', mce
, Ity_I8
, unop(Iop_1Sto8
, tmp1
));
1053 return assignNew('V', mce
, Ity_I16
, unop(Iop_1Sto16
, tmp1
));
1055 return assignNew('V', mce
, Ity_I32
, unop(Iop_1Sto32
, tmp1
));
1057 return assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1059 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1060 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp1
, tmp1
));
1063 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1064 tmp1
= assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp1
, tmp1
));
1067 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1068 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
,
1070 tmp1
= assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
1075 VG_(tool_panic
)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom
* mkPCastXXtoXXlsb ( MCEnv
* mce
, IRAtom
* varg
, IRType ty
)
1085 if (ty
== Ity_V128
) {
1086 /* --- Case for V128 --- */
1087 IRAtom
* varg128
= varg
;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom
* pcdTo64
= mkPCastTo(mce
, Ity_I64
, varg128
);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1093 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcdTo64
, mkU64(1)));
1094 // generates: Def--(64)--Def
1096 = definedOfType(Ity_I64
);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1099 = assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, d64
, d63pc
));
1102 if (ty
== Ity_I64
) {
1103 /* --- Case for I64 --- */
1105 IRAtom
* pcd
= mkPCastTo(mce
, Ity_I64
, varg
);
1106 // Zero (Def) out the top 63 bits
1108 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcd
, mkU64(1)));
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom
* mkOCastAt( MCEnv
* mce
, IRType ty
, IRAtom
* vbits
)
1126 IROp opSUB
, opSHR
, opSAR
;
1131 opSUB
= Iop_Sub64
; opSHR
= Iop_Shr64
; opSAR
= Iop_Sar64
; sh
= 63;
1134 opSUB
= Iop_Sub32
; opSHR
= Iop_Shr32
; opSAR
= Iop_Sar32
; sh
= 31;
1137 opSUB
= Iop_Sub16
; opSHR
= Iop_Shr16
; opSAR
= Iop_Sar16
; sh
= 15;
1140 opSUB
= Iop_Sub8
; opSHR
= Iop_Shr8
; opSAR
= Iop_Sar8
; sh
= 7;
1144 VG_(tool_panic
)("mkOCastTo");
1148 shr1
= assignNew('V', mce
,ty
, binop(opSHR
, vbits
, mkU8(1)));
1149 at
= assignNew('V', mce
,ty
, binop(opSUB
, vbits
, shr1
));
1150 at
= assignNew('V', mce
,ty
, binop(opSAR
, at
, mkU8(sh
)));
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast, the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz)(xx, yy))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1219 static IRAtom
* expensiveCmpEQorNE ( MCEnv
* mce
,
1221 IRAtom
* vxx
, IRAtom
* vyy
,
1222 IRAtom
* xx
, IRAtom
* yy
)
1224 IRAtom
*naive
, *vec
, *improved
, *final_cast
;
1225 IROp opDIFD
, opUIFU
, opOR
, opXOR
, opNOT
;
1227 tl_assert(isShadowAtom(mce
,vxx
));
1228 tl_assert(isShadowAtom(mce
,vyy
));
1229 tl_assert(isOriginalAtom(mce
,xx
));
1230 tl_assert(isOriginalAtom(mce
,yy
));
1231 tl_assert(sameKindedAtoms(vxx
,xx
));
1232 tl_assert(sameKindedAtoms(vyy
,yy
));
1264 VG_(tool_panic
)("expensiveCmpEQorNE");
1268 = assignNew('V', mce
, ty
, binop(opUIFU
, vxx
, vyy
));
1278 assignNew('V', mce
,ty
, binop(opXOR
, xx
, yy
))))));
1281 = assignNew( 'V', mce
,ty
,
1282 binop(opDIFD
, naive
, mkOCastAt(mce
, ty
, vec
)));
1285 = mkPCastTo( mce
, Ity_I1
, improved
);
1291 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1293 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1295 CmpORD32S(x,y) = 1<<3 if x <s y
1299 and similarly the unsigned variant. The default interpretation is:
1301 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1304 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1305 are zero and therefore defined (viz, zero).
1307 Also deal with a special case better:
1311 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1312 will be defined even if the rest of x isn't. In which case we do:
1314 CmpORD32S#(x,x#,0,{impliedly 0}#)
1315 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1316 | (x# >>u 31) << 3 -- LT# = x#[31]
1318 Analogous handling for CmpORD64{S,U}.
1320 static Bool
isZeroU32 ( IRAtom
* e
)
1323 toBool( e
->tag
== Iex_Const
1324 && e
->Iex
.Const
.con
->tag
== Ico_U32
1325 && e
->Iex
.Const
.con
->Ico
.U32
== 0 );
1328 static Bool
isZeroU64 ( IRAtom
* e
)
1331 toBool( e
->tag
== Iex_Const
1332 && e
->Iex
.Const
.con
->tag
== Ico_U64
1333 && e
->Iex
.Const
.con
->Ico
.U64
== 0 );
1336 static IRAtom
* doCmpORD ( MCEnv
* mce
,
1338 IRAtom
* xxhash
, IRAtom
* yyhash
,
1339 IRAtom
* xx
, IRAtom
* yy
)
1341 Bool m64
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
;
1342 Bool syned
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD32S
;
1343 IROp opOR
= m64
? Iop_Or64
: Iop_Or32
;
1344 IROp opAND
= m64
? Iop_And64
: Iop_And32
;
1345 IROp opSHL
= m64
? Iop_Shl64
: Iop_Shl32
;
1346 IROp opSHR
= m64
? Iop_Shr64
: Iop_Shr32
;
1347 IROp op1UtoWS
= m64
? Iop_1Uto64
: Iop_1Uto32
;
1348 IRType ty
= m64
? Ity_I64
: Ity_I32
;
1349 Int width
= m64
? 64 : 32;
1351 Bool (*isZero
)(IRAtom
*) = m64
? isZeroU64
: isZeroU32
;
1353 tl_assert(isShadowAtom(mce
,xxhash
));
1354 tl_assert(isShadowAtom(mce
,yyhash
));
1355 tl_assert(isOriginalAtom(mce
,xx
));
1356 tl_assert(isOriginalAtom(mce
,yy
));
1357 tl_assert(sameKindedAtoms(xxhash
,xx
));
1358 tl_assert(sameKindedAtoms(yyhash
,yy
));
1359 tl_assert(cmp_op
== Iop_CmpORD32S
|| cmp_op
== Iop_CmpORD32U
1360 || cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
);
1363 ppIROp(cmp_op
); VG_(printf
)(" ");
1364 ppIRExpr(xx
); VG_(printf
)(" "); ppIRExpr( yy
); VG_(printf
)("\n");
1367 if (syned
&& isZero(yy
)) {
1368 /* fancy interpretation */
1369 /* if yy is zero, then it must be fully defined (zero#). */
1370 tl_assert(isZero(yyhash
));
1371 // This is still inaccurate, but I don't think it matters, since
1372 // nobody writes code of the form
1373 // "is <partially-undefined-value> signedly greater than zero?".
1374 // We therefore simply declare "x >s 0" to be undefined if any bit in
1375 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1376 // the highest order bit is a defined 1 then x is negative so it
1377 // doesn't matter whether the remaining bits are defined or not.
1383 mkPCastTo(mce
,ty
, xxhash
),
1384 m64
? mkU64(1<<2) : mkU32(1<<2)
1386 // For "x <s 0", we can just copy the definedness of the top bit of x
1387 // and we have a precise result.
1395 binop(opSHR
, xxhash
, mkU8(width
-1))),
1398 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1404 assignNew('V', mce
,ty
,
1407 expensiveCmpEQorNE(mce
, ty
, xxhash
, yyhash
, xx
, yy
))
1414 assignNew('V', mce
,ty
, binop(opOR
, t_lt_0_0_0
, t_0_gt_0_0
)),
1418 /* standard interpretation */
1419 IRAtom
* sevenLeft1
= m64
? mkU64(7<<1) : mkU32(7<<1);
1424 mkUifU(mce
,ty
, xxhash
,yyhash
)),
1431 /*------------------------------------------------------------*/
1432 /*--- Emit a test and complaint if something is undefined. ---*/
1433 /*------------------------------------------------------------*/
1435 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
); /* fwds */
1438 /* Set the annotations on a dirty helper to indicate that the stack
1439 pointer and instruction pointers might be read. This is the
1440 behaviour of all 'emit-a-complaint' style functions we might
1443 static void setHelperAnns ( MCEnv
* mce
, IRDirty
* di
) {
1445 di
->fxState
[0].fx
= Ifx_Read
;
1446 di
->fxState
[0].offset
= mce
->layout
->offset_SP
;
1447 di
->fxState
[0].size
= mce
->layout
->sizeof_SP
;
1448 di
->fxState
[0].nRepeats
= 0;
1449 di
->fxState
[0].repeatLen
= 0;
1450 di
->fxState
[1].fx
= Ifx_Read
;
1451 di
->fxState
[1].offset
= mce
->layout
->offset_IP
;
1452 di
->fxState
[1].size
= mce
->layout
->sizeof_IP
;
1453 di
->fxState
[1].nRepeats
= 0;
1454 di
->fxState
[1].repeatLen
= 0;
1458 /* Check the supplied *original* |atom| for undefinedness, and emit a
1459 complaint if so. Once that happens, mark it as defined. This is
1460 possible because the atom is either a tmp or literal. If it's a
1461 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1462 be defined. In fact as mentioned above, we will have to allocate a
1463 new tmp to carry the new 'defined' shadow value, and update the
1464 original->tmp mapping accordingly; we cannot simply assign a new
1465 value to an existing shadow tmp as this breaks SSAness.
1467 The checks are performed, any resulting complaint emitted, and
1468 |atom|'s shadow temp set to 'defined', ONLY in the case that
1469 |guard| evaluates to True at run-time. If it evaluates to False
1470 then no action is performed. If |guard| is NULL (the usual case)
1471 then it is assumed to be always-true, and hence these actions are
1472 performed unconditionally.
1474 This routine does not generate code to check the definedness of
1475 |guard|. The caller is assumed to have taken care of that already.
1477 static void complainIfUndefined ( MCEnv
* mce
, IRAtom
* atom
, IRExpr
*guard
)
1490 // Don't do V bit tests if we're not reporting undefined value errors.
1491 if (MC_(clo_mc_level
) == 1)
1495 tl_assert(isOriginalAtom(mce
, guard
));
1497 /* Since the original expression is atomic, there's no duplicated
1498 work generated by making multiple V-expressions for it. So we
1499 don't really care about the possibility that someone else may
1500 also create a V-interpretion for it. */
1501 tl_assert(isOriginalAtom(mce
, atom
));
1502 vatom
= expr2vbits( mce
, atom
, HuOth
);
1503 tl_assert(isShadowAtom(mce
, vatom
));
1504 tl_assert(sameKindedAtoms(atom
, vatom
));
1506 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
1508 /* sz is only used for constructing the error message */
1509 sz
= ty
==Ity_I1
? 0 : sizeofIRType(ty
);
1511 cond
= mkPCastTo( mce
, Ity_I1
, vatom
);
1512 /* cond will be 0 if all defined, and 1 if any not defined. */
1514 /* Get the origin info for the value we are about to check. At
1515 least, if we are doing origin tracking. If not, use a dummy
1517 if (MC_(clo_mc_level
) == 3) {
1518 origin
= schemeE( mce
, atom
);
1519 if (mce
->hWordTy
== Ity_I64
) {
1520 origin
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, origin
) );
1534 fn
= &MC_(helperc_value_check0_fail_w_o
);
1535 nm
= "MC_(helperc_value_check0_fail_w_o)";
1536 args
= mkIRExprVec_1(origin
);
1539 fn
= &MC_(helperc_value_check0_fail_no_o
);
1540 nm
= "MC_(helperc_value_check0_fail_no_o)";
1541 args
= mkIRExprVec_0();
1547 fn
= &MC_(helperc_value_check1_fail_w_o
);
1548 nm
= "MC_(helperc_value_check1_fail_w_o)";
1549 args
= mkIRExprVec_1(origin
);
1552 fn
= &MC_(helperc_value_check1_fail_no_o
);
1553 nm
= "MC_(helperc_value_check1_fail_no_o)";
1554 args
= mkIRExprVec_0();
1560 fn
= &MC_(helperc_value_check4_fail_w_o
);
1561 nm
= "MC_(helperc_value_check4_fail_w_o)";
1562 args
= mkIRExprVec_1(origin
);
1565 fn
= &MC_(helperc_value_check4_fail_no_o
);
1566 nm
= "MC_(helperc_value_check4_fail_no_o)";
1567 args
= mkIRExprVec_0();
1573 fn
= &MC_(helperc_value_check8_fail_w_o
);
1574 nm
= "MC_(helperc_value_check8_fail_w_o)";
1575 args
= mkIRExprVec_1(origin
);
1578 fn
= &MC_(helperc_value_check8_fail_no_o
);
1579 nm
= "MC_(helperc_value_check8_fail_no_o)";
1580 args
= mkIRExprVec_0();
1587 fn
= &MC_(helperc_value_checkN_fail_w_o
);
1588 nm
= "MC_(helperc_value_checkN_fail_w_o)";
1589 args
= mkIRExprVec_2( mkIRExpr_HWord( sz
), origin
);
1592 fn
= &MC_(helperc_value_checkN_fail_no_o
);
1593 nm
= "MC_(helperc_value_checkN_fail_no_o)";
1594 args
= mkIRExprVec_1( mkIRExpr_HWord( sz
) );
1599 VG_(tool_panic
)("unexpected szB");
1605 tl_assert(nargs
>= 0 && nargs
<= 2);
1606 tl_assert( (MC_(clo_mc_level
) == 3 && origin
!= NULL
)
1607 || (MC_(clo_mc_level
) == 2 && origin
== NULL
) );
1609 di
= unsafeIRDirty_0_N( nargs
/*regparms*/, nm
,
1610 VG_(fnptr_to_fnentry
)( fn
), args
);
1611 di
->guard
= cond
; // and cond is PCast-to-1(atom#)
1613 /* If the complaint is to be issued under a guard condition, AND
1614 that into the guard condition for the helper call. */
1616 IRAtom
*g1
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, di
->guard
));
1617 IRAtom
*g2
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, guard
));
1618 IRAtom
*e
= assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, g1
, g2
));
1619 di
->guard
= assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, e
));
1622 setHelperAnns( mce
, di
);
1623 stmt( 'V', mce
, IRStmt_Dirty(di
));
1625 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1626 defined -- but only in the case where the guard evaluates to
1627 True at run-time. Do the update by setting the orig->shadow
1628 mapping for tmp to reflect the fact that this shadow is getting
1630 tl_assert(isIRAtom(vatom
));
1631 /* sameKindedAtoms ... */
1632 if (vatom
->tag
== Iex_RdTmp
) {
1633 tl_assert(atom
->tag
== Iex_RdTmp
);
1634 if (guard
== NULL
) {
1635 // guard is 'always True', hence update unconditionally
1636 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1637 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
),
1640 // update the temp only conditionally. Do this by copying
1641 // its old value when the guard is False.
1643 IRTemp old_tmpV
= findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1644 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1646 = assignNew('V', mce
, shadowTypeV(ty
),
1647 IRExpr_ITE(guard
, definedOfType(ty
),
1649 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
), new_tmpV
);
1655 /*------------------------------------------------------------*/
1656 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1657 /*------------------------------------------------------------*/
1659 /* Examine the always-defined sections declared in layout to see if
1660 the (offset,size) section is within one. Note, is is an error to
1661 partially fall into such a region: (offset,size) should either be
1662 completely in such a region or completely not-in such a region.
1664 static Bool
isAlwaysDefd ( MCEnv
* mce
, Int offset
, Int size
)
1666 Int minoffD
, maxoffD
, i
;
1667 Int minoff
= offset
;
1668 Int maxoff
= minoff
+ size
- 1;
1669 tl_assert((minoff
& ~0xFFFF) == 0);
1670 tl_assert((maxoff
& ~0xFFFF) == 0);
1672 for (i
= 0; i
< mce
->layout
->n_alwaysDefd
; i
++) {
1673 minoffD
= mce
->layout
->alwaysDefd
[i
].offset
;
1674 maxoffD
= minoffD
+ mce
->layout
->alwaysDefd
[i
].size
- 1;
1675 tl_assert((minoffD
& ~0xFFFF) == 0);
1676 tl_assert((maxoffD
& ~0xFFFF) == 0);
1678 if (maxoff
< minoffD
|| maxoffD
< minoff
)
1679 continue; /* no overlap */
1680 if (minoff
>= minoffD
&& maxoff
<= maxoffD
)
1681 return True
; /* completely contained in an always-defd section */
1683 VG_(tool_panic
)("memcheck:isAlwaysDefd:partial overlap");
1685 return False
; /* could not find any containing section */
1689 /* Generate into bb suitable actions to shadow this Put. If the state
1690 slice is marked 'always defined', do nothing. Otherwise, write the
1691 supplied V bits to the shadow state. We can pass in either an
1692 original atom or a V-atom, but not both. In the former case the
1693 relevant V-bits are then generated from the original.
1694 We assume here, that the definedness of GUARD has already been checked.
1697 void do_shadow_PUT ( MCEnv
* mce
, Int offset
,
1698 IRAtom
* atom
, IRAtom
* vatom
, IRExpr
*guard
)
1702 // Don't do shadow PUTs if we're not doing undefined value checking.
1703 // Their absence lets Vex's optimiser remove all the shadow computation
1704 // that they depend on, which includes GETs of the shadow registers.
1705 if (MC_(clo_mc_level
) == 1)
1710 tl_assert(isOriginalAtom(mce
, atom
));
1711 vatom
= expr2vbits( mce
, atom
, HuOth
);
1714 tl_assert(isShadowAtom(mce
, vatom
));
1717 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
1718 tl_assert(ty
!= Ity_I1
);
1719 if (isAlwaysDefd(mce
, offset
, sizeofIRType(ty
))) {
1721 /* emit code to emit a complaint if any of the vbits are 1. */
1722 /* complainIfUndefined(mce, atom); */
1724 /* Do a plain shadow Put. */
1726 /* If the guard expression evaluates to false we simply Put the value
1727 that is already stored in the guest state slot */
1728 IRAtom
*cond
, *iffalse
;
1730 cond
= assignNew('V', mce
, Ity_I1
, guard
);
1731 iffalse
= assignNew('V', mce
, ty
,
1732 IRExpr_Get(offset
+ mce
->layout
->total_sizeB
, ty
));
1733 vatom
= assignNew('V', mce
, ty
, IRExpr_ITE(cond
, vatom
, iffalse
));
1735 stmt( 'V', mce
, IRStmt_Put( offset
+ mce
->layout
->total_sizeB
, vatom
));
1740 /* Return an expression which contains the V bits corresponding to the
1741 given GETI (passed in in pieces).
1744 void do_shadow_PUTI ( MCEnv
* mce
, IRPutI
*puti
)
1749 IRRegArray
* descr
= puti
->descr
;
1750 IRAtom
* ix
= puti
->ix
;
1751 Int bias
= puti
->bias
;
1752 IRAtom
* atom
= puti
->data
;
1754 // Don't do shadow PUTIs if we're not doing undefined value checking.
1755 // Their absence lets Vex's optimiser remove all the shadow computation
1756 // that they depend on, which includes GETIs of the shadow registers.
1757 if (MC_(clo_mc_level
) == 1)
1760 tl_assert(isOriginalAtom(mce
,atom
));
1761 vatom
= expr2vbits( mce
, atom
, HuOth
);
1762 tl_assert(sameKindedAtoms(atom
, vatom
));
1764 tyS
= shadowTypeV(ty
);
1765 arrSize
= descr
->nElems
* sizeofIRType(ty
);
1766 tl_assert(ty
!= Ity_I1
);
1767 tl_assert(isOriginalAtom(mce
,ix
));
1768 complainIfUndefined(mce
, ix
, NULL
);
1769 if (isAlwaysDefd(mce
, descr
->base
, arrSize
)) {
1771 /* emit code to emit a complaint if any of the vbits are 1. */
1772 /* complainIfUndefined(mce, atom); */
1774 /* Do a cloned version of the Put that refers to the shadow
1776 IRRegArray
* new_descr
1777 = mkIRRegArray( descr
->base
+ mce
->layout
->total_sizeB
,
1778 tyS
, descr
->nElems
);
1779 stmt( 'V', mce
, IRStmt_PutI( mkIRPutI(new_descr
, ix
, bias
, vatom
) ));
1784 /* Return an expression which contains the V bits corresponding to the
1785 given GET (passed in in pieces).
1788 IRExpr
* shadow_GET ( MCEnv
* mce
, Int offset
, IRType ty
)
1790 IRType tyS
= shadowTypeV(ty
);
1791 tl_assert(ty
!= Ity_I1
);
1792 tl_assert(ty
!= Ity_I128
);
1793 if (isAlwaysDefd(mce
, offset
, sizeofIRType(ty
))) {
1794 /* Always defined, return all zeroes of the relevant type */
1795 return definedOfType(tyS
);
1797 /* return a cloned version of the Get that refers to the shadow
1799 /* FIXME: this isn't an atom! */
1800 return IRExpr_Get( offset
+ mce
->layout
->total_sizeB
, tyS
);
1805 /* Return an expression which contains the V bits corresponding to the
1806 given GETI (passed in in pieces).
1809 IRExpr
* shadow_GETI ( MCEnv
* mce
,
1810 IRRegArray
* descr
, IRAtom
* ix
, Int bias
)
1812 IRType ty
= descr
->elemTy
;
1813 IRType tyS
= shadowTypeV(ty
);
1814 Int arrSize
= descr
->nElems
* sizeofIRType(ty
);
1815 tl_assert(ty
!= Ity_I1
);
1816 tl_assert(isOriginalAtom(mce
,ix
));
1817 complainIfUndefined(mce
, ix
, NULL
);
1818 if (isAlwaysDefd(mce
, descr
->base
, arrSize
)) {
1819 /* Always defined, return all zeroes of the relevant type */
1820 return definedOfType(tyS
);
1822 /* return a cloned version of the Get that refers to the shadow
1824 IRRegArray
* new_descr
1825 = mkIRRegArray( descr
->base
+ mce
->layout
->total_sizeB
,
1826 tyS
, descr
->nElems
);
1827 return IRExpr_GetI( new_descr
, ix
, bias
);
1832 /*------------------------------------------------------------*/
1833 /*--- Generating approximations for unknown operations, ---*/
1834 /*--- using lazy-propagate semantics ---*/
1835 /*------------------------------------------------------------*/
1837 /* Lazy propagation of undefinedness from two values, resulting in the
1838 specified shadow type.
1841 IRAtom
* mkLazy2 ( MCEnv
* mce
, IRType finalVty
, IRAtom
* va1
, IRAtom
* va2
)
1844 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
1845 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
1846 tl_assert(isShadowAtom(mce
,va1
));
1847 tl_assert(isShadowAtom(mce
,va2
));
1849 /* The general case is inefficient because PCast is an expensive
1850 operation. Here are some special cases which use PCast only
1851 once rather than twice. */
1853 /* I64 x I64 -> I64 */
1854 if (t1
== Ity_I64
&& t2
== Ity_I64
&& finalVty
== Ity_I64
) {
1855 if (0) VG_(printf
)("mkLazy2: I64 x I64 -> I64\n");
1856 at
= mkUifU(mce
, Ity_I64
, va1
, va2
);
1857 at
= mkPCastTo(mce
, Ity_I64
, at
);
1861 /* I64 x I64 -> I32 */
1862 if (t1
== Ity_I64
&& t2
== Ity_I64
&& finalVty
== Ity_I32
) {
1863 if (0) VG_(printf
)("mkLazy2: I64 x I64 -> I32\n");
1864 at
= mkUifU(mce
, Ity_I64
, va1
, va2
);
1865 at
= mkPCastTo(mce
, Ity_I32
, at
);
1869 /* I32 x I32 -> I32 */
1870 if (t1
== Ity_I32
&& t2
== Ity_I32
&& finalVty
== Ity_I32
) {
1871 if (0) VG_(printf
)("mkLazy2: I32 x I32 -> I32\n");
1872 at
= mkUifU(mce
, Ity_I32
, va1
, va2
);
1873 at
= mkPCastTo(mce
, Ity_I32
, at
);
1878 VG_(printf
)("mkLazy2 ");
1887 /* General case: force everything via 32-bit intermediaries. */
1888 at
= mkPCastTo(mce
, Ity_I32
, va1
);
1889 at
= mkUifU(mce
, Ity_I32
, at
, mkPCastTo(mce
, Ity_I32
, va2
));
1890 at
= mkPCastTo(mce
, finalVty
, at
);
1895 /* 3-arg version of the above. */
1897 IRAtom
* mkLazy3 ( MCEnv
* mce
, IRType finalVty
,
1898 IRAtom
* va1
, IRAtom
* va2
, IRAtom
* va3
)
1901 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
1902 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
1903 IRType t3
= typeOfIRExpr(mce
->sb
->tyenv
, va3
);
1904 tl_assert(isShadowAtom(mce
,va1
));
1905 tl_assert(isShadowAtom(mce
,va2
));
1906 tl_assert(isShadowAtom(mce
,va3
));
1908 /* The general case is inefficient because PCast is an expensive
1909 operation. Here are some special cases which use PCast only
1910 twice rather than three times. */
1912 /* I32 x I64 x I64 -> I64 */
1913 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1914 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
1915 && finalVty
== Ity_I64
) {
1916 if (0) VG_(printf
)("mkLazy3: I32 x I64 x I64 -> I64\n");
1917 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1918 mode indication which is fully defined, this should get
1919 folded out later. */
1920 at
= mkPCastTo(mce
, Ity_I64
, va1
);
1921 /* Now fold in 2nd and 3rd args. */
1922 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
1923 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1924 /* and PCast once again. */
1925 at
= mkPCastTo(mce
, Ity_I64
, at
);
1929 /* I32 x I8 x I64 -> I64 */
1930 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I64
1931 && finalVty
== Ity_I64
) {
1932 if (0) VG_(printf
)("mkLazy3: I32 x I8 x I64 -> I64\n");
1933 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
1934 * rounding mode indication which is fully defined, this should
1935 * get folded out later.
1937 IRAtom
* at1
= mkPCastTo(mce
, Ity_I64
, va1
);
1938 IRAtom
* at2
= mkPCastTo(mce
, Ity_I64
, va2
);
1939 at
= mkUifU(mce
, Ity_I64
, at1
, at2
); // UifU(PCast(va1), PCast(va2))
1940 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1941 /* and PCast once again. */
1942 at
= mkPCastTo(mce
, Ity_I64
, at
);
1946 /* I32 x I64 x I64 -> I32 */
1947 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
1948 && finalVty
== Ity_I32
) {
1949 if (0) VG_(printf
)("mkLazy3: I32 x I64 x I64 -> I32\n");
1950 at
= mkPCastTo(mce
, Ity_I64
, va1
);
1951 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
1952 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1953 at
= mkPCastTo(mce
, Ity_I32
, at
);
1957 /* I32 x I32 x I32 -> I32 */
1958 /* 32-bit FP idiom, as (eg) happens on ARM */
1959 if (t1
== Ity_I32
&& t2
== Ity_I32
&& t3
== Ity_I32
1960 && finalVty
== Ity_I32
) {
1961 if (0) VG_(printf
)("mkLazy3: I32 x I32 x I32 -> I32\n");
1963 at
= mkUifU(mce
, Ity_I32
, at
, va2
);
1964 at
= mkUifU(mce
, Ity_I32
, at
, va3
);
1965 at
= mkPCastTo(mce
, Ity_I32
, at
);
1969 /* I32 x I128 x I128 -> I128 */
1970 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1971 if (t1
== Ity_I32
&& t2
== Ity_I128
&& t3
== Ity_I128
1972 && finalVty
== Ity_I128
) {
1973 if (0) VG_(printf
)("mkLazy3: I32 x I128 x I128 -> I128\n");
1974 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1975 mode indication which is fully defined, this should get
1976 folded out later. */
1977 at
= mkPCastTo(mce
, Ity_I128
, va1
);
1978 /* Now fold in 2nd and 3rd args. */
1979 at
= mkUifU(mce
, Ity_I128
, at
, va2
);
1980 at
= mkUifU(mce
, Ity_I128
, at
, va3
);
1981 /* and PCast once again. */
1982 at
= mkPCastTo(mce
, Ity_I128
, at
);
1986 /* I32 x I8 x I128 -> I128 */
1987 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1988 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I128
1989 && finalVty
== Ity_I128
) {
1990 if (0) VG_(printf
)("mkLazy3: I32 x I8 x I128 -> I128\n");
1991 /* Use I64 as an intermediate type, which means PCasting all 3
1992 args to I64 to start with. 1st arg is typically a rounding
1993 mode indication which is fully defined, so we hope that it
1994 will get folded out later. */
1995 IRAtom
* at1
= mkPCastTo(mce
, Ity_I64
, va1
);
1996 IRAtom
* at2
= mkPCastTo(mce
, Ity_I64
, va2
);
1997 IRAtom
* at3
= mkPCastTo(mce
, Ity_I64
, va3
);
1998 /* Now UifU all three together. */
1999 at
= mkUifU(mce
, Ity_I64
, at1
, at2
); // UifU(PCast(va1), PCast(va2))
2000 at
= mkUifU(mce
, Ity_I64
, at
, at3
); // ... `UifU` PCast(va3)
2001 /* and PCast once again. */
2002 at
= mkPCastTo(mce
, Ity_I128
, at
);
2006 VG_(printf
)("mkLazy3: ");
2012 VG_(printf
)(" -> ");
2018 /* General case: force everything via 32-bit intermediaries. */
2020 at = mkPCastTo(mce, Ity_I32, va1);
2021 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2022 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2023 at = mkPCastTo(mce, finalVty, at);
2029 /* 4-arg version of the above. */
2031 IRAtom
* mkLazy4 ( MCEnv
* mce
, IRType finalVty
,
2032 IRAtom
* va1
, IRAtom
* va2
, IRAtom
* va3
, IRAtom
* va4
)
2035 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
2036 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
2037 IRType t3
= typeOfIRExpr(mce
->sb
->tyenv
, va3
);
2038 IRType t4
= typeOfIRExpr(mce
->sb
->tyenv
, va4
);
2039 tl_assert(isShadowAtom(mce
,va1
));
2040 tl_assert(isShadowAtom(mce
,va2
));
2041 tl_assert(isShadowAtom(mce
,va3
));
2042 tl_assert(isShadowAtom(mce
,va4
));
2044 /* The general case is inefficient because PCast is an expensive
2045 operation. Here are some special cases which use PCast only
2046 twice rather than three times. */
2048 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2050 if (t1
== Ity_I32
&& t2
== Ity_I128
&& t3
== Ity_I128
&& t4
== Ity_I128
2051 && finalVty
== Ity_I128
) {
2052 if (0) VG_(printf
)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2053 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2054 mode indication which is fully defined, this should get
2055 folded out later. */
2056 at
= mkPCastTo(mce
, Ity_I128
, va1
);
2057 /* Now fold in 2nd, 3rd, 4th args. */
2058 at
= mkUifU(mce
, Ity_I128
, at
, va2
);
2059 at
= mkUifU(mce
, Ity_I128
, at
, va3
);
2060 at
= mkUifU(mce
, Ity_I128
, at
, va4
);
2061 /* and PCast once again. */
2062 at
= mkPCastTo(mce
, Ity_I128
, at
);
2066 /* I32 x I64 x I64 x I64 -> I64 */
2067 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
&& t4
== Ity_I64
2068 && finalVty
== Ity_I64
) {
2069 if (0) VG_(printf
)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2070 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2071 mode indication which is fully defined, this should get
2072 folded out later. */
2073 at
= mkPCastTo(mce
, Ity_I64
, va1
);
2074 /* Now fold in 2nd, 3rd, 4th args. */
2075 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
2076 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
2077 at
= mkUifU(mce
, Ity_I64
, at
, va4
);
2078 /* and PCast once again. */
2079 at
= mkPCastTo(mce
, Ity_I64
, at
);
2082 /* I32 x I32 x I32 x I32 -> I32 */
2083 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2084 if (t1
== Ity_I32
&& t2
== Ity_I32
&& t3
== Ity_I32
&& t4
== Ity_I32
2085 && finalVty
== Ity_I32
) {
2086 if (0) VG_(printf
)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2088 /* Now fold in 2nd, 3rd, 4th args. */
2089 at
= mkUifU(mce
, Ity_I32
, at
, va2
);
2090 at
= mkUifU(mce
, Ity_I32
, at
, va3
);
2091 at
= mkUifU(mce
, Ity_I32
, at
, va4
);
2092 at
= mkPCastTo(mce
, Ity_I32
, at
);
2096 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I8
&& t4
== Ity_I8
2097 && finalVty
== Ity_I32
) {
2098 if (0) VG_(printf
)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2099 at
= mkPCastTo(mce
, Ity_I8
, va1
);
2100 /* Now fold in 2nd, 3rd, 4th args. */
2101 at
= mkUifU(mce
, Ity_I8
, at
, va2
);
2102 at
= mkUifU(mce
, Ity_I8
, at
, va3
);
2103 at
= mkUifU(mce
, Ity_I8
, at
, va4
);
2104 at
= mkPCastTo(mce
, Ity_I32
, at
);
2108 if (t1
== Ity_I64
&& t2
== Ity_I8
&& t3
== Ity_I8
&& t4
== Ity_I8
2109 && finalVty
== Ity_I64
) {
2110 if (0) VG_(printf
)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2111 at
= mkPCastTo(mce
, Ity_I8
, va1
);
2112 /* Now fold in 2nd, 3rd, 4th args. */
2113 at
= mkUifU(mce
, Ity_I8
, at
, va2
);
2114 at
= mkUifU(mce
, Ity_I8
, at
, va3
);
2115 at
= mkUifU(mce
, Ity_I8
, at
, va4
);
2116 at
= mkPCastTo(mce
, Ity_I64
, at
);
2121 VG_(printf
)("mkLazy4: ");
2129 VG_(printf
)(" -> ");
2138 /* Do the lazy propagation game from a null-terminated vector of
2139 atoms. This is presumably the arguments to a helper call, so the
2140 IRCallee info is also supplied in order that we can know which
2141 arguments should be ignored (via the .mcx_mask field).
2144 IRAtom
* mkLazyN ( MCEnv
* mce
,
2145 IRAtom
** exprvec
, IRType finalVtype
, IRCallee
* cee
)
2151 Bool mergeTy64
= True
;
2153 /* Decide on the type of the merge intermediary. If all relevant
2154 args are I64, then it's I64. In all other circumstances, use
2156 for (i
= 0; exprvec
[i
]; i
++) {
2158 tl_assert(isOriginalAtom(mce
, exprvec
[i
]));
2159 if (cee
->mcx_mask
& (1<<i
))
2161 if (typeOfIRExpr(mce
->sb
->tyenv
, exprvec
[i
]) != Ity_I64
)
2165 mergeTy
= mergeTy64
? Ity_I64
: Ity_I32
;
2166 curr
= definedOfType(mergeTy
);
2168 for (i
= 0; exprvec
[i
]; i
++) {
2170 tl_assert(isOriginalAtom(mce
, exprvec
[i
]));
2171 /* Only take notice of this arg if the callee's mc-exclusion
2172 mask does not say it is to be excluded. */
2173 if (cee
->mcx_mask
& (1<<i
)) {
2174 /* the arg is to be excluded from definedness checking. Do
2176 if (0) VG_(printf
)("excluding %s(%d)\n", cee
->name
, i
);
2178 /* calculate the arg's definedness, and pessimistically merge
2180 here
= mkPCastTo( mce
, mergeTy
, expr2vbits(mce
, exprvec
[i
], HuOth
) );
2182 ? mkUifU64(mce
, here
, curr
)
2183 : mkUifU32(mce
, here
, curr
);
2186 return mkPCastTo(mce
, finalVtype
, curr
);
2190 /*------------------------------------------------------------*/
2191 /*--- Generating expensive sequences for exact carry-chain ---*/
2192 /*--- propagation in add/sub and related operations. ---*/
2193 /*------------------------------------------------------------*/
2196 IRAtom
* expensiveAddSub ( MCEnv
* mce
,
2199 IRAtom
* qaa
, IRAtom
* qbb
,
2200 IRAtom
* aa
, IRAtom
* bb
)
2202 IRAtom
*a_min
, *b_min
, *a_max
, *b_max
;
2203 IROp opAND
, opOR
, opXOR
, opNOT
, opADD
, opSUB
;
2205 tl_assert(isShadowAtom(mce
,qaa
));
2206 tl_assert(isShadowAtom(mce
,qbb
));
2207 tl_assert(isOriginalAtom(mce
,aa
));
2208 tl_assert(isOriginalAtom(mce
,bb
));
2209 tl_assert(sameKindedAtoms(qaa
,aa
));
2210 tl_assert(sameKindedAtoms(qbb
,bb
));
2230 VG_(tool_panic
)("expensiveAddSub");
2233 // a_min = aa & ~qaa
2234 a_min
= assignNew('V', mce
,ty
,
2236 assignNew('V', mce
,ty
, unop(opNOT
, qaa
))));
2238 // b_min = bb & ~qbb
2239 b_min
= assignNew('V', mce
,ty
,
2241 assignNew('V', mce
,ty
, unop(opNOT
, qbb
))));
2244 a_max
= assignNew('V', mce
,ty
, binop(opOR
, aa
, qaa
));
2247 b_max
= assignNew('V', mce
,ty
, binop(opOR
, bb
, qbb
));
2250 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2252 assignNew('V', mce
,ty
,
2254 assignNew('V', mce
,ty
, binop(opOR
, qaa
, qbb
)),
2255 assignNew('V', mce
,ty
,
2257 assignNew('V', mce
,ty
, binop(opADD
, a_min
, b_min
)),
2258 assignNew('V', mce
,ty
, binop(opADD
, a_max
, b_max
))
2264 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2266 assignNew('V', mce
,ty
,
2268 assignNew('V', mce
,ty
, binop(opOR
, qaa
, qbb
)),
2269 assignNew('V', mce
,ty
,
2271 assignNew('V', mce
,ty
, binop(opSUB
, a_min
, b_max
)),
2272 assignNew('V', mce
,ty
, binop(opSUB
, a_max
, b_min
))
2283 IRAtom
* expensiveCountTrailingZeroes ( MCEnv
* mce
, IROp czop
,
2284 IRAtom
* atom
, IRAtom
* vatom
)
2287 IROp xorOp
, subOp
, andOp
;
2289 IRAtom
*improver
, *improved
;
2290 tl_assert(isShadowAtom(mce
,vatom
));
2291 tl_assert(isOriginalAtom(mce
,atom
));
2292 tl_assert(sameKindedAtoms(atom
,vatom
));
2295 case Iop_Ctz32
: case Iop_CtzNat32
:
2302 case Iop_Ctz64
: case Iop_CtzNat64
:
2311 VG_(tool_panic
)("memcheck:expensiveCountTrailingZeroes");
2314 // improver = atom ^ (atom - 1)
2316 // That is, improver has its low ctz(atom)+1 bits equal to one;
2317 // higher bits (if any) equal to zero. So it's exactly the right
2318 // mask to use to remove the irrelevant undefined input bits.
2319 /* Here are some examples:
2320 atom = U...U 1 0...0
2321 atom-1 = U...U 0 1...1
2322 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2323 actually influence the result
2327 ^ed = 11111, also a correct mask for the input: all input bits
2329 Another boundary case
2332 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2334 Now with misc U bits interspersed:
2335 atom = U...U 1 0 U...U 0 1 0...0
2336 atom-1 = U...U 1 0 U...U 0 0 1...1
2337 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2338 (Per re-check/analysis of 14 Nov 2018)
2340 improver
= assignNew('V', mce
,ty
,
2343 assignNew('V', mce
, ty
,
2344 binop(subOp
, atom
, one
))));
2346 // improved = vatom & improver
2348 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2349 // bits as "defined".
2350 improved
= assignNew('V', mce
, ty
,
2351 binop(andOp
, vatom
, improver
));
2353 // Return pessimizing cast of improved.
2354 return mkPCastTo(mce
, ty
, improved
);
2358 IRAtom
* expensiveCountLeadingZeroes ( MCEnv
* mce
, IROp czop
,
2359 IRAtom
* atom
, IRAtom
* vatom
)
2362 IROp shrOp
, notOp
, andOp
;
2363 IRAtom
* (*mkRight
)(MCEnv
*, IRAtom
*);
2364 IRAtom
*improver
, *improved
;
2365 tl_assert(isShadowAtom(mce
,vatom
));
2366 tl_assert(isOriginalAtom(mce
,atom
));
2367 tl_assert(sameKindedAtoms(atom
,vatom
));
2370 case Iop_Clz32
: case Iop_ClzNat32
:
2375 mkRight
= mkRight32
;
2377 case Iop_Clz64
: case Iop_ClzNat64
:
2382 mkRight
= mkRight64
;
2386 VG_(tool_panic
)("memcheck:expensiveCountLeadingZeroes");
2389 // This is in principle very similar to how expensiveCountTrailingZeroes
2390 // works. That function computed an "improver", which it used to mask
2391 // off all but the rightmost 1-bit and the zeroes to the right of it,
2392 // hence removing irrelevant bits from the input. Here, we play the
2393 // exact same game but with the left-vs-right roles interchanged.
2394 // Unfortunately calculation of the improver in this case is
2395 // significantly more expensive.
2397 // improver = ~(RIGHT(atom) >>u 1)
2399 // That is, improver has its upper clz(atom)+1 bits equal to one;
2400 // lower bits (if any) equal to zero. So it's exactly the right
2401 // mask to use to remove the irrelevant undefined input bits.
2402 /* Here are some examples:
2403 atom = 0...0 1 U...U
2404 R(atom) = 0...0 1 1...1
2405 R(atom) >>u 1 = 0...0 0 1...1
2406 ~(R(atom) >>u 1) = 1...1 1 0...0
2407 which correctly describes which bits of |atom|
2408 actually influence the result
2412 R(atom) >>u 1 = 0...0
2413 ~(R(atom) >>u 1) = 1...1
2414 also a correct mask for the input: all input bits
2416 Another boundary case
2419 R(atom) >>u 1 = 0 1..1
2420 ~(R(atom) >>u 1) = 1 0..0
2421 also a correct mask: only the leftmost input bit
2423 Now with misc U bits interspersed:
2424 atom = 0...0 1 U...U 0 1 U...U
2425 R(atom) = 0...0 1 1...1 1 1 1...1
2426 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2427 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2428 (Per initial implementation of 15 Nov 2018)
2430 improver
= mkRight(mce
, atom
);
2431 improver
= assignNew('V', mce
, ty
, binop(shrOp
, improver
, mkU8(1)));
2432 improver
= assignNew('V', mce
, ty
, unop(notOp
, improver
));
2434 // improved = vatom & improver
2436 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2437 // bits as "defined".
2438 improved
= assignNew('V', mce
, ty
,
2439 binop(andOp
, vatom
, improver
));
2441 // Return pessimizing cast of improved.
2442 return mkPCastTo(mce
, ty
, improved
);
2446 /*------------------------------------------------------------*/
2447 /*--- Scalar shifts. ---*/
2448 /*------------------------------------------------------------*/
2450 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2451 idea is to shift the definedness bits by the original shift amount.
2452 This introduces 0s ("defined") in new positions for left shifts and
2453 unsigned right shifts, and copies the top definedness bit for
2454 signed right shifts. So, conveniently, applying the original shift
2455 operator to the definedness bits for the left arg is exactly the
2460 However if the shift amount is undefined then the whole result
2461 is undefined. Hence need:
2463 (qaa << bb) `UifU` PCast(qbb)
2465 If the shift amount bb is a literal than qbb will say 'all defined'
2466 and the UifU and PCast will get folded out by post-instrumentation
2469 static IRAtom
* scalarShift ( MCEnv
* mce
,
2472 IRAtom
* qaa
, IRAtom
* qbb
,
2473 IRAtom
* aa
, IRAtom
* bb
)
2475 tl_assert(isShadowAtom(mce
,qaa
));
2476 tl_assert(isShadowAtom(mce
,qbb
));
2477 tl_assert(isOriginalAtom(mce
,aa
));
2478 tl_assert(isOriginalAtom(mce
,bb
));
2479 tl_assert(sameKindedAtoms(qaa
,aa
));
2480 tl_assert(sameKindedAtoms(qbb
,bb
));
2485 assignNew('V', mce
, ty
, binop(original_op
, qaa
, bb
)),
2486 mkPCastTo(mce
, ty
, qbb
)
2492 /*------------------------------------------------------------*/
2493 /*--- Helpers for dealing with vector primops. ---*/
2494 /*------------------------------------------------------------*/
2496 /* Vector pessimisation -- pessimise within each lane individually. */
2498 static IRAtom
* mkPCast8x16 ( MCEnv
* mce
, IRAtom
* at
)
2500 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ8x16
, at
));
2503 static IRAtom
* mkPCast16x8 ( MCEnv
* mce
, IRAtom
* at
)
2505 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ16x8
, at
));
2508 static IRAtom
* mkPCast32x4 ( MCEnv
* mce
, IRAtom
* at
)
2510 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ32x4
, at
));
2513 static IRAtom
* mkPCast64x2 ( MCEnv
* mce
, IRAtom
* at
)
2515 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ64x2
, at
));
2518 static IRAtom
* mkPCast128x1 ( MCEnv
* mce
, IRAtom
* at
)
2520 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ128x1
, at
));
2523 static IRAtom
* mkPCast64x4 ( MCEnv
* mce
, IRAtom
* at
)
2525 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ64x4
, at
));
2528 static IRAtom
* mkPCast32x8 ( MCEnv
* mce
, IRAtom
* at
)
2530 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ32x8
, at
));
2533 static IRAtom
* mkPCast32x2 ( MCEnv
* mce
, IRAtom
* at
)
2535 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ32x2
, at
));
2538 static IRAtom
* mkPCast16x16 ( MCEnv
* mce
, IRAtom
* at
)
2540 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ16x16
, at
));
2543 static IRAtom
* mkPCast16x4 ( MCEnv
* mce
, IRAtom
* at
)
2545 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ16x4
, at
));
2548 static IRAtom
* mkPCast8x32 ( MCEnv
* mce
, IRAtom
* at
)
2550 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ8x32
, at
));
2553 static IRAtom
* mkPCast8x8 ( MCEnv
* mce
, IRAtom
* at
)
2555 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ8x8
, at
));
2558 static IRAtom
* mkPCast16x2 ( MCEnv
* mce
, IRAtom
* at
)
2560 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpNEZ16x2
, at
));
2563 static IRAtom
* mkPCast8x4 ( MCEnv
* mce
, IRAtom
* at
)
2565 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpNEZ8x4
, at
));
2569 /* Here's a simple scheme capable of handling ops derived from SSE1
2570 code and while only generating ops that can be efficiently
2571 implemented in SSE1. */
2573 /* All-lanes versions are straightforward:
2575 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2577 unary32Fx4(x,y) ==> PCast32x4(x#)
2579 Lowest-lane-only versions are more complex:
2581 binary32F0x4(x,y) ==> SetV128lo32(
2583 PCast32(V128to32(UifUV128(x#,y#)))
2586 This is perhaps not so obvious. In particular, it's faster to
2587 do a V128-bit UifU and then take the bottom 32 bits than the more
2588 obvious scheme of taking the bottom 32 bits of each operand
2589 and doing a 32-bit UifU. Basically since UifU is fast and
2590 chopping lanes off vector values is slow.
2594 unary32F0x4(x) ==> SetV128lo32(
2596 PCast32(V128to32(x#))
2601 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2602 PCast32x4(v#) = CmpNEZ32x4(v#)
2606 IRAtom
* binary32Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2609 tl_assert(isShadowAtom(mce
, vatomX
));
2610 tl_assert(isShadowAtom(mce
, vatomY
));
2611 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2612 at
= assignNew('V', mce
, Ity_V128
, mkPCast32x4(mce
, at
));
2617 IRAtom
* unary32Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2620 tl_assert(isShadowAtom(mce
, vatomX
));
2621 at
= assignNew('V', mce
, Ity_V128
, mkPCast32x4(mce
, vatomX
));
2626 IRAtom
* binary32F0x4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2629 tl_assert(isShadowAtom(mce
, vatomX
));
2630 tl_assert(isShadowAtom(mce
, vatomY
));
2631 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2632 at
= assignNew('V', mce
, Ity_I32
, unop(Iop_V128to32
, at
));
2633 at
= mkPCastTo(mce
, Ity_I32
, at
);
2634 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo32
, vatomX
, at
));
2639 IRAtom
* unary32F0x4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2642 tl_assert(isShadowAtom(mce
, vatomX
));
2643 at
= assignNew('V', mce
, Ity_I32
, unop(Iop_V128to32
, vatomX
));
2644 at
= mkPCastTo(mce
, Ity_I32
, at
);
2645 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo32
, vatomX
, at
));
2649 /* --- ... and ... 64Fx2 versions of the same ... --- */
2652 IRAtom
* binary64Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2655 tl_assert(isShadowAtom(mce
, vatomX
));
2656 tl_assert(isShadowAtom(mce
, vatomY
));
2657 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2658 at
= assignNew('V', mce
, Ity_V128
, mkPCast64x2(mce
, at
));
2663 IRAtom
* unary64Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2666 tl_assert(isShadowAtom(mce
, vatomX
));
2667 at
= assignNew('V', mce
, Ity_V128
, mkPCast64x2(mce
, vatomX
));
2672 IRAtom
* binary64F0x2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2675 tl_assert(isShadowAtom(mce
, vatomX
));
2676 tl_assert(isShadowAtom(mce
, vatomY
));
2677 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2678 at
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, at
));
2679 at
= mkPCastTo(mce
, Ity_I64
, at
);
2680 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo64
, vatomX
, at
));
2685 IRAtom
* unary64F0x2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2688 tl_assert(isShadowAtom(mce
, vatomX
));
2689 at
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vatomX
));
2690 at
= mkPCastTo(mce
, Ity_I64
, at
);
2691 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo64
, vatomX
, at
));
2695 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2698 IRAtom
* binary32Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2701 tl_assert(isShadowAtom(mce
, vatomX
));
2702 tl_assert(isShadowAtom(mce
, vatomY
));
2703 at
= mkUifU64(mce
, vatomX
, vatomY
);
2704 at
= assignNew('V', mce
, Ity_I64
, mkPCast32x2(mce
, at
));
2709 IRAtom
* unary32Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2712 tl_assert(isShadowAtom(mce
, vatomX
));
2713 at
= assignNew('V', mce
, Ity_I64
, mkPCast32x2(mce
, vatomX
));
2717 /* --- ... and ... 64Fx4 versions of the same ... --- */
2720 IRAtom
* binary64Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2723 tl_assert(isShadowAtom(mce
, vatomX
));
2724 tl_assert(isShadowAtom(mce
, vatomY
));
2725 at
= mkUifUV256(mce
, vatomX
, vatomY
);
2726 at
= assignNew('V', mce
, Ity_V256
, mkPCast64x4(mce
, at
));
2731 IRAtom
* unary64Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2734 tl_assert(isShadowAtom(mce
, vatomX
));
2735 at
= assignNew('V', mce
, Ity_V256
, mkPCast64x4(mce
, vatomX
));
2739 /* --- ... and ... 32Fx8 versions of the same ... --- */
2742 IRAtom
* binary32Fx8 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2745 tl_assert(isShadowAtom(mce
, vatomX
));
2746 tl_assert(isShadowAtom(mce
, vatomY
));
2747 at
= mkUifUV256(mce
, vatomX
, vatomY
);
2748 at
= assignNew('V', mce
, Ity_V256
, mkPCast32x8(mce
, at
));
2753 IRAtom
* unary32Fx8 ( MCEnv
* mce
, IRAtom
* vatomX
)
2756 tl_assert(isShadowAtom(mce
, vatomX
));
2757 at
= assignNew('V', mce
, Ity_V256
, mkPCast32x8(mce
, vatomX
));
2761 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2764 IRAtom
* binary64Fx2_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2765 IRAtom
* vatomX
, IRAtom
* vatomY
)
2767 /* This is the same as binary64Fx2, except that we subsequently
2768 pessimise vRM (definedness of the rounding mode), widen to 128
2769 bits and UifU it into the result. As with the scalar cases, if
2770 the RM is a constant then it is defined and so this extra bit
2771 will get constant-folded out later. */
2772 // "do" the vector args
2773 IRAtom
* t1
= binary64Fx2(mce
, vatomX
, vatomY
);
2774 // PCast the RM, and widen it to 128 bits
2775 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2776 // Roll it into the result
2777 t1
= mkUifUV128(mce
, t1
, t2
);
2781 /* --- ... and ... 32Fx4 versions of the same --- */
2784 IRAtom
* binary32Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2785 IRAtom
* vatomX
, IRAtom
* vatomY
)
2787 IRAtom
* t1
= binary32Fx4(mce
, vatomX
, vatomY
);
2788 // PCast the RM, and widen it to 128 bits
2789 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2790 // Roll it into the result
2791 t1
= mkUifUV128(mce
, t1
, t2
);
2795 /* --- ... and ... 64Fx4 versions of the same --- */
2798 IRAtom
* binary64Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2799 IRAtom
* vatomX
, IRAtom
* vatomY
)
2801 IRAtom
* t1
= binary64Fx4(mce
, vatomX
, vatomY
);
2802 // PCast the RM, and widen it to 256 bits
2803 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2804 // Roll it into the result
2805 t1
= mkUifUV256(mce
, t1
, t2
);
2809 /* --- ... and ... 32Fx8 versions of the same --- */
2812 IRAtom
* binary32Fx8_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2813 IRAtom
* vatomX
, IRAtom
* vatomY
)
2815 IRAtom
* t1
= binary32Fx8(mce
, vatomX
, vatomY
);
2816 // PCast the RM, and widen it to 256 bits
2817 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2818 // Roll it into the result
2819 t1
= mkUifUV256(mce
, t1
, t2
);
2823 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2826 IRAtom
* unary64Fx2_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2828 /* Same scheme as binary64Fx2_w_rm. */
2829 // "do" the vector arg
2830 IRAtom
* t1
= unary64Fx2(mce
, vatomX
);
2831 // PCast the RM, and widen it to 128 bits
2832 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2833 // Roll it into the result
2834 t1
= mkUifUV128(mce
, t1
, t2
);
2838 /* --- ... and ... 32Fx4 versions of the same --- */
2841 IRAtom
* unary32Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2843 /* Same scheme as binaryFx4_w_rm. */
2844 IRAtom
* t1
= unary32Fx4(mce
, vatomX
);
2845 // PCast the RM, and widen it to 128 bits
2846 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2847 // Roll it into the result
2848 t1
= mkUifUV128(mce
, t1
, t2
);
2852 /* --- ... and ... 32Fx8 versions of the same --- */
2855 IRAtom
* unary32Fx8_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2857 /* Same scheme as unary32Fx8_w_rm. */
2858 IRAtom
* t1
= unary32Fx8(mce
, vatomX
);
2859 // PCast the RM, and widen it to 256 bits
2860 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2861 // Roll it into the result
2862 t1
= mkUifUV256(mce
, t1
, t2
);
2867 /* --- --- Vector saturated narrowing --- --- */
2869 /* We used to do something very clever here, but on closer inspection
2870 (2011-Jun-15), and in particular bug #279698, it turns out to be
2871 wrong. Part of the problem came from the fact that for a long
2872 time, the IR primops to do with saturated narrowing were
2873 underspecified and managed to confuse multiple cases which needed
2874 to be separate: the op names had a signedness qualifier, but in
2875 fact the source and destination signednesses needed to be specified
2876 independently, so the op names really need two independent
2877 signedness specifiers.
2879 As of 2011-Jun-15 (ish) the underspecification was sorted out
2880 properly. The incorrect instrumentation remained, though. That
2881 has now (2011-Oct-22) been fixed.
2883 What we now do is simple:
2885 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2886 number of lanes, X is the source lane width and signedness, and Y
2887 is the destination lane width and signedness. In all cases the
2888 destination lane width is half the source lane width, so the names
2889 have a bit of redundancy, but are at least easy to read.
2891 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2894 Let Vanilla(OP) be a function that takes OP, one of these
2895 saturating narrowing ops, and produces the same "shaped" narrowing
2896 op which is not saturating, but merely dumps the most significant
2897 bits. "same shape" means that the lane numbers and widths are the
2900 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2901 = Iop_NarrowBin32to16x8,
2902 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2903 dumping the top half of each lane.
2905 So, with that in place, the scheme is simple, and it is simple to
2906 pessimise each lane individually and then apply Vanilla(OP) so as
2907 to get the result in the right "shape". If the original OP is
2908 QNarrowBinXtoYxZ then we produce
2910 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2912 or for the case when OP is unary (Iop_QNarrowUn*)
2914 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2917 IROp
vanillaNarrowingOpOfShape ( IROp qnarrowOp
)
2919 switch (qnarrowOp
) {
2920 /* Binary: (128, 128) -> 128 */
2921 case Iop_QNarrowBin16Sto8Ux16
:
2922 case Iop_QNarrowBin16Sto8Sx16
:
2923 case Iop_QNarrowBin16Uto8Ux16
:
2924 case Iop_QNarrowBin64Sto32Sx4
:
2925 case Iop_QNarrowBin64Uto32Ux4
:
2926 return Iop_NarrowBin16to8x16
;
2927 case Iop_QNarrowBin32Sto16Ux8
:
2928 case Iop_QNarrowBin32Sto16Sx8
:
2929 case Iop_QNarrowBin32Uto16Ux8
:
2930 return Iop_NarrowBin32to16x8
;
2931 /* Binary: (64, 64) -> 64 */
2932 case Iop_QNarrowBin32Sto16Sx4
:
2933 return Iop_NarrowBin32to16x4
;
2934 case Iop_QNarrowBin16Sto8Ux8
:
2935 case Iop_QNarrowBin16Sto8Sx8
:
2936 return Iop_NarrowBin16to8x8
;
2937 /* Unary: 128 -> 64 */
2938 case Iop_QNarrowUn64Uto32Ux2
:
2939 case Iop_QNarrowUn64Sto32Sx2
:
2940 case Iop_QNarrowUn64Sto32Ux2
:
2941 return Iop_NarrowUn64to32x2
;
2942 case Iop_QNarrowUn32Uto16Ux4
:
2943 case Iop_QNarrowUn32Sto16Sx4
:
2944 case Iop_QNarrowUn32Sto16Ux4
:
2945 case Iop_F32toF16x4_DEP
:
2946 return Iop_NarrowUn32to16x4
;
2947 case Iop_QNarrowUn16Uto8Ux8
:
2948 case Iop_QNarrowUn16Sto8Sx8
:
2949 case Iop_QNarrowUn16Sto8Ux8
:
2950 return Iop_NarrowUn16to8x8
;
2953 VG_(tool_panic
)("vanillaNarrowOpOfShape");
2958 IRAtom
* vectorNarrowBinV128 ( MCEnv
* mce
, IROp narrow_op
,
2959 IRAtom
* vatom1
, IRAtom
* vatom2
)
2961 IRAtom
*at1
, *at2
, *at3
;
2962 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
2963 switch (narrow_op
) {
2964 case Iop_QNarrowBin64Sto32Sx4
: pcast
= mkPCast32x4
; break;
2965 case Iop_QNarrowBin64Uto32Ux4
: pcast
= mkPCast32x4
; break;
2966 case Iop_QNarrowBin32Sto16Sx8
: pcast
= mkPCast32x4
; break;
2967 case Iop_QNarrowBin32Uto16Ux8
: pcast
= mkPCast32x4
; break;
2968 case Iop_QNarrowBin32Sto16Ux8
: pcast
= mkPCast32x4
; break;
2969 case Iop_QNarrowBin16Sto8Sx16
: pcast
= mkPCast16x8
; break;
2970 case Iop_QNarrowBin16Uto8Ux16
: pcast
= mkPCast16x8
; break;
2971 case Iop_QNarrowBin16Sto8Ux16
: pcast
= mkPCast16x8
; break;
2972 default: VG_(tool_panic
)("vectorNarrowBinV128");
2974 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
2975 tl_assert(isShadowAtom(mce
,vatom1
));
2976 tl_assert(isShadowAtom(mce
,vatom2
));
2977 at1
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom1
));
2978 at2
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom2
));
2979 at3
= assignNew('V', mce
, Ity_V128
, binop(vanilla_narrow
, at1
, at2
));
2984 IRAtom
* vectorNarrowBin64 ( MCEnv
* mce
, IROp narrow_op
,
2985 IRAtom
* vatom1
, IRAtom
* vatom2
)
2987 IRAtom
*at1
, *at2
, *at3
;
2988 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
2989 switch (narrow_op
) {
2990 case Iop_QNarrowBin32Sto16Sx4
: pcast
= mkPCast32x2
; break;
2991 case Iop_QNarrowBin16Sto8Sx8
: pcast
= mkPCast16x4
; break;
2992 case Iop_QNarrowBin16Sto8Ux8
: pcast
= mkPCast16x4
; break;
2993 default: VG_(tool_panic
)("vectorNarrowBin64");
2995 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
2996 tl_assert(isShadowAtom(mce
,vatom1
));
2997 tl_assert(isShadowAtom(mce
,vatom2
));
2998 at1
= assignNew('V', mce
, Ity_I64
, pcast(mce
, vatom1
));
2999 at2
= assignNew('V', mce
, Ity_I64
, pcast(mce
, vatom2
));
3000 at3
= assignNew('V', mce
, Ity_I64
, binop(vanilla_narrow
, at1
, at2
));
3005 IRAtom
* vectorNarrowUnV128 ( MCEnv
* mce
, IROp narrow_op
,
3009 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
3010 tl_assert(isShadowAtom(mce
,vatom1
));
3011 /* For vanilla narrowing (non-saturating), we can just apply
3012 the op directly to the V bits. */
3013 switch (narrow_op
) {
3014 case Iop_NarrowUn16to8x8
:
3015 case Iop_NarrowUn32to16x4
:
3016 case Iop_NarrowUn64to32x2
:
3017 case Iop_F32toF16x4_DEP
:
3018 at1
= assignNew('V', mce
, Ity_I64
, unop(narrow_op
, vatom1
));
3021 break; /* Do Plan B */
3023 /* Plan B: for ops that involve a saturation operation on the args,
3024 we must PCast before the vanilla narrow. */
3025 switch (narrow_op
) {
3026 case Iop_QNarrowUn16Sto8Sx8
: pcast
= mkPCast16x8
; break;
3027 case Iop_QNarrowUn16Sto8Ux8
: pcast
= mkPCast16x8
; break;
3028 case Iop_QNarrowUn16Uto8Ux8
: pcast
= mkPCast16x8
; break;
3029 case Iop_QNarrowUn32Sto16Sx4
: pcast
= mkPCast32x4
; break;
3030 case Iop_QNarrowUn32Sto16Ux4
: pcast
= mkPCast32x4
; break;
3031 case Iop_QNarrowUn32Uto16Ux4
: pcast
= mkPCast32x4
; break;
3032 case Iop_QNarrowUn64Sto32Sx2
: pcast
= mkPCast64x2
; break;
3033 case Iop_QNarrowUn64Sto32Ux2
: pcast
= mkPCast64x2
; break;
3034 case Iop_QNarrowUn64Uto32Ux2
: pcast
= mkPCast64x2
; break;
3035 default: VG_(tool_panic
)("vectorNarrowUnV128");
3037 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
3038 at1
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom1
));
3039 at2
= assignNew('V', mce
, Ity_I64
, unop(vanilla_narrow
, at1
));
3044 IRAtom
* vectorWidenI64 ( MCEnv
* mce
, IROp longen_op
,
3048 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
3049 switch (longen_op
) {
3050 case Iop_Widen8Uto16x8
: pcast
= mkPCast16x8
; break;
3051 case Iop_Widen8Sto16x8
: pcast
= mkPCast16x8
; break;
3052 case Iop_Widen16Uto32x4
: pcast
= mkPCast32x4
; break;
3053 case Iop_Widen16Sto32x4
: pcast
= mkPCast32x4
; break;
3054 case Iop_Widen32Uto64x2
: pcast
= mkPCast64x2
; break;
3055 case Iop_Widen32Sto64x2
: pcast
= mkPCast64x2
; break;
3056 case Iop_F16toF32x4
: pcast
= mkPCast32x4
; break;
3057 default: VG_(tool_panic
)("vectorWidenI64");
3059 tl_assert(isShadowAtom(mce
,vatom1
));
3060 at1
= assignNew('V', mce
, Ity_V128
, unop(longen_op
, vatom1
));
3061 at2
= assignNew('V', mce
, Ity_V128
, pcast(mce
, at1
));
3066 /* --- --- Vector integer arithmetic --- --- */
3068 /* Simple ... UifU the args and per-lane pessimise the results. */
3070 /* --- V256-bit versions --- */
3073 IRAtom
* binary8Ix32 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3076 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3077 at
= mkPCast8x32(mce
, at
);
3082 IRAtom
* binary16Ix16 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3085 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3086 at
= mkPCast16x16(mce
, at
);
3091 IRAtom
* binary32Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3094 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3095 at
= mkPCast32x8(mce
, at
);
3100 IRAtom
* binary64Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3103 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3104 at
= mkPCast64x4(mce
, at
);
3108 /* --- V128-bit versions --- */
3111 IRAtom
* binary8Ix16 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3114 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3115 at
= mkPCast8x16(mce
, at
);
3120 IRAtom
* binary16Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3123 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3124 at
= mkPCast16x8(mce
, at
);
3129 IRAtom
* binary32Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3132 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3133 at
= mkPCast32x4(mce
, at
);
3138 IRAtom
* binary64Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3141 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3142 at
= mkPCast64x2(mce
, at
);
3147 IRAtom
* binary128Ix1 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3150 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3151 at
= mkPCast128x1(mce
, at
);
3155 /* --- 64-bit versions --- */
3158 IRAtom
* binary8Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3161 at
= mkUifU64(mce
, vatom1
, vatom2
);
3162 at
= mkPCast8x8(mce
, at
);
3167 IRAtom
* binary16Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3170 at
= mkUifU64(mce
, vatom1
, vatom2
);
3171 at
= mkPCast16x4(mce
, at
);
3176 IRAtom
* binary32Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3179 at
= mkUifU64(mce
, vatom1
, vatom2
);
3180 at
= mkPCast32x2(mce
, at
);
3185 IRAtom
* binary64Ix1 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3188 at
= mkUifU64(mce
, vatom1
, vatom2
);
3189 at
= mkPCastTo(mce
, Ity_I64
, at
);
3193 /* --- 32-bit versions --- */
3196 IRAtom
* binary8Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3199 at
= mkUifU32(mce
, vatom1
, vatom2
);
3200 at
= mkPCast8x4(mce
, at
);
3205 IRAtom
* binary16Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3208 at
= mkUifU32(mce
, vatom1
, vatom2
);
3209 at
= mkPCast16x2(mce
, at
);
3214 /*------------------------------------------------------------*/
3215 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3216 /*------------------------------------------------------------*/
3219 IRAtom
* expr2vbits_Qop ( MCEnv
* mce
,
3221 IRAtom
* atom1
, IRAtom
* atom2
,
3222 IRAtom
* atom3
, IRAtom
* atom4
)
3224 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3225 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3226 IRAtom
* vatom3
= expr2vbits( mce
, atom3
, HuOth
);
3227 IRAtom
* vatom4
= expr2vbits( mce
, atom4
, HuOth
);
3229 tl_assert(isOriginalAtom(mce
,atom1
));
3230 tl_assert(isOriginalAtom(mce
,atom2
));
3231 tl_assert(isOriginalAtom(mce
,atom3
));
3232 tl_assert(isOriginalAtom(mce
,atom4
));
3233 tl_assert(isShadowAtom(mce
,vatom1
));
3234 tl_assert(isShadowAtom(mce
,vatom2
));
3235 tl_assert(isShadowAtom(mce
,vatom3
));
3236 tl_assert(isShadowAtom(mce
,vatom4
));
3237 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3238 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3239 tl_assert(sameKindedAtoms(atom3
,vatom3
));
3240 tl_assert(sameKindedAtoms(atom4
,vatom4
));
3243 case Iop_MAddF64r32
:
3245 case Iop_MSubF64r32
:
3246 /* I32(rm) x F64 x F64 x F64 -> F64 */
3247 return mkLazy4(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
, vatom4
);
3251 /* I32(rm) x F32 x F32 x F32 -> F32 */
3252 return mkLazy4(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
, vatom4
);
3256 case Iop_NegMAddF128
:
3257 case Iop_NegMSubF128
:
3258 /* I32(rm) x F128 x F128 x F128 -> F128 */
3259 return mkLazy4(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
, vatom4
);
3261 /* V256-bit data-steering */
3262 case Iop_64x4toV256
:
3263 return assignNew('V', mce
, Ity_V256
,
3264 IRExpr_Qop(op
, vatom1
, vatom2
, vatom3
, vatom4
));
3266 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3268 return mkLazy4(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
, vatom4
);
3270 return mkLazy4(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
, vatom4
);
3273 VG_(tool_panic
)("memcheck:expr2vbits_Qop");
3279 IRAtom
* expr2vbits_Triop ( MCEnv
* mce
,
3281 IRAtom
* atom1
, IRAtom
* atom2
, IRAtom
* atom3
)
3283 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3284 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3285 IRAtom
* vatom3
= expr2vbits( mce
, atom3
, HuOth
);
3287 tl_assert(isOriginalAtom(mce
,atom1
));
3288 tl_assert(isOriginalAtom(mce
,atom2
));
3289 tl_assert(isOriginalAtom(mce
,atom3
));
3290 tl_assert(isShadowAtom(mce
,vatom1
));
3291 tl_assert(isShadowAtom(mce
,vatom2
));
3292 tl_assert(isShadowAtom(mce
,vatom3
));
3293 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3294 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3295 tl_assert(sameKindedAtoms(atom3
,vatom3
));
3305 case Iop_QuantizeD128
:
3306 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3307 return mkLazy3(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
);
3326 case Iop_QuantizeD64
:
3327 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3328 return mkLazy3(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
);
3329 case Iop_PRemC3210F64
:
3330 case Iop_PRem1C3210F64
:
3331 /* I32(rm) x F64 x F64 -> I32 */
3332 return mkLazy3(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
);
3337 /* I32(rm) x F32 x F32 -> I32 */
3338 return mkLazy3(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
);
3339 case Iop_SignificanceRoundD64
:
3340 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3341 return mkLazy3(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
);
3342 case Iop_SignificanceRoundD128
:
3343 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3344 return mkLazy3(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
);
3346 /* (V128, V128, I8) -> V128 */
3347 complainIfUndefined(mce
, atom3
, NULL
);
3348 return assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, vatom2
, atom3
));
3350 /* (I64, I64, I8) -> I64 */
3351 complainIfUndefined(mce
, atom3
, NULL
);
3352 return assignNew('V', mce
, Ity_I64
, triop(op
, vatom1
, vatom2
, atom3
));
3353 case Iop_SetElem8x8
:
3354 case Iop_SetElem16x4
:
3355 case Iop_SetElem32x2
:
3356 complainIfUndefined(mce
, atom2
, NULL
);
3357 return assignNew('V', mce
, Ity_I64
, triop(op
, vatom1
, atom2
, vatom3
));
3359 case Iop_SetElem8x16
:
3360 case Iop_SetElem16x8
:
3361 case Iop_SetElem32x4
:
3362 case Iop_SetElem64x2
:
3363 complainIfUndefined(mce
, atom2
, NULL
);
3364 return assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, atom2
, vatom3
));
3366 case Iop_Perm8x16x2
:
3367 /* (V128, V128, V128) -> V128 */
3368 complainIfUndefined(mce
, atom3
, NULL
);
3371 assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, vatom2
, atom3
)),
3372 mkPCast8x16(mce
, vatom3
)
3375 /* Vector FP with rounding mode as the first arg */
3380 case Iop_Scale2_64Fx2
:
3381 return binary64Fx2_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3387 case Iop_Scale2_32Fx4
:
3388 return binary32Fx4_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3394 return binary64Fx4_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3400 return binary32Fx8_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3402 case Iop_F32x4_2toQ16x8
:
3403 return assignNew('V', mce
, Ity_V128
,
3404 binop(Iop_PackEvenLanes16x8
,
3405 unary32Fx4_w_rm(mce
, vatom1
, vatom2
),
3406 unary32Fx4_w_rm(mce
, vatom1
, vatom3
)));
3407 case Iop_F64x2_2toQ32x4
:
3408 return assignNew('V', mce
, Ity_V128
,
3409 binop(Iop_PackEvenLanes32x4
,
3410 unary64Fx2_w_rm(mce
, vatom1
, vatom2
),
3411 unary64Fx2_w_rm(mce
, vatom1
, vatom3
)));
3416 VG_(tool_panic
)("memcheck:expr2vbits_Triop");
3422 IRAtom
* expr2vbits_Binop ( MCEnv
* mce
,
3424 IRAtom
* atom1
, IRAtom
* atom2
,
3425 HowUsed hu
/*use HuOth if unknown*/ )
3427 IRType and_or_ty
= Ity_INVALID
;
3428 IRAtom
* (*uifu
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3429 IRAtom
* (*difd
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3430 IRAtom
* (*improve
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3432 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3433 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3435 tl_assert(isOriginalAtom(mce
,atom1
));
3436 tl_assert(isOriginalAtom(mce
,atom2
));
3437 tl_assert(isShadowAtom(mce
,vatom1
));
3438 tl_assert(isShadowAtom(mce
,vatom2
));
3439 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3440 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3455 return binary16Ix2(mce
, vatom1
, vatom2
);
3467 return binary8Ix4(mce
, vatom1
, vatom2
);
3480 /* Same scheme as with all other shifts. */
3481 complainIfUndefined(mce
, atom2
, NULL
);
3482 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
));
3484 case Iop_QNarrowBin32Sto16Sx4
:
3485 case Iop_QNarrowBin16Sto8Sx8
:
3486 case Iop_QNarrowBin16Sto8Ux8
:
3487 return vectorNarrowBin64(mce
, op
, vatom1
, vatom2
);
3506 case Iop_PolynomialMul8x8
:
3507 return binary8Ix8(mce
, vatom1
, vatom2
);
3518 case Iop_MulHi16Sx4
:
3519 case Iop_MulHi16Ux4
:
3520 case Iop_CmpGT16Sx4
:
3521 case Iop_CmpGT16Ux4
:
3528 case Iop_QDMulHi16Sx4
:
3529 case Iop_QRDMulHi16Sx4
:
3530 return binary16Ix4(mce
, vatom1
, vatom2
);
3538 case Iop_CmpGT32Sx2
:
3539 case Iop_CmpGT32Ux2
:
3548 case Iop_QDMulHi32Sx2
:
3549 case Iop_QRDMulHi32Sx2
:
3550 return binary32Ix2(mce
, vatom1
, vatom2
);
3559 return binary64Ix1(mce
, vatom1
, vatom2
);
3561 case Iop_QShlNsatSU8x8
:
3562 case Iop_QShlNsatUU8x8
:
3563 case Iop_QShlNsatSS8x8
:
3564 complainIfUndefined(mce
, atom2
, NULL
);
3565 return mkPCast8x8(mce
, vatom1
);
3567 case Iop_QShlNsatSU16x4
:
3568 case Iop_QShlNsatUU16x4
:
3569 case Iop_QShlNsatSS16x4
:
3570 complainIfUndefined(mce
, atom2
, NULL
);
3571 return mkPCast16x4(mce
, vatom1
);
3573 case Iop_QShlNsatSU32x2
:
3574 case Iop_QShlNsatUU32x2
:
3575 case Iop_QShlNsatSS32x2
:
3576 complainIfUndefined(mce
, atom2
, NULL
);
3577 return mkPCast32x2(mce
, vatom1
);
3579 case Iop_QShlNsatSU64x1
:
3580 case Iop_QShlNsatUU64x1
:
3581 case Iop_QShlNsatSS64x1
:
3582 complainIfUndefined(mce
, atom2
, NULL
);
3583 return mkPCast32x2(mce
, vatom1
);
3585 case Iop_PwMax32Sx2
:
3586 case Iop_PwMax32Ux2
:
3587 case Iop_PwMin32Sx2
:
3588 case Iop_PwMin32Ux2
:
3589 case Iop_PwMax32Fx2
:
3590 case Iop_PwMin32Fx2
:
3591 return assignNew('V', mce
, Ity_I64
,
3592 binop(Iop_PwMax32Ux2
,
3593 mkPCast32x2(mce
, vatom1
),
3594 mkPCast32x2(mce
, vatom2
)));
3596 case Iop_PwMax16Sx4
:
3597 case Iop_PwMax16Ux4
:
3598 case Iop_PwMin16Sx4
:
3599 case Iop_PwMin16Ux4
:
3600 return assignNew('V', mce
, Ity_I64
,
3601 binop(Iop_PwMax16Ux4
,
3602 mkPCast16x4(mce
, vatom1
),
3603 mkPCast16x4(mce
, vatom2
)));
3609 return assignNew('V', mce
, Ity_I64
,
3610 binop(Iop_PwMax8Ux8
,
3611 mkPCast8x8(mce
, vatom1
),
3612 mkPCast8x8(mce
, vatom2
)));
3615 case Iop_PwAdd32Fx2
:
3616 return mkPCast32x2(mce
,
3617 assignNew('V', mce
, Ity_I64
,
3618 binop(Iop_PwAdd32x2
,
3619 mkPCast32x2(mce
, vatom1
),
3620 mkPCast32x2(mce
, vatom2
))));
3623 return mkPCast16x4(mce
,
3624 assignNew('V', mce
, Ity_I64
,
3625 binop(op
, mkPCast16x4(mce
, vatom1
),
3626 mkPCast16x4(mce
, vatom2
))));
3629 return mkPCast8x8(mce
,
3630 assignNew('V', mce
, Ity_I64
,
3631 binop(op
, mkPCast8x8(mce
, vatom1
),
3632 mkPCast8x8(mce
, vatom2
))));
3638 return mkUifU64(mce
,
3639 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3640 mkPCast8x8(mce
,vatom2
)
3647 return mkUifU64(mce
,
3648 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3649 mkPCast16x4(mce
,vatom2
)
3656 return mkUifU64(mce
,
3657 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3658 mkPCast32x2(mce
,vatom2
)
3661 /* 64-bit data-steering */
3662 case Iop_InterleaveLO32x2
:
3663 case Iop_InterleaveLO16x4
:
3664 case Iop_InterleaveLO8x8
:
3665 case Iop_InterleaveHI32x2
:
3666 case Iop_InterleaveHI16x4
:
3667 case Iop_InterleaveHI8x8
:
3668 case Iop_CatOddLanes8x8
:
3669 case Iop_CatEvenLanes8x8
:
3670 case Iop_CatOddLanes16x4
:
3671 case Iop_CatEvenLanes16x4
:
3672 case Iop_InterleaveOddLanes8x8
:
3673 case Iop_InterleaveEvenLanes8x8
:
3674 case Iop_InterleaveOddLanes16x4
:
3675 case Iop_InterleaveEvenLanes16x4
:
3676 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, vatom2
));
3678 case Iop_GetElem8x8
:
3679 complainIfUndefined(mce
, atom2
, NULL
);
3680 return assignNew('V', mce
, Ity_I8
, binop(op
, vatom1
, atom2
));
3681 case Iop_GetElem16x4
:
3682 complainIfUndefined(mce
, atom2
, NULL
);
3683 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, atom2
));
3684 case Iop_GetElem32x2
:
3685 complainIfUndefined(mce
, atom2
, NULL
);
3686 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, atom2
));
3688 /* Perm8x8: rearrange values in left arg using steering values from
3689 right arg. So rearrange the vbits in the same way but pessimise wrt
3690 steering values. We assume that unused bits in the steering value
3691 are defined zeros, so we can safely PCast within each lane of the the
3692 steering value without having to take precautions to avoid a
3693 dependency on those unused bits.
3695 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3696 each lane, if bit 7 of the steering value is zero, then we'll steer
3697 the shadow value exactly as per Perm8x8. If that bit is one, then
3698 the operation will set the resulting (concrete) value to zero. That
3699 means it is defined, and should have a shadow value of zero. Hence
3700 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3701 as Perm8x8) and then pessimise against the steering values. */
3703 case Iop_PermOrZero8x8
:
3706 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3707 mkPCast8x8(mce
, vatom2
)
3712 case Iop_I32StoF32x4
:
3713 case Iop_F32toI32Sx4
:
3715 return unary32Fx4_w_rm(mce
, vatom1
, vatom2
);
3717 return unary64Fx2_w_rm(mce
, vatom1
, vatom2
);
3731 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3732 this is wrong now, scalar shifts are done properly lazily.
3733 Vector shifts should be fixed too. */
3734 complainIfUndefined(mce
, atom2
, NULL
);
3735 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
3737 /* V x V shifts/rotates are done using the standard lazy scheme. */
3738 /* For the non-rounding variants of bi-di vector x vector
3739 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3740 But note that this is overly pessimistic, because in fact only
3741 the bottom 8 bits of each lane of the second argument are taken
3742 into account when shifting. So really we ought to ignore
3743 undefinedness in bits 8 and above of each lane in the
3752 return mkUifUV128(mce
,
3753 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3754 mkPCast8x16(mce
,vatom2
)
3764 return mkUifUV128(mce
,
3765 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3766 mkPCast16x8(mce
,vatom2
)
3776 return mkUifUV128(mce
,
3777 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3778 mkPCast32x4(mce
,vatom2
)
3788 return mkUifUV128(mce
,
3789 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3790 mkPCast64x2(mce
,vatom2
)
3793 /* For the rounding variants of bi-di vector x vector shifts, the
3794 rounding adjustment can cause undefinedness to propagate through
3795 the entire lane, in the worst case. Too complex to handle
3796 properly .. just UifU the arguments and then PCast them.
3797 Suboptimal but safe. */
3800 return binary8Ix16(mce
, vatom1
, vatom2
);
3803 return binary16Ix8(mce
, vatom1
, vatom2
);
3806 return binary32Ix4(mce
, vatom1
, vatom2
);
3809 return binary64Ix2(mce
, vatom1
, vatom2
);
3811 case Iop_F32ToFixed32Ux4_RZ
:
3812 case Iop_F32ToFixed32Sx4_RZ
:
3813 case Iop_Fixed32UToF32x4_RN
:
3814 case Iop_Fixed32SToF32x4_RN
:
3815 complainIfUndefined(mce
, atom2
, NULL
);
3816 return mkPCast32x4(mce
, vatom1
);
3818 case Iop_F32ToFixed32Ux2_RZ
:
3819 case Iop_F32ToFixed32Sx2_RZ
:
3820 case Iop_Fixed32UToF32x2_RN
:
3821 case Iop_Fixed32SToF32x2_RN
:
3822 complainIfUndefined(mce
, atom2
, NULL
);
3823 return mkPCast32x2(mce
, vatom1
);
3832 case Iop_CmpGT8Sx16
:
3833 case Iop_CmpGT8Ux16
:
3839 case Iop_QAddExtUSsatSS8x16
:
3840 case Iop_QAddExtSUsatUU8x16
:
3845 case Iop_MulHi8Sx16
:
3846 case Iop_MulHi8Ux16
:
3847 case Iop_PolynomialMul8x16
:
3848 case Iop_PolynomialMulAdd8x16
:
3849 return binary8Ix16(mce
, vatom1
, vatom2
);
3855 case Iop_MulHi16Sx8
:
3856 case Iop_MulHi16Ux8
:
3861 case Iop_CmpGT16Sx8
:
3862 case Iop_CmpGT16Ux8
:
3868 case Iop_QAddExtUSsatSS16x8
:
3869 case Iop_QAddExtSUsatUU16x8
:
3873 case Iop_QDMulHi16Sx8
:
3874 case Iop_QRDMulHi16Sx8
:
3875 case Iop_PolynomialMulAdd16x8
:
3876 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
3877 16-bit chunk of the output is formed from corresponding 16-bit chunks
3878 of the input args, so we can treat it like an other binary 16x8
3879 operation. That's despite it having '8x16' in its name. */
3880 case Iop_PwExtUSMulQAdd8x16
:
3881 return binary16Ix8(mce
, vatom1
, vatom2
);
3884 case Iop_CmpGT32Sx4
:
3885 case Iop_CmpGT32Ux4
:
3891 case Iop_QAddExtUSsatSS32x4
:
3892 case Iop_QAddExtSUsatUU32x4
:
3903 case Iop_MulHi32Sx4
:
3904 case Iop_MulHi32Ux4
:
3905 case Iop_QDMulHi32Sx4
:
3906 case Iop_QRDMulHi32Sx4
:
3907 case Iop_PolynomialMulAdd32x4
:
3908 return binary32Ix4(mce
, vatom1
, vatom2
);
3919 case Iop_CmpGT64Sx2
:
3920 case Iop_CmpGT64Ux2
:
3927 case Iop_QAddExtUSsatSS64x2
:
3928 case Iop_QAddExtSUsatUU64x2
:
3929 case Iop_PolynomialMulAdd64x2
:
3930 case Iop_CipherV128
:
3931 case Iop_CipherLV128
:
3932 case Iop_NCipherV128
:
3933 case Iop_NCipherLV128
:
3934 case Iop_MulI128by10E
:
3935 case Iop_MulI128by10ECarry
:
3936 return binary64Ix2(mce
, vatom1
, vatom2
);
3940 case Iop_CmpNEZ128x1
:
3941 return binary128Ix1(mce
, vatom1
, vatom2
);
3943 case Iop_QNarrowBin64Sto32Sx4
:
3944 case Iop_QNarrowBin64Uto32Ux4
:
3945 case Iop_QNarrowBin32Sto16Sx8
:
3946 case Iop_QNarrowBin32Uto16Ux8
:
3947 case Iop_QNarrowBin32Sto16Ux8
:
3948 case Iop_QNarrowBin16Sto8Sx16
:
3949 case Iop_QNarrowBin16Uto8Ux16
:
3950 case Iop_QNarrowBin16Sto8Ux16
:
3951 return vectorNarrowBinV128(mce
, op
, vatom1
, vatom2
);
3955 case Iop_CmpLT64Fx2
:
3956 case Iop_CmpLE64Fx2
:
3957 case Iop_CmpEQ64Fx2
:
3958 case Iop_CmpUN64Fx2
:
3959 case Iop_RecipStep64Fx2
:
3960 case Iop_RSqrtStep64Fx2
:
3961 return binary64Fx2(mce
, vatom1
, vatom2
);
3968 case Iop_CmpLT64F0x2
:
3969 case Iop_CmpLE64F0x2
:
3970 case Iop_CmpEQ64F0x2
:
3971 case Iop_CmpUN64F0x2
:
3973 return binary64F0x2(mce
, vatom1
, vatom2
);
3977 case Iop_CmpLT32Fx4
:
3978 case Iop_CmpLE32Fx4
:
3979 case Iop_CmpEQ32Fx4
:
3980 case Iop_CmpUN32Fx4
:
3981 case Iop_CmpGT32Fx4
:
3982 case Iop_CmpGE32Fx4
:
3983 case Iop_RecipStep32Fx4
:
3984 case Iop_RSqrtStep32Fx4
:
3985 return binary32Fx4(mce
, vatom1
, vatom2
);
3991 case Iop_CmpEQ32Fx2
:
3992 case Iop_CmpGT32Fx2
:
3993 case Iop_CmpGE32Fx2
:
3995 case Iop_RecipStep32Fx2
:
3996 case Iop_RSqrtStep32Fx2
:
3997 return binary32Fx2(mce
, vatom1
, vatom2
);
4004 case Iop_CmpLT32F0x4
:
4005 case Iop_CmpLE32F0x4
:
4006 case Iop_CmpEQ32F0x4
:
4007 case Iop_CmpUN32F0x4
:
4009 return binary32F0x4(mce
, vatom1
, vatom2
);
4011 case Iop_QShlNsatSU8x16
:
4012 case Iop_QShlNsatUU8x16
:
4013 case Iop_QShlNsatSS8x16
:
4014 complainIfUndefined(mce
, atom2
, NULL
);
4015 return mkPCast8x16(mce
, vatom1
);
4017 case Iop_QShlNsatSU16x8
:
4018 case Iop_QShlNsatUU16x8
:
4019 case Iop_QShlNsatSS16x8
:
4020 complainIfUndefined(mce
, atom2
, NULL
);
4021 return mkPCast16x8(mce
, vatom1
);
4023 case Iop_QShlNsatSU32x4
:
4024 case Iop_QShlNsatUU32x4
:
4025 case Iop_QShlNsatSS32x4
:
4026 complainIfUndefined(mce
, atom2
, NULL
);
4027 return mkPCast32x4(mce
, vatom1
);
4029 case Iop_QShlNsatSU64x2
:
4030 case Iop_QShlNsatUU64x2
:
4031 case Iop_QShlNsatSS64x2
:
4032 complainIfUndefined(mce
, atom2
, NULL
);
4033 return mkPCast32x4(mce
, vatom1
);
4035 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4036 To make this simpler, do the following:
4037 * complain if the shift amount (the I8) is undefined
4038 * pcast each lane at the wide width
4039 * truncate each lane to half width
4040 * pcast the resulting 64-bit value to a single bit and use
4041 that as the least significant bit of the upper half of the
4043 case Iop_QandQShrNnarrow64Uto32Ux2
:
4044 case Iop_QandQSarNnarrow64Sto32Sx2
:
4045 case Iop_QandQSarNnarrow64Sto32Ux2
:
4046 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4047 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4048 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4049 case Iop_QandQShrNnarrow32Uto16Ux4
:
4050 case Iop_QandQSarNnarrow32Sto16Sx4
:
4051 case Iop_QandQSarNnarrow32Sto16Ux4
:
4052 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4053 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4054 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4055 case Iop_QandQShrNnarrow16Uto8Ux8
:
4056 case Iop_QandQSarNnarrow16Sto8Sx8
:
4057 case Iop_QandQSarNnarrow16Sto8Ux8
:
4058 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4059 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4060 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4062 IRAtom
* (*fnPessim
) (MCEnv
*, IRAtom
*) = NULL
;
4063 IROp opNarrow
= Iop_INVALID
;
4065 case Iop_QandQShrNnarrow64Uto32Ux2
:
4066 case Iop_QandQSarNnarrow64Sto32Sx2
:
4067 case Iop_QandQSarNnarrow64Sto32Ux2
:
4068 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4069 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4070 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4071 fnPessim
= mkPCast64x2
;
4072 opNarrow
= Iop_NarrowUn64to32x2
;
4074 case Iop_QandQShrNnarrow32Uto16Ux4
:
4075 case Iop_QandQSarNnarrow32Sto16Sx4
:
4076 case Iop_QandQSarNnarrow32Sto16Ux4
:
4077 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4078 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4079 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4080 fnPessim
= mkPCast32x4
;
4081 opNarrow
= Iop_NarrowUn32to16x4
;
4083 case Iop_QandQShrNnarrow16Uto8Ux8
:
4084 case Iop_QandQSarNnarrow16Sto8Sx8
:
4085 case Iop_QandQSarNnarrow16Sto8Ux8
:
4086 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4087 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4088 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4089 fnPessim
= mkPCast16x8
;
4090 opNarrow
= Iop_NarrowUn16to8x8
;
4095 complainIfUndefined(mce
, atom2
, NULL
);
4096 // Pessimised shift result
4098 = fnPessim(mce
, vatom1
);
4099 // Narrowed, pessimised shift result
4101 = assignNew('V', mce
, Ity_I64
, unop(opNarrow
, shV
));
4102 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4103 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shVnarrowed
, Ity_I64
);
4104 // and assemble the result
4105 return assignNew('V', mce
, Ity_V128
,
4106 binop(Iop_64HLtoV128
, qV
, shVnarrowed
));
4111 case Iop_QDMull32Sx2
:
4112 return vectorWidenI64(mce
, Iop_Widen32Sto64x2
,
4113 mkUifU64(mce
, vatom1
, vatom2
));
4117 case Iop_QDMull16Sx4
:
4118 return vectorWidenI64(mce
, Iop_Widen16Sto32x4
,
4119 mkUifU64(mce
, vatom1
, vatom2
));
4123 case Iop_PolynomialMull8x8
:
4124 return vectorWidenI64(mce
, Iop_Widen8Sto16x8
,
4125 mkUifU64(mce
, vatom1
, vatom2
));
4128 return mkPCast32x4(mce
,
4129 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast32x4(mce
, vatom1
),
4130 mkPCast32x4(mce
, vatom2
))));
4133 return mkPCast16x8(mce
,
4134 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast16x8(mce
, vatom1
),
4135 mkPCast16x8(mce
, vatom2
))));
4138 return mkPCast8x16(mce
,
4139 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast8x16(mce
, vatom1
),
4140 mkPCast8x16(mce
, vatom2
))));
4142 /* V128-bit data-steering */
4143 case Iop_SetV128lo32
:
4144 case Iop_SetV128lo64
:
4145 case Iop_64HLtoV128
:
4146 case Iop_InterleaveLO64x2
:
4147 case Iop_InterleaveLO32x4
:
4148 case Iop_InterleaveLO16x8
:
4149 case Iop_InterleaveLO8x16
:
4150 case Iop_InterleaveHI64x2
:
4151 case Iop_InterleaveHI32x4
:
4152 case Iop_InterleaveHI16x8
:
4153 case Iop_InterleaveHI8x16
:
4154 case Iop_CatOddLanes8x16
:
4155 case Iop_CatOddLanes16x8
:
4156 case Iop_CatOddLanes32x4
:
4157 case Iop_CatEvenLanes8x16
:
4158 case Iop_CatEvenLanes16x8
:
4159 case Iop_CatEvenLanes32x4
:
4160 case Iop_InterleaveOddLanes8x16
:
4161 case Iop_InterleaveOddLanes16x8
:
4162 case Iop_InterleaveOddLanes32x4
:
4163 case Iop_InterleaveEvenLanes8x16
:
4164 case Iop_InterleaveEvenLanes16x8
:
4165 case Iop_InterleaveEvenLanes32x4
:
4166 case Iop_PackOddLanes8x16
:
4167 case Iop_PackOddLanes16x8
:
4168 case Iop_PackOddLanes32x4
:
4169 case Iop_PackEvenLanes8x16
:
4170 case Iop_PackEvenLanes16x8
:
4171 case Iop_PackEvenLanes32x4
:
4172 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, vatom2
));
4174 case Iop_GetElem8x16
:
4175 complainIfUndefined(mce
, atom2
, NULL
);
4176 return assignNew('V', mce
, Ity_I8
, binop(op
, vatom1
, atom2
));
4177 case Iop_GetElem16x8
:
4178 complainIfUndefined(mce
, atom2
, NULL
);
4179 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, atom2
));
4180 case Iop_GetElem32x4
:
4181 complainIfUndefined(mce
, atom2
, NULL
);
4182 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, atom2
));
4183 case Iop_GetElem64x2
:
4184 complainIfUndefined(mce
, atom2
, NULL
);
4185 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
));
4187 /* Perm8x16: rearrange values in left arg using steering values
4188 from right arg. So rearrange the vbits in the same way but
4189 pessimise wrt steering values. Perm32x4 ditto. */
4190 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4192 case Iop_PermOrZero8x16
:
4195 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4196 mkPCast8x16(mce
, vatom2
)
4201 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4202 mkPCast32x4(mce
, vatom2
)
4205 /* These two take the lower half of each 16-bit lane, sign/zero
4206 extend it to 32, and multiply together, producing a 32x4
4207 result (and implicitly ignoring half the operand bits). So
4208 treat it as a bunch of independent 16x8 operations, but then
4209 do 32-bit shifts left-right to copy the lower half results
4210 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4211 into the upper half of each result lane. */
4212 case Iop_MullEven16Ux8
:
4213 case Iop_MullEven16Sx8
: {
4215 at
= binary16Ix8(mce
,vatom1
,vatom2
);
4216 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN32x4
, at
, mkU8(16)));
4217 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN32x4
, at
, mkU8(16)));
4221 /* Same deal as Iop_MullEven16{S,U}x8 */
4222 case Iop_MullEven8Ux16
:
4223 case Iop_MullEven8Sx16
: {
4225 at
= binary8Ix16(mce
,vatom1
,vatom2
);
4226 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN16x8
, at
, mkU8(8)));
4227 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN16x8
, at
, mkU8(8)));
4231 /* Same deal as Iop_MullEven16{S,U}x8 */
4232 case Iop_MullEven32Ux4
:
4233 case Iop_MullEven32Sx4
: {
4235 at
= binary32Ix4(mce
,vatom1
,vatom2
);
4236 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN64x2
, at
, mkU8(32)));
4237 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN64x2
, at
, mkU8(32)));
4241 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4242 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4243 Simply apply same op to the V bits, since this really no more
4244 than a data steering operation. */
4245 case Iop_NarrowBin32to16x8
:
4246 case Iop_NarrowBin16to8x16
:
4247 case Iop_NarrowBin64to32x4
:
4248 return assignNew('V', mce
, Ity_V128
,
4249 binop(op
, vatom1
, vatom2
));
4254 case Iop_I128StoBCD128
:
4255 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4256 this is wrong now, scalar shifts are done properly lazily.
4257 Vector shifts should be fixed too. */
4258 complainIfUndefined(mce
, atom2
, NULL
);
4259 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4263 return mkLazy2(mce
, Ity_V128
, vatom1
, vatom2
);
4268 complainIfUndefined(mce
, atom2
, NULL
);
4269 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4271 /* I128-bit data-steering */
4273 return assignNew('V', mce
, Ity_I128
, binop(op
, vatom1
, vatom2
));
4279 return binary64Fx4(mce
, vatom1
, vatom2
);
4283 return binary32Fx8(mce
, vatom1
, vatom2
);
4285 /* V256-bit data-steering */
4286 case Iop_V128HLtoV256
:
4287 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, vatom2
));
4289 /* Scalar floating point */
4293 /* I32(rm) x F32 -> I64 */
4294 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4297 /* I32(rm) x I64 -> F32 */
4298 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4300 case Iop_RoundF64toInt
:
4301 case Iop_RoundF64toF32
:
4311 case Iop_RecpExpF64
:
4312 /* I32(rm) x I64/F64 -> I64/F64 */
4313 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4317 case Iop_RoundD64toInt
:
4318 /* I32(rm) x D64 -> D64 */
4319 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4323 case Iop_RoundD128toInt
:
4324 /* I32(rm) x D128 -> D128 */
4325 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4327 case Iop_RoundF128toInt
:
4328 /* I32(rm) x F128 -> F128 */
4329 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4335 /* I32(rm) x I64/D64 -> D64/I64 */
4336 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4344 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4345 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4353 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4354 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4358 case Iop_F128toD128
:
4361 case Iop_D128toF128
:
4362 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4363 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4365 case Iop_RoundF32toInt
:
4367 case Iop_RecpExpF32
:
4368 /* I32(rm) x I32/F32 -> I32/F32 */
4369 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4372 /* I32(rm) x F128 -> F128 */
4373 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4379 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4380 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4384 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4385 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4387 case Iop_F128toI32S
: /* IRRoundingMode(I32) x F128 -> signed I32 */
4388 case Iop_F128toI32U
: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4389 case Iop_F128toF32
: /* IRRoundingMode(I32) x F128 -> F32 */
4390 case Iop_D128toI32S
: /* IRRoundingMode(I32) x D128 -> signed I32 */
4391 case Iop_D128toI32U
: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4392 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4394 case Iop_F128toI128S
: /* IRRoundingMode(I32) x F128 -> signed I128 */
4395 case Iop_RndF128
: /* IRRoundingMode(I32) x F128 -> F128 */
4396 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4398 case Iop_F128toI64S
: /* IRRoundingMode(I32) x F128 -> signed I64 */
4399 case Iop_F128toI64U
: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4400 case Iop_F128toF64
: /* IRRoundingMode(I32) x F128 -> F64 */
4401 case Iop_D128toD64
: /* IRRoundingMode(I64) x D128 -> D64 */
4402 case Iop_D128toI64S
: /* IRRoundingMode(I64) x D128 -> signed I64 */
4403 case Iop_D128toI64U
: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4404 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4406 case Iop_F64HLtoF128
:
4407 case Iop_D64HLtoD128
:
4408 return assignNew('V', mce
, Ity_I128
,
4409 binop(Iop_64HLto128
, vatom1
, vatom2
));
4417 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4418 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4421 /* First arg is I32 (rounding mode), second is D64 (data). */
4422 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4425 /* First arg is I32 (rounding mode), second is F64 (data). */
4426 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4428 case Iop_InsertExpD64
:
4429 /* I64 x I64 -> D64 */
4430 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4432 case Iop_InsertExpD128
:
4433 /* I64 x I128 -> D128 */
4434 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4442 case Iop_CmpExpD128
:
4443 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4447 /* F32 x F32 -> F32 */
4448 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4452 /* F64 x F64 -> F64 */
4453 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4455 /* non-FP after here */
4457 case Iop_DivModU64to32
:
4458 case Iop_DivModS64to32
:
4459 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4461 case Iop_DivModU128to64
:
4462 case Iop_DivModS128to64
:
4463 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4466 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, vatom2
));
4468 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, vatom2
));
4470 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, vatom2
));
4472 case Iop_DivModU64to64
:
4473 case Iop_DivModS64to64
: {
4474 IRAtom
* vTmp64
= mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4475 return assignNew('V', mce
, Ity_I128
,
4476 binop(Iop_64HLto128
, vTmp64
, vTmp64
));
4481 IRAtom
* vLo64
= mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4482 IRAtom
* vHi64
= mkPCastTo(mce
, Ity_I64
, vLo64
);
4483 return assignNew('V', mce
, Ity_I128
,
4484 binop(Iop_64HLto128
, vHi64
, vLo64
));
4487 case Iop_DivModU32to32
:
4488 case Iop_DivModS32to32
: {
4489 IRAtom
* vTmp32
= mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4490 return assignNew('V', mce
, Ity_I64
,
4491 binop(Iop_32HLto64
, vTmp32
, vTmp32
));
4496 IRAtom
* vLo32
= mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4497 IRAtom
* vHi32
= mkPCastTo(mce
, Ity_I32
, vLo32
);
4498 return assignNew('V', mce
, Ity_I64
,
4499 binop(Iop_32HLto64
, vHi32
, vLo32
));
4504 IRAtom
* vLo16
= mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4505 IRAtom
* vHi16
= mkPCastTo(mce
, Ity_I16
, vLo16
);
4506 return assignNew('V', mce
, Ity_I32
,
4507 binop(Iop_16HLto32
, vHi16
, vLo16
));
4512 IRAtom
* vLo8
= mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4513 IRAtom
* vHi8
= mkPCastTo(mce
, Ity_I8
, vLo8
);
4514 return assignNew('V', mce
, Ity_I16
, binop(Iop_8HLto16
, vHi8
, vLo8
));
4517 case Iop_Sad8Ux4
: /* maybe we could do better? ftm, do mkLazy2. */
4522 case Iop_QAdd32S
: /* could probably do better */
4523 case Iop_QSub32S
: /* could probably do better */
4524 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4530 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4533 if (mce
->dlbo
.dl_Add32
== DLexpensive
4534 || (mce
->dlbo
.dl_Add32
== DLauto
&& hu
== HuOth
)) {
4535 return expensiveAddSub(mce
,True
,Ity_I32
,
4536 vatom1
,vatom2
, atom1
,atom2
);
4538 goto cheap_AddSub32
;
4541 if (mce
->dlbo
.dl_Sub32
== DLexpensive
4542 || (mce
->dlbo
.dl_Sub32
== DLauto
&& hu
== HuOth
)) {
4543 return expensiveAddSub(mce
,False
,Ity_I32
,
4544 vatom1
,vatom2
, atom1
,atom2
);
4546 goto cheap_AddSub32
;
4551 return mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4557 return doCmpORD(mce
, op
, vatom1
,vatom2
, atom1
,atom2
);
4560 if (mce
->dlbo
.dl_Add64
== DLexpensive
4561 || (mce
->dlbo
.dl_Add64
== DLauto
&& hu
== HuOth
)) {
4562 return expensiveAddSub(mce
,True
,Ity_I64
,
4563 vatom1
,vatom2
, atom1
,atom2
);
4565 goto cheap_AddSub64
;
4568 if (mce
->dlbo
.dl_Sub64
== DLexpensive
4569 || (mce
->dlbo
.dl_Sub64
== DLauto
&& hu
== HuOth
)) {
4570 return expensiveAddSub(mce
,False
,Ity_I64
,
4571 vatom1
,vatom2
, atom1
,atom2
);
4573 goto cheap_AddSub64
;
4578 return mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4583 return mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4588 return mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4591 case Iop_CmpEQ64
: case Iop_CmpNE64
:
4592 if (mce
->dlbo
.dl_CmpEQ64_CmpNE64
== DLexpensive
)
4593 goto expensive_cmp64
;
4598 case Iop_ExpCmpNE64
:
4599 return expensiveCmpEQorNE(mce
,Ity_I64
, vatom1
,vatom2
, atom1
,atom2
);
4602 case Iop_CmpLE64S
: case Iop_CmpLE64U
:
4603 case Iop_CmpLT64U
: case Iop_CmpLT64S
:
4604 return mkPCastTo(mce
, Ity_I1
, mkUifU64(mce
, vatom1
,vatom2
));
4607 case Iop_CmpEQ32
: case Iop_CmpNE32
:
4608 if (mce
->dlbo
.dl_CmpEQ32_CmpNE32
== DLexpensive
)
4609 goto expensive_cmp32
;
4614 case Iop_ExpCmpNE32
:
4615 return expensiveCmpEQorNE(mce
,Ity_I32
, vatom1
,vatom2
, atom1
,atom2
);
4618 case Iop_CmpLE32S
: case Iop_CmpLE32U
:
4619 case Iop_CmpLT32U
: case Iop_CmpLT32S
:
4620 return mkPCastTo(mce
, Ity_I1
, mkUifU32(mce
, vatom1
,vatom2
));
4623 case Iop_CmpEQ16
: case Iop_CmpNE16
:
4624 if (mce
->dlbo
.dl_CmpEQ16_CmpNE16
== DLexpensive
)
4625 goto expensive_cmp16
;
4630 case Iop_ExpCmpNE16
:
4631 return expensiveCmpEQorNE(mce
,Ity_I16
, vatom1
,vatom2
, atom1
,atom2
);
4634 return mkPCastTo(mce
, Ity_I1
, mkUifU16(mce
, vatom1
,vatom2
));
4637 case Iop_CmpEQ8
: case Iop_CmpNE8
:
4638 if (mce
->dlbo
.dl_CmpEQ8_CmpNE8
== DLexpensive
)
4639 goto expensive_cmp8
;
4644 return expensiveCmpEQorNE(mce
,Ity_I8
, vatom1
,vatom2
, atom1
,atom2
);
4647 return mkPCastTo(mce
, Ity_I1
, mkUifU8(mce
, vatom1
,vatom2
));
4649 ////---- end CmpXX{64,32,16,8}
4651 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
4652 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
4653 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
4654 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
4655 /* Just say these all produce a defined result, regardless
4656 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4657 return assignNew('V', mce
, Ity_I1
, definedOfType(Ity_I1
));
4659 case Iop_Shl64
: case Iop_Shr64
: case Iop_Sar64
:
4660 return scalarShift( mce
, Ity_I64
, op
, vatom1
,vatom2
, atom1
,atom2
);
4662 case Iop_Shl32
: case Iop_Shr32
: case Iop_Sar32
:
4663 return scalarShift( mce
, Ity_I32
, op
, vatom1
,vatom2
, atom1
,atom2
);
4665 case Iop_Shl16
: case Iop_Shr16
: case Iop_Sar16
:
4666 return scalarShift( mce
, Ity_I16
, op
, vatom1
,vatom2
, atom1
,atom2
);
4668 case Iop_Shl8
: case Iop_Shr8
: case Iop_Sar8
:
4669 return scalarShift( mce
, Ity_I8
, op
, vatom1
,vatom2
, atom1
,atom2
);
4672 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4673 and_or_ty
= Ity_V256
; improve
= mkImproveANDV256
; goto do_And_Or
;
4675 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4676 and_or_ty
= Ity_V128
; improve
= mkImproveANDV128
; goto do_And_Or
;
4678 uifu
= mkUifU64
; difd
= mkDifD64
;
4679 and_or_ty
= Ity_I64
; improve
= mkImproveAND64
; goto do_And_Or
;
4681 uifu
= mkUifU32
; difd
= mkDifD32
;
4682 and_or_ty
= Ity_I32
; improve
= mkImproveAND32
; goto do_And_Or
;
4684 uifu
= mkUifU16
; difd
= mkDifD16
;
4685 and_or_ty
= Ity_I16
; improve
= mkImproveAND16
; goto do_And_Or
;
4687 uifu
= mkUifU8
; difd
= mkDifD8
;
4688 and_or_ty
= Ity_I8
; improve
= mkImproveAND8
; goto do_And_Or
;
4690 uifu
= mkUifU1
; difd
= mkDifD1
;
4691 and_or_ty
= Ity_I1
; improve
= mkImproveAND1
; goto do_And_Or
;
4694 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4695 and_or_ty
= Ity_V256
; improve
= mkImproveORV256
; goto do_And_Or
;
4697 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4698 and_or_ty
= Ity_V128
; improve
= mkImproveORV128
; goto do_And_Or
;
4700 uifu
= mkUifU64
; difd
= mkDifD64
;
4701 and_or_ty
= Ity_I64
; improve
= mkImproveOR64
; goto do_And_Or
;
4703 uifu
= mkUifU32
; difd
= mkDifD32
;
4704 and_or_ty
= Ity_I32
; improve
= mkImproveOR32
; goto do_And_Or
;
4706 uifu
= mkUifU16
; difd
= mkDifD16
;
4707 and_or_ty
= Ity_I16
; improve
= mkImproveOR16
; goto do_And_Or
;
4709 uifu
= mkUifU8
; difd
= mkDifD8
;
4710 and_or_ty
= Ity_I8
; improve
= mkImproveOR8
; goto do_And_Or
;
4712 uifu
= mkUifU1
; difd
= mkDifD1
;
4713 and_or_ty
= Ity_I1
; improve
= mkImproveOR1
; goto do_And_Or
;
4720 difd(mce
, uifu(mce
, vatom1
, vatom2
),
4721 difd(mce
, improve(mce
, atom1
, vatom1
),
4722 improve(mce
, atom2
, vatom2
) ) ) );
4725 return mkUifU8(mce
, vatom1
, vatom2
);
4727 return mkUifU16(mce
, vatom1
, vatom2
);
4729 return mkUifU32(mce
, vatom1
, vatom2
);
4731 return mkUifU64(mce
, vatom1
, vatom2
);
4733 return mkUifUV128(mce
, vatom1
, vatom2
);
4735 return mkUifUV256(mce
, vatom1
, vatom2
);
4747 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4748 this is wrong now, scalar shifts are done properly lazily.
4749 Vector shifts should be fixed too. */
4750 complainIfUndefined(mce
, atom2
, NULL
);
4751 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
));
4760 case Iop_CmpGT8Sx32
:
4766 return binary8Ix32(mce
, vatom1
, vatom2
);
4768 case Iop_QSub16Ux16
:
4769 case Iop_QSub16Sx16
:
4772 case Iop_MulHi16Sx16
:
4773 case Iop_MulHi16Ux16
:
4778 case Iop_CmpGT16Sx16
:
4779 case Iop_CmpEQ16x16
:
4781 case Iop_QAdd16Ux16
:
4782 case Iop_QAdd16Sx16
:
4784 return binary16Ix16(mce
, vatom1
, vatom2
);
4787 case Iop_CmpGT32Sx8
:
4795 return binary32Ix8(mce
, vatom1
, vatom2
);
4800 case Iop_CmpGT64Sx4
:
4801 return binary64Ix4(mce
, vatom1
, vatom2
);
4803 case Iop_I32StoF32x8
:
4804 case Iop_F32toI32Sx8
:
4805 return unary32Fx8_w_rm(mce
, vatom1
, vatom2
);
4807 /* Perm32x8: rearrange values in left arg using steering values
4808 from right arg. So rearrange the vbits in the same way but
4809 pessimise wrt steering values. */
4813 assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
)),
4814 mkPCast32x8(mce
, vatom2
)
4817 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4818 Handle the shifted results in the same way that other
4819 binary Q ops are handled, eg QSub: UifU the two args,
4820 then pessimise -- which is binaryNIxM. But for the upper
4821 V128, we require to generate just 1 bit which is the
4822 pessimised shift result, with 127 defined zeroes above it.
4824 Note that this overly pessimistic in that in fact only the
4825 bottom 8 bits of each lane of the second arg determine the shift
4826 amount. Really we ought to ignore any undefinedness in the
4827 rest of the lanes of the second arg. */
4828 case Iop_QandSQsh64x2
: case Iop_QandUQsh64x2
:
4829 case Iop_QandSQRsh64x2
: case Iop_QandUQRsh64x2
:
4830 case Iop_QandSQsh32x4
: case Iop_QandUQsh32x4
:
4831 case Iop_QandSQRsh32x4
: case Iop_QandUQRsh32x4
:
4832 case Iop_QandSQsh16x8
: case Iop_QandUQsh16x8
:
4833 case Iop_QandSQRsh16x8
: case Iop_QandUQRsh16x8
:
4834 case Iop_QandSQsh8x16
: case Iop_QandUQsh8x16
:
4835 case Iop_QandSQRsh8x16
: case Iop_QandUQRsh8x16
:
4837 // The function to generate the pessimised shift result
4838 IRAtom
* (*binaryNIxM
)(MCEnv
*,IRAtom
*,IRAtom
*) = NULL
;
4840 case Iop_QandSQsh64x2
:
4841 case Iop_QandUQsh64x2
:
4842 case Iop_QandSQRsh64x2
:
4843 case Iop_QandUQRsh64x2
:
4844 binaryNIxM
= binary64Ix2
;
4846 case Iop_QandSQsh32x4
:
4847 case Iop_QandUQsh32x4
:
4848 case Iop_QandSQRsh32x4
:
4849 case Iop_QandUQRsh32x4
:
4850 binaryNIxM
= binary32Ix4
;
4852 case Iop_QandSQsh16x8
:
4853 case Iop_QandUQsh16x8
:
4854 case Iop_QandSQRsh16x8
:
4855 case Iop_QandUQRsh16x8
:
4856 binaryNIxM
= binary16Ix8
;
4858 case Iop_QandSQsh8x16
:
4859 case Iop_QandUQsh8x16
:
4860 case Iop_QandSQRsh8x16
:
4861 case Iop_QandUQRsh8x16
:
4862 binaryNIxM
= binary8Ix16
;
4867 tl_assert(binaryNIxM
);
4868 // Pessimised shift result, shV[127:0]
4869 IRAtom
* shV
= binaryNIxM(mce
, vatom1
, vatom2
);
4870 // Generates: Def--(127)--Def PCast-to-I1(shV)
4871 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shV
, Ity_V128
);
4872 // and assemble the result
4873 return assignNew('V', mce
, Ity_V256
,
4874 binop(Iop_V128HLtoV256
, qV
, shV
));
4877 case Iop_F32toF16x4
: {
4878 // First, PCast the input vector, retaining the 32x4 format.
4879 IRAtom
* pcasted
= mkPCast32x4(mce
, vatom2
); // :: 32x4
4880 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
4881 // the input, we're not going to lose any information.
4883 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, pcasted
));//32x2
4885 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, pcasted
)); // 32x2
4887 = assignNew('V', mce
, Ity_I64
, binop(Iop_NarrowBin32to16x4
,
4888 pcHI64
, pcLO64
)); // 16x4
4889 // Finally, roll in any badness from the rounding mode.
4890 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_I64
, vatom1
);
4891 return mkUifU64(mce
, narrowed
, rmPCasted
);
4894 case Iop_F32toF16x8
: {
4895 // Same scheme as for Iop_F32toF16x4.
4896 IRAtom
* pcasted
= mkPCast32x8(mce
, vatom2
); // :: 32x8
4898 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_1
,
4901 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_0
,
4904 = assignNew('V', mce
, Ity_V128
, binop(Iop_NarrowBin32to16x8
,
4905 pcHI128
, pcLO128
)); // 16x8
4906 // Finally, roll in any badness from the rounding mode.
4907 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_V128
, vatom1
);
4908 return mkUifUV128(mce
, narrowed
, rmPCasted
);
4913 VG_(tool_panic
)("memcheck:expr2vbits_Binop");
4919 IRExpr
* expr2vbits_Unop ( MCEnv
* mce
, IROp op
, IRAtom
* atom
)
4921 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4922 selection of shadow operation implicitly duplicates the logic in
4923 do_shadow_LoadG and should be kept in sync (in the very unlikely
4924 event that the interpretation of such widening ops changes in
4925 future). See comment in do_shadow_LoadG. */
4926 IRAtom
* vatom
= expr2vbits( mce
, atom
, HuOth
);
4927 tl_assert(isOriginalAtom(mce
,atom
));
4932 case Iop_RSqrtEst64Fx2
:
4933 case Iop_RecipEst64Fx2
:
4934 case Iop_Log2_64Fx2
:
4935 return unary64Fx2(mce
, vatom
);
4937 case Iop_Sqrt64F0x2
:
4938 return unary64F0x2(mce
, vatom
);
4941 case Iop_RSqrtEst32Fx8
:
4942 case Iop_RecipEst32Fx8
:
4943 return unary32Fx8(mce
, vatom
);
4946 return unary64Fx4(mce
, vatom
);
4948 case Iop_RecipEst32Fx4
:
4949 case Iop_I32UtoF32x4_DEP
:
4950 case Iop_I32StoF32x4_DEP
:
4951 case Iop_QF32toI32Ux4_RZ
:
4952 case Iop_QF32toI32Sx4_RZ
:
4953 case Iop_RoundF32x4_RM
:
4954 case Iop_RoundF32x4_RP
:
4955 case Iop_RoundF32x4_RN
:
4956 case Iop_RoundF32x4_RZ
:
4957 case Iop_RecipEst32Ux4
:
4960 case Iop_RSqrtEst32Fx4
:
4961 case Iop_Log2_32Fx4
:
4962 case Iop_Exp2_32Fx4
:
4963 return unary32Fx4(mce
, vatom
);
4965 case Iop_I32UtoF32x2_DEP
:
4966 case Iop_I32StoF32x2_DEP
:
4967 case Iop_RecipEst32Fx2
:
4968 case Iop_RecipEst32Ux2
:
4971 case Iop_RSqrtEst32Fx2
:
4972 return unary32Fx2(mce
, vatom
);
4974 case Iop_Sqrt32F0x4
:
4975 case Iop_RSqrtEst32F0x4
:
4976 case Iop_RecipEst32F0x4
:
4977 return unary32F0x4(mce
, vatom
);
4979 // These are self-shadowing.
4985 case Iop_Reverse1sIn8_x16
:
4986 case Iop_Reverse8sIn16_x8
:
4987 case Iop_Reverse8sIn32_x4
:
4988 case Iop_Reverse16sIn32_x4
:
4989 case Iop_Reverse8sIn64_x2
:
4990 case Iop_Reverse16sIn64_x2
:
4991 case Iop_Reverse32sIn64_x2
:
4992 case Iop_V256toV128_1
: case Iop_V256toV128_0
:
4993 case Iop_ZeroHI64ofV128
:
4994 case Iop_ZeroHI96ofV128
:
4995 case Iop_ZeroHI112ofV128
:
4996 case Iop_ZeroHI120ofV128
:
4997 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
4999 case Iop_F128HItoF64
: /* F128 -> high half of F128 */
5000 case Iop_D128HItoD64
: /* D128 -> high half of D128 */
5001 return assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vatom
));
5002 case Iop_F128LOtoF64
: /* F128 -> low half of F128 */
5003 case Iop_D128LOtoD64
: /* D128 -> low half of D128 */
5004 return assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vatom
));
5009 case Iop_TruncF128toI64S
: /* F128 -> I64S */
5010 case Iop_TruncF128toI32S
: /* F128 -> I32S (result stored in 64-bits) */
5011 case Iop_TruncF128toI64U
: /* F128 -> I64U */
5012 case Iop_TruncF128toI32U
: /* F128 -> I32U (result stored in 64-bits) */
5013 return mkPCastTo(mce
, Ity_I128
, vatom
);
5015 case Iop_BCD128toI128S
:
5016 case Iop_MulI128by10
:
5017 case Iop_MulI128by10Carry
:
5018 case Iop_F16toF64x2
:
5019 case Iop_F64toF16x2_DEP
:
5020 // FIXME JRS 2018-Nov-15. This is surely not correct!
5023 case Iop_I32StoF128
: /* signed I32 -> F128 */
5024 case Iop_I64StoF128
: /* signed I64 -> F128 */
5025 case Iop_I32UtoF128
: /* unsigned I32 -> F128 */
5026 case Iop_I64UtoF128
: /* unsigned I64 -> F128 */
5027 case Iop_F32toF128
: /* F32 -> F128 */
5028 case Iop_F64toF128
: /* F64 -> F128 */
5029 case Iop_I32StoD128
: /* signed I64 -> D128 */
5030 case Iop_I64StoD128
: /* signed I64 -> D128 */
5031 case Iop_I32UtoD128
: /* unsigned I32 -> D128 */
5032 case Iop_I64UtoD128
: /* unsigned I64 -> D128 */
5033 return mkPCastTo(mce
, Ity_I128
, vatom
);
5041 case Iop_RSqrtEst5GoodF64
:
5042 case Iop_RoundF64toF64_NEAREST
:
5043 case Iop_RoundF64toF64_NegINF
:
5044 case Iop_RoundF64toF64_PosINF
:
5045 case Iop_RoundF64toF64_ZERO
:
5049 case Iop_ExtractExpD64
: /* D64 -> I64 */
5050 case Iop_ExtractExpD128
: /* D128 -> I64 */
5051 case Iop_ExtractSigD64
: /* D64 -> I64 */
5052 case Iop_ExtractSigD128
: /* D128 -> I64 */
5055 return mkPCastTo(mce
, Ity_I64
, vatom
);
5058 return mkPCastTo(mce
, Ity_I128
, vatom
);
5060 case Iop_TruncF64asF32
:
5064 return mkPCastTo(mce
, Ity_I32
, vatom
);
5066 case Iop_Ctz32
: case Iop_CtzNat32
:
5067 case Iop_Ctz64
: case Iop_CtzNat64
:
5068 return expensiveCountTrailingZeroes(mce
, op
, atom
, vatom
);
5070 case Iop_Clz32
: case Iop_ClzNat32
:
5071 case Iop_Clz64
: case Iop_ClzNat64
:
5072 return expensiveCountLeadingZeroes(mce
, op
, atom
, vatom
);
5074 // PopCount32: this is slightly pessimistic. It is true that the
5075 // result depends on all input bits, so that aspect of the PCast is
5076 // correct. However, regardless of the input, only the lowest 5 bits
5077 // out of the output can ever be undefined. So we could actually
5078 // "improve" the results here by marking the top 27 bits of output as
5079 // defined. A similar comment applies for PopCount64.
5080 case Iop_PopCount32
:
5081 return mkPCastTo(mce
, Ity_I32
, vatom
);
5082 case Iop_PopCount64
:
5083 return mkPCastTo(mce
, Ity_I64
, vatom
);
5085 // These are self-shadowing.
5095 case Iop_V128HIto64
:
5101 case Iop_Reverse8sIn16_x4
:
5102 case Iop_Reverse8sIn32_x2
:
5103 case Iop_Reverse16sIn32_x2
:
5104 case Iop_Reverse8sIn64_x1
:
5105 case Iop_Reverse16sIn64_x1
:
5106 case Iop_Reverse32sIn64_x1
:
5107 case Iop_V256to64_0
: case Iop_V256to64_1
:
5108 case Iop_V256to64_2
: case Iop_V256to64_3
:
5109 return assignNew('V', mce
, Ity_I64
, unop(op
, vatom
));
5111 // These are self-shadowing.
5121 case Iop_Reverse8sIn32_x1
:
5122 return assignNew('V', mce
, Ity_I32
, unop(op
, vatom
));
5124 // These are self-shadowing.
5130 case Iop_GetMSBs8x16
:
5131 return assignNew('V', mce
, Ity_I16
, unop(op
, vatom
));
5133 // These are self-shadowing.
5140 case Iop_GetMSBs8x8
:
5141 return assignNew('V', mce
, Ity_I8
, unop(op
, vatom
));
5144 return assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, vatom
));
5147 return assignNew('V', mce
, Ity_I1
, unop(Iop_64to1
, vatom
));
5149 case Iop_ReinterpF64asI64
:
5150 case Iop_ReinterpI64asF64
:
5151 case Iop_ReinterpI32asF32
:
5152 case Iop_ReinterpF32asI32
:
5153 case Iop_ReinterpI64asD64
:
5154 case Iop_ReinterpD64asI64
:
5162 // FIXME JRS 2018-Nov-15. This is surely not correct!
5170 return mkPCast8x8(mce
, vatom
);
5172 case Iop_CmpNEZ8x16
:
5178 return mkPCast8x16(mce
, vatom
);
5180 case Iop_CmpNEZ16x4
:
5184 return mkPCast16x4(mce
, vatom
);
5186 case Iop_CmpNEZ16x8
:
5191 return mkPCast16x8(mce
, vatom
);
5193 case Iop_CmpNEZ32x2
:
5196 case Iop_F32toI32Ux2_RZ
:
5197 case Iop_F32toI32Sx2_RZ
:
5199 return mkPCast32x2(mce
, vatom
);
5201 case Iop_CmpNEZ32x4
:
5204 case Iop_F32toI32Ux4_RZ
:
5205 case Iop_F32toI32Sx4_RZ
:
5207 case Iop_RSqrtEst32Ux4
:
5209 return mkPCast32x4(mce
, vatom
);
5212 return mkPCastTo(mce
, Ity_I32
, vatom
);
5215 return mkPCastTo(mce
, Ity_I64
, vatom
);
5217 case Iop_CmpNEZ64x2
:
5218 case Iop_CipherSV128
:
5222 return mkPCast64x2(mce
, vatom
);
5224 // This is self-shadowing.
5225 case Iop_PwBitMtxXpose64x2
:
5226 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
5228 case Iop_NarrowUn16to8x8
:
5229 case Iop_NarrowUn32to16x4
:
5230 case Iop_NarrowUn64to32x2
:
5231 case Iop_QNarrowUn16Sto8Sx8
:
5232 case Iop_QNarrowUn16Sto8Ux8
:
5233 case Iop_QNarrowUn16Uto8Ux8
:
5234 case Iop_QNarrowUn32Sto16Sx4
:
5235 case Iop_QNarrowUn32Sto16Ux4
:
5236 case Iop_QNarrowUn32Uto16Ux4
:
5237 case Iop_QNarrowUn64Sto32Sx2
:
5238 case Iop_QNarrowUn64Sto32Ux2
:
5239 case Iop_QNarrowUn64Uto32Ux2
:
5240 return vectorNarrowUnV128(mce
, op
, vatom
);
5242 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5244 case Iop_F32toF16x4_DEP
:
5245 return vectorNarrowUnV128(mce
, op
, vatom
);
5247 case Iop_Widen8Sto16x8
:
5248 case Iop_Widen8Uto16x8
:
5249 case Iop_Widen16Sto32x4
:
5250 case Iop_Widen16Uto32x4
:
5251 case Iop_Widen32Sto64x2
:
5252 case Iop_Widen32Uto64x2
:
5253 return vectorWidenI64(mce
, op
, vatom
);
5255 case Iop_F16toF32x4
:
5256 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5257 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5258 // preserves will generate an output 32 bits with at least one 1 bit
5259 // set if there's one or more 1 bits set in the input 16 bits. More
5260 // correct code for this is just below, but commented out, so as to
5261 // avoid short-term backend failures on targets that can't do
5262 // Iop_Interleave{LO,HI}16x4.
5263 return vectorWidenI64(mce
, op
, vatom
);
5265 case Iop_F16toF32x8
: {
5266 // PCast the input at 16x8. This makes each lane hold either all
5267 // zeroes or all ones.
5268 IRAtom
* pcasted
= mkPCast16x8(mce
, vatom
); // :: I16x8
5269 // Now double the width of each lane to 32 bits. Because the lanes are
5270 // all zeroes or all ones, we can just copy the each lane twice into
5271 // the result. Here's the low half:
5272 IRAtom
* widenedLO
// :: I32x4
5273 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveLO16x8
,
5275 // And the high half:
5276 IRAtom
* widenedHI
// :: I32x4
5277 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveHI16x8
,
5279 // Glue them back together:
5280 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
5281 widenedHI
, widenedLO
));
5284 // See comment just above, for Iop_F16toF32x4
5285 //case Iop_F16toF32x4: {
5286 // // Same scheme as F16toF32x4
5287 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5288 // IRAtom* widenedLO // :: I32x2
5289 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5290 // pcasted, pcasted));
5291 // IRAtom* widenedHI // :: I32x4
5292 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5293 // pcasted, pcasted));
5294 // // Glue them back together:
5295 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5296 // widenedHI, widenedLO));
5299 case Iop_PwAddL32Ux2
:
5300 case Iop_PwAddL32Sx2
:
5301 return mkPCastTo(mce
, Ity_I64
,
5302 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast32x2(mce
, vatom
))));
5304 case Iop_PwAddL16Ux4
:
5305 case Iop_PwAddL16Sx4
:
5306 return mkPCast32x2(mce
,
5307 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast16x4(mce
, vatom
))));
5309 case Iop_PwAddL8Ux8
:
5310 case Iop_PwAddL8Sx8
:
5311 return mkPCast16x4(mce
,
5312 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast8x8(mce
, vatom
))));
5314 case Iop_PwAddL32Ux4
:
5315 case Iop_PwAddL32Sx4
:
5316 return mkPCast64x2(mce
,
5317 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast32x4(mce
, vatom
))));
5319 case Iop_PwAddL64Ux2
:
5320 return mkPCast128x1(mce
,
5321 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast64x2(mce
, vatom
))));
5323 case Iop_PwAddL16Ux8
:
5324 case Iop_PwAddL16Sx8
:
5325 return mkPCast32x4(mce
,
5326 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast16x8(mce
, vatom
))));
5328 case Iop_PwAddL8Ux16
:
5329 case Iop_PwAddL8Sx16
:
5330 return mkPCast16x8(mce
,
5331 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast8x16(mce
, vatom
))));
5336 VG_(tool_panic
)("memcheck:expr2vbits_Unop");
5341 /* Worker function -- do not call directly. See comments on
5342 expr2vbits_Load for the meaning of |guard|.
5344 Generates IR to (1) perform a definedness test of |addr|, (2)
5345 perform a validity test of |addr|, and (3) return the Vbits for the
5346 location indicated by |addr|. All of this only happens when
5347 |guard| is NULL or |guard| evaluates to True at run time.
5349 If |guard| evaluates to False at run time, the returned value is
5350 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5353 The definedness of |guard| itself is not checked. That is assumed
5354 to have been done before this point, by the caller. */
5356 IRAtom
* expr2vbits_Load_WRK ( MCEnv
* mce
,
5357 IREndness end
, IRType ty
,
5358 IRAtom
* addr
, UInt bias
, IRAtom
* guard
)
5360 tl_assert(isOriginalAtom(mce
,addr
));
5361 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5363 /* First, emit a definedness test for the address. This also sets
5364 the address (shadow) to 'defined' following the test. */
5365 complainIfUndefined( mce
, addr
, guard
);
5367 /* Now cook up a call to the relevant helper function, to read the
5368 data V bits from shadow memory. */
5369 ty
= shadowTypeV(ty
);
5371 void* helper
= NULL
;
5372 const HChar
* hname
= NULL
;
5373 Bool ret_via_outparam
= False
;
5375 if (end
== Iend_LE
) {
5377 case Ity_V256
: helper
= &MC_(helperc_LOADV256le
);
5378 hname
= "MC_(helperc_LOADV256le)";
5379 ret_via_outparam
= True
;
5381 case Ity_V128
: helper
= &MC_(helperc_LOADV128le
);
5382 hname
= "MC_(helperc_LOADV128le)";
5383 ret_via_outparam
= True
;
5385 case Ity_I64
: helper
= &MC_(helperc_LOADV64le
);
5386 hname
= "MC_(helperc_LOADV64le)";
5388 case Ity_I32
: helper
= &MC_(helperc_LOADV32le
);
5389 hname
= "MC_(helperc_LOADV32le)";
5391 case Ity_I16
: helper
= &MC_(helperc_LOADV16le
);
5392 hname
= "MC_(helperc_LOADV16le)";
5394 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5395 hname
= "MC_(helperc_LOADV8)";
5397 default: ppIRType(ty
);
5398 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(LE)");
5402 case Ity_V256
: helper
= &MC_(helperc_LOADV256be
);
5403 hname
= "MC_(helperc_LOADV256be)";
5404 ret_via_outparam
= True
;
5406 case Ity_V128
: helper
= &MC_(helperc_LOADV128be
);
5407 hname
= "MC_(helperc_LOADV128be)";
5408 ret_via_outparam
= True
;
5410 case Ity_I64
: helper
= &MC_(helperc_LOADV64be
);
5411 hname
= "MC_(helperc_LOADV64be)";
5413 case Ity_I32
: helper
= &MC_(helperc_LOADV32be
);
5414 hname
= "MC_(helperc_LOADV32be)";
5416 case Ity_I16
: helper
= &MC_(helperc_LOADV16be
);
5417 hname
= "MC_(helperc_LOADV16be)";
5419 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5420 hname
= "MC_(helperc_LOADV8)";
5422 default: ppIRType(ty
);
5423 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(BE)");
5430 /* Generate the actual address into addrAct. */
5437 IRType tyAddr
= mce
->hWordTy
;
5438 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
5439 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
5440 eBias
= tyAddr
==Ity_I32
? mkU32(bias
) : mkU64(bias
);
5441 addrAct
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBias
) );
5444 /* We need to have a place to park the V bits we're just about to
5446 IRTemp datavbits
= newTemp(mce
, ty
, VSh
);
5448 /* Here's the call. */
5450 if (ret_via_outparam
) {
5451 di
= unsafeIRDirty_1_N( datavbits
,
5453 hname
, VG_(fnptr_to_fnentry
)( helper
),
5454 mkIRExprVec_2( IRExpr_VECRET(), addrAct
) );
5456 di
= unsafeIRDirty_1_N( datavbits
,
5458 hname
, VG_(fnptr_to_fnentry
)( helper
),
5459 mkIRExprVec_1( addrAct
) );
5462 setHelperAnns( mce
, di
);
5465 /* Ideally the didn't-happen return value here would be all-ones
5466 (all-undefined), so it'd be obvious if it got used
5467 inadvertently. We can get by with the IR-mandated default
5468 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5469 undefined if it ever leaks out. */
5471 stmt( 'V', mce
, IRStmt_Dirty(di
) );
5473 return mkexpr(datavbits
);
5477 /* Generate IR to do a shadow load. The helper is expected to check
5478 the validity of the address and return the V bits for that address.
5479 This can optionally be controlled by a guard, which is assumed to
5480 be True if NULL. In the case where the guard is False at runtime,
5481 the helper will return the didn't-do-the-call value of 0x55..55.
5482 Since that means "completely undefined result", the caller of
5483 this function will need to fix up the result somehow in that
5486 Caller of this function is also expected to have checked the
5487 definedness of |guard| before this point.
5490 IRAtom
* expr2vbits_Load ( MCEnv
* mce
,
5491 IREndness end
, IRType ty
,
5492 IRAtom
* addr
, UInt bias
,
5495 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5496 switch (shadowTypeV(ty
)) {
5503 return expr2vbits_Load_WRK(mce
, end
, ty
, addr
, bias
, guard
);
5505 VG_(tool_panic
)("expr2vbits_Load");
5510 /* The most general handler for guarded loads. Assumes the
5511 definedness of GUARD has already been checked by the caller. A
5512 GUARD of NULL is assumed to mean "always True". Generates code to
5513 check the definedness and validity of ADDR.
5515 Generate IR to do a shadow load from ADDR and return the V bits.
5516 The loaded type is TY. The loaded data is then (shadow) widened by
5517 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5518 evaluates to False at run time then the returned Vbits are simply
5519 VALT instead. Note therefore that the argument type of VWIDEN must
5520 be TY and the result type of VWIDEN must equal the type of VALT.
5523 IRAtom
* expr2vbits_Load_guarded_General ( MCEnv
* mce
,
5524 IREndness end
, IRType ty
,
5525 IRAtom
* addr
, UInt bias
,
5527 IROp vwiden
, IRAtom
* valt
)
5529 /* Sanity check the conversion operation, and also set TYWIDE. */
5530 IRType tyWide
= Ity_INVALID
;
5535 case Iop_16Uto32
: case Iop_16Sto32
: case Iop_8Uto32
: case Iop_8Sto32
:
5539 VG_(tool_panic
)("memcheck:expr2vbits_Load_guarded_General");
5542 /* If the guard evaluates to True, this will hold the loaded V bits
5543 at TY. If the guard evaluates to False, this will be all
5544 ones, meaning "all undefined", in which case we will have to
5545 replace it using an ITE below. */
5547 = assignNew('V', mce
, ty
,
5548 expr2vbits_Load(mce
, end
, ty
, addr
, bias
, guard
));
5549 /* Now (shadow-) widen the loaded V bits to the desired width. In
5550 the guard-is-False case, the allowable widening operators will
5551 in the worst case (unsigned widening) at least leave the
5552 pre-widened part as being marked all-undefined, and in the best
5553 case (signed widening) mark the whole widened result as
5554 undefined. Anyway, it doesn't matter really, since in this case
5555 we will replace said value with the default value |valt| using an
5558 = vwiden
== Iop_INVALID
5560 : assignNew('V', mce
, tyWide
, unop(vwiden
, iftrue1
));
5561 /* These are the V bits we will return if the load doesn't take
5565 /* Prepare the cond for the ITE. Convert a NULL cond into
5566 something that iropt knows how to fold out later. */
5568 = guard
== NULL
? mkU1(1) : guard
;
5569 /* And assemble the final result. */
5570 return assignNew('V', mce
, tyWide
, IRExpr_ITE(cond
, iftrue2
, iffalse
));
5574 /* A simpler handler for guarded loads, in which there is no
5575 conversion operation, and the default V bit return (when the guard
5576 evaluates to False at runtime) is "all defined". If there is no
5577 guard expression or the guard is always TRUE this function behaves
5578 like expr2vbits_Load. It is assumed that definedness of GUARD has
5579 already been checked at the call site. */
5581 IRAtom
* expr2vbits_Load_guarded_Simple ( MCEnv
* mce
,
5582 IREndness end
, IRType ty
,
5583 IRAtom
* addr
, UInt bias
,
5586 return expr2vbits_Load_guarded_General(
5587 mce
, end
, ty
, addr
, bias
, guard
, Iop_INVALID
, definedOfType(ty
)
5593 IRAtom
* expr2vbits_ITE ( MCEnv
* mce
,
5594 IRAtom
* cond
, IRAtom
* iftrue
, IRAtom
* iffalse
)
5596 IRAtom
*vbitsC
, *vbits0
, *vbits1
;
5598 /* Given ITE(cond, iftrue, iffalse), generate
5599 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5600 That is, steer the V bits like the originals, but trash the
5601 result if the steering value is undefined. This gives
5602 lazy propagation. */
5603 tl_assert(isOriginalAtom(mce
, cond
));
5604 tl_assert(isOriginalAtom(mce
, iftrue
));
5605 tl_assert(isOriginalAtom(mce
, iffalse
));
5607 vbitsC
= expr2vbits(mce
, cond
, HuOth
); // could we use HuPCa here?
5608 vbits1
= expr2vbits(mce
, iftrue
, HuOth
);
5609 vbits0
= expr2vbits(mce
, iffalse
, HuOth
);
5610 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits0
);
5613 mkUifU(mce
, ty
, assignNew('V', mce
, ty
,
5614 IRExpr_ITE(cond
, vbits1
, vbits0
)),
5615 mkPCastTo(mce
, ty
, vbitsC
) );
5618 /* --------- This is the main expression-handling function. --------- */
5621 IRExpr
* expr2vbits ( MCEnv
* mce
, IRExpr
* e
,
5622 HowUsed hu
/*use HuOth if unknown*/ )
5627 return shadow_GET( mce
, e
->Iex
.Get
.offset
, e
->Iex
.Get
.ty
);
5630 return shadow_GETI( mce
, e
->Iex
.GetI
.descr
,
5631 e
->Iex
.GetI
.ix
, e
->Iex
.GetI
.bias
);
5634 return IRExpr_RdTmp( findShadowTmpV(mce
, e
->Iex
.RdTmp
.tmp
) );
5637 return definedOfType(shadowTypeV(typeOfIRExpr(mce
->sb
->tyenv
, e
)));
5640 return expr2vbits_Qop(
5642 e
->Iex
.Qop
.details
->op
,
5643 e
->Iex
.Qop
.details
->arg1
, e
->Iex
.Qop
.details
->arg2
,
5644 e
->Iex
.Qop
.details
->arg3
, e
->Iex
.Qop
.details
->arg4
5648 return expr2vbits_Triop(
5650 e
->Iex
.Triop
.details
->op
,
5651 e
->Iex
.Triop
.details
->arg1
, e
->Iex
.Triop
.details
->arg2
,
5652 e
->Iex
.Triop
.details
->arg3
5656 return expr2vbits_Binop(
5659 e
->Iex
.Binop
.arg1
, e
->Iex
.Binop
.arg2
,
5664 return expr2vbits_Unop( mce
, e
->Iex
.Unop
.op
, e
->Iex
.Unop
.arg
);
5667 return expr2vbits_Load( mce
, e
->Iex
.Load
.end
,
5669 e
->Iex
.Load
.addr
, 0/*addr bias*/,
5670 NULL
/* guard == "always True"*/ );
5673 return mkLazyN( mce
, e
->Iex
.CCall
.args
,
5678 return expr2vbits_ITE( mce
, e
->Iex
.ITE
.cond
, e
->Iex
.ITE
.iftrue
,
5679 e
->Iex
.ITE
.iffalse
);
5685 VG_(tool_panic
)("memcheck: expr2vbits");
5690 /*------------------------------------------------------------*/
5691 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5692 /*------------------------------------------------------------*/
5694 /* Widen a value to the host word size. */
5697 IRExpr
* zwidenToHostWord ( MCEnv
* mce
, IRAtom
* vatom
)
5701 /* vatom is vbits-value and as such can only have a shadow type. */
5702 tl_assert(isShadowAtom(mce
,vatom
));
5704 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
5707 if (tyH
== Ity_I32
) {
5712 return assignNew('V', mce
, tyH
, unop(Iop_16Uto32
, vatom
));
5714 return assignNew('V', mce
, tyH
, unop(Iop_8Uto32
, vatom
));
5719 if (tyH
== Ity_I64
) {
5722 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
, vatom
));
5724 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5725 assignNew('V', mce
, Ity_I32
, unop(Iop_16Uto32
, vatom
))));
5727 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5728 assignNew('V', mce
, Ity_I32
, unop(Iop_8Uto32
, vatom
))));
5736 VG_(printf
)("\nty = "); ppIRType(ty
); VG_(printf
)("\n");
5737 VG_(tool_panic
)("zwidenToHostWord");
5741 /* Generate a shadow store. |addr| is always the original address
5742 atom. You can pass in either originals or V-bits for the data
5743 atom, but obviously not both. This function generates a check for
5744 the definedness and (indirectly) the validity of |addr|, but only
5745 when |guard| evaluates to True at run time (or is NULL).
5747 |guard| :: Ity_I1 controls whether the store really happens; NULL
5748 means it unconditionally does. Note that |guard| itself is not
5749 checked for definedness; the caller of this function must do that
5753 void do_shadow_Store ( MCEnv
* mce
,
5755 IRAtom
* addr
, UInt bias
,
5756 IRAtom
* data
, IRAtom
* vdata
,
5761 void* helper
= NULL
;
5762 const HChar
* hname
= NULL
;
5765 tyAddr
= mce
->hWordTy
;
5766 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
5767 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
5768 tl_assert( end
== Iend_LE
|| end
== Iend_BE
);
5772 tl_assert(isOriginalAtom(mce
, data
));
5773 tl_assert(bias
== 0);
5774 vdata
= expr2vbits( mce
, data
, HuOth
);
5779 tl_assert(isOriginalAtom(mce
,addr
));
5780 tl_assert(isShadowAtom(mce
,vdata
));
5783 tl_assert(isOriginalAtom(mce
, guard
));
5784 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
5787 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vdata
);
5789 // If we're not doing undefined value checking, pretend that this value
5790 // is "all valid". That lets Vex's optimiser remove some of the V bit
5791 // shadow computation ops that precede it.
5792 if (MC_(clo_mc_level
) == 1) {
5794 case Ity_V256
: // V256 weirdness -- used four times
5795 c
= IRConst_V256(V_BITS32_DEFINED
); break;
5796 case Ity_V128
: // V128 weirdness -- used twice
5797 c
= IRConst_V128(V_BITS16_DEFINED
); break;
5798 case Ity_I64
: c
= IRConst_U64 (V_BITS64_DEFINED
); break;
5799 case Ity_I32
: c
= IRConst_U32 (V_BITS32_DEFINED
); break;
5800 case Ity_I16
: c
= IRConst_U16 (V_BITS16_DEFINED
); break;
5801 case Ity_I8
: c
= IRConst_U8 (V_BITS8_DEFINED
); break;
5802 default: VG_(tool_panic
)("memcheck:do_shadow_Store(LE)");
5804 vdata
= IRExpr_Const( c
);
5807 /* First, emit a definedness test for the address. This also sets
5808 the address (shadow) to 'defined' following the test. Both of
5809 those actions are gated on |guard|. */
5810 complainIfUndefined( mce
, addr
, guard
);
5812 /* Now decide which helper function to call to write the data V
5813 bits into shadow memory. */
5814 if (end
== Iend_LE
) {
5816 case Ity_V256
: /* we'll use the helper four times */
5817 case Ity_V128
: /* we'll use the helper twice */
5818 case Ity_I64
: helper
= &MC_(helperc_STOREV64le
);
5819 hname
= "MC_(helperc_STOREV64le)";
5821 case Ity_I32
: helper
= &MC_(helperc_STOREV32le
);
5822 hname
= "MC_(helperc_STOREV32le)";
5824 case Ity_I16
: helper
= &MC_(helperc_STOREV16le
);
5825 hname
= "MC_(helperc_STOREV16le)";
5827 case Ity_I8
: helper
= &MC_(helperc_STOREV8
);
5828 hname
= "MC_(helperc_STOREV8)";
5830 default: VG_(tool_panic
)("memcheck:do_shadow_Store(LE)");
5834 case Ity_V128
: /* we'll use the helper twice */
5835 case Ity_I64
: helper
= &MC_(helperc_STOREV64be
);
5836 hname
= "MC_(helperc_STOREV64be)";
5838 case Ity_I32
: helper
= &MC_(helperc_STOREV32be
);
5839 hname
= "MC_(helperc_STOREV32be)";
5841 case Ity_I16
: helper
= &MC_(helperc_STOREV16be
);
5842 hname
= "MC_(helperc_STOREV16be)";
5844 case Ity_I8
: helper
= &MC_(helperc_STOREV8
);
5845 hname
= "MC_(helperc_STOREV8)";
5847 /* Note, no V256 case here, because no big-endian target that
5848 we support, has 256 vectors. */
5849 default: VG_(tool_panic
)("memcheck:do_shadow_Store(BE)");
5853 if (UNLIKELY(ty
== Ity_V256
)) {
5855 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5856 Q3 being the most significant lane. */
5857 /* These are the offsets of the Qs in memory. */
5858 Int offQ0
, offQ1
, offQ2
, offQ3
;
5860 /* Various bits for constructing the 4 lane helper calls */
5861 IRDirty
*diQ0
, *diQ1
, *diQ2
, *diQ3
;
5862 IRAtom
*addrQ0
, *addrQ1
, *addrQ2
, *addrQ3
;
5863 IRAtom
*vdataQ0
, *vdataQ1
, *vdataQ2
, *vdataQ3
;
5864 IRAtom
*eBiasQ0
, *eBiasQ1
, *eBiasQ2
, *eBiasQ3
;
5866 if (end
== Iend_LE
) {
5867 offQ0
= 0; offQ1
= 8; offQ2
= 16; offQ3
= 24;
5869 offQ3
= 0; offQ2
= 8; offQ1
= 16; offQ0
= 24;
5872 eBiasQ0
= tyAddr
==Ity_I32
? mkU32(bias
+offQ0
) : mkU64(bias
+offQ0
);
5873 addrQ0
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ0
) );
5874 vdataQ0
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_0
, vdata
));
5875 diQ0
= unsafeIRDirty_0_N(
5877 hname
, VG_(fnptr_to_fnentry
)( helper
),
5878 mkIRExprVec_2( addrQ0
, vdataQ0
)
5881 eBiasQ1
= tyAddr
==Ity_I32
? mkU32(bias
+offQ1
) : mkU64(bias
+offQ1
);
5882 addrQ1
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ1
) );
5883 vdataQ1
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_1
, vdata
));
5884 diQ1
= unsafeIRDirty_0_N(
5886 hname
, VG_(fnptr_to_fnentry
)( helper
),
5887 mkIRExprVec_2( addrQ1
, vdataQ1
)
5890 eBiasQ2
= tyAddr
==Ity_I32
? mkU32(bias
+offQ2
) : mkU64(bias
+offQ2
);
5891 addrQ2
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ2
) );
5892 vdataQ2
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_2
, vdata
));
5893 diQ2
= unsafeIRDirty_0_N(
5895 hname
, VG_(fnptr_to_fnentry
)( helper
),
5896 mkIRExprVec_2( addrQ2
, vdataQ2
)
5899 eBiasQ3
= tyAddr
==Ity_I32
? mkU32(bias
+offQ3
) : mkU64(bias
+offQ3
);
5900 addrQ3
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ3
) );
5901 vdataQ3
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_3
, vdata
));
5902 diQ3
= unsafeIRDirty_0_N(
5904 hname
, VG_(fnptr_to_fnentry
)( helper
),
5905 mkIRExprVec_2( addrQ3
, vdataQ3
)
5909 diQ0
->guard
= diQ1
->guard
= diQ2
->guard
= diQ3
->guard
= guard
;
5911 setHelperAnns( mce
, diQ0
);
5912 setHelperAnns( mce
, diQ1
);
5913 setHelperAnns( mce
, diQ2
);
5914 setHelperAnns( mce
, diQ3
);
5915 stmt( 'V', mce
, IRStmt_Dirty(diQ0
) );
5916 stmt( 'V', mce
, IRStmt_Dirty(diQ1
) );
5917 stmt( 'V', mce
, IRStmt_Dirty(diQ2
) );
5918 stmt( 'V', mce
, IRStmt_Dirty(diQ3
) );
5921 else if (UNLIKELY(ty
== Ity_V128
)) {
5924 /* See comment in next clause re 64-bit regparms */
5925 /* also, need to be careful about endianness */
5927 Int offLo64
, offHi64
;
5928 IRDirty
*diLo64
, *diHi64
;
5929 IRAtom
*addrLo64
, *addrHi64
;
5930 IRAtom
*vdataLo64
, *vdataHi64
;
5931 IRAtom
*eBiasLo64
, *eBiasHi64
;
5933 if (end
== Iend_LE
) {
5941 eBiasLo64
= tyAddr
==Ity_I32
? mkU32(bias
+offLo64
) : mkU64(bias
+offLo64
);
5942 addrLo64
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasLo64
) );
5943 vdataLo64
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vdata
));
5944 diLo64
= unsafeIRDirty_0_N(
5946 hname
, VG_(fnptr_to_fnentry
)( helper
),
5947 mkIRExprVec_2( addrLo64
, vdataLo64
)
5949 eBiasHi64
= tyAddr
==Ity_I32
? mkU32(bias
+offHi64
) : mkU64(bias
+offHi64
);
5950 addrHi64
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasHi64
) );
5951 vdataHi64
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, vdata
));
5952 diHi64
= unsafeIRDirty_0_N(
5954 hname
, VG_(fnptr_to_fnentry
)( helper
),
5955 mkIRExprVec_2( addrHi64
, vdataHi64
)
5957 if (guard
) diLo64
->guard
= guard
;
5958 if (guard
) diHi64
->guard
= guard
;
5959 setHelperAnns( mce
, diLo64
);
5960 setHelperAnns( mce
, diHi64
);
5961 stmt( 'V', mce
, IRStmt_Dirty(diLo64
) );
5962 stmt( 'V', mce
, IRStmt_Dirty(diHi64
) );
5969 /* 8/16/32/64-bit cases */
5970 /* Generate the actual address into addrAct. */
5974 IRAtom
* eBias
= tyAddr
==Ity_I32
? mkU32(bias
) : mkU64(bias
);
5975 addrAct
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBias
));
5978 if (ty
== Ity_I64
) {
5979 /* We can't do this with regparm 2 on 32-bit platforms, since
5980 the back ends aren't clever enough to handle 64-bit
5981 regparm args. Therefore be different. */
5982 di
= unsafeIRDirty_0_N(
5984 hname
, VG_(fnptr_to_fnentry
)( helper
),
5985 mkIRExprVec_2( addrAct
, vdata
)
5988 di
= unsafeIRDirty_0_N(
5990 hname
, VG_(fnptr_to_fnentry
)( helper
),
5991 mkIRExprVec_2( addrAct
,
5992 zwidenToHostWord( mce
, vdata
))
5995 if (guard
) di
->guard
= guard
;
5996 setHelperAnns( mce
, di
);
5997 stmt( 'V', mce
, IRStmt_Dirty(di
) );
6003 /* Do lazy pessimistic propagation through a dirty helper call, by
6004 looking at the annotations on it. This is the most complex part of
6007 static IRType
szToITy ( Int n
)
6010 case 1: return Ity_I8
;
6011 case 2: return Ity_I16
;
6012 case 4: return Ity_I32
;
6013 case 8: return Ity_I64
;
6014 default: VG_(tool_panic
)("szToITy(memcheck)");
6019 void do_shadow_Dirty ( MCEnv
* mce
, IRDirty
* d
)
6021 Int i
, k
, n
, toDo
, gSz
, gOff
;
6022 IRAtom
*src
, *here
, *curr
;
6023 IRType tySrc
, tyDst
;
6027 /* What's the native endianness? We need to know this. */
6028 # if defined(VG_BIGENDIAN)
6030 # elif defined(VG_LITTLEENDIAN)
6033 # error "Unknown endianness"
6036 /* First check the guard. */
6037 complainIfUndefined(mce
, d
->guard
, NULL
);
6039 /* Now round up all inputs and PCast over them. */
6040 curr
= definedOfType(Ity_I32
);
6042 /* Inputs: unmasked args
6043 Note: arguments are evaluated REGARDLESS of the guard expression */
6044 for (i
= 0; d
->args
[i
]; i
++) {
6045 IRAtom
* arg
= d
->args
[i
];
6046 if ( (d
->cee
->mcx_mask
& (1<<i
))
6047 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
6048 /* ignore this arg */
6050 here
= mkPCastTo( mce
, Ity_I32
, expr2vbits(mce
, arg
, HuOth
) );
6051 curr
= mkUifU32(mce
, here
, curr
);
6055 /* Inputs: guest state that we read. */
6056 for (i
= 0; i
< d
->nFxState
; i
++) {
6057 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6058 if (d
->fxState
[i
].fx
== Ifx_Write
)
6061 /* Enumerate the described state segments */
6062 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6063 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6064 gSz
= d
->fxState
[i
].size
;
6066 /* Ignore any sections marked as 'always defined'. */
6067 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
6069 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6074 /* This state element is read or modified. So we need to
6075 consider it. If larger than 8 bytes, deal with it in
6078 tl_assert(gSz
>= 0);
6079 if (gSz
== 0) break;
6080 n
= gSz
<= 8 ? gSz
: 8;
6081 /* update 'curr' with UifU of the state slice
6083 tySrc
= szToITy( n
);
6085 /* Observe the guard expression. If it is false use an
6086 all-bits-defined bit pattern */
6087 IRAtom
*cond
, *iffalse
, *iftrue
;
6089 cond
= assignNew('V', mce
, Ity_I1
, d
->guard
);
6090 iftrue
= assignNew('V', mce
, tySrc
, shadow_GET(mce
, gOff
, tySrc
));
6091 iffalse
= assignNew('V', mce
, tySrc
, definedOfType(tySrc
));
6092 src
= assignNew('V', mce
, tySrc
,
6093 IRExpr_ITE(cond
, iftrue
, iffalse
));
6095 here
= mkPCastTo( mce
, Ity_I32
, src
);
6096 curr
= mkUifU32(mce
, here
, curr
);
6103 /* Inputs: memory. First set up some info needed regardless of
6104 whether we're doing reads or writes. */
6106 if (d
->mFx
!= Ifx_None
) {
6107 /* Because we may do multiple shadow loads/stores from the same
6108 base address, it's best to do a single test of its
6109 definedness right now. Post-instrumentation optimisation
6110 should remove all but this test. */
6112 tl_assert(d
->mAddr
);
6113 complainIfUndefined(mce
, d
->mAddr
, d
->guard
);
6115 tyAddr
= typeOfIRExpr(mce
->sb
->tyenv
, d
->mAddr
);
6116 tl_assert(tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
6117 tl_assert(tyAddr
== mce
->hWordTy
); /* not really right */
6120 /* Deal with memory inputs (reads or modifies) */
6121 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
6123 /* chew off 32-bit chunks. We don't care about the endianness
6124 since it's all going to be condensed down to a single bit,
6125 but nevertheless choose an endianness which is hopefully
6126 native to the platform. */
6130 expr2vbits_Load_guarded_Simple(
6131 mce
, end
, Ity_I32
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6133 curr
= mkUifU32(mce
, here
, curr
);
6136 /* chew off 16-bit chunks */
6140 expr2vbits_Load_guarded_Simple(
6141 mce
, end
, Ity_I16
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6143 curr
= mkUifU32(mce
, here
, curr
);
6146 /* chew off the remaining 8-bit chunk, if any */
6150 expr2vbits_Load_guarded_Simple(
6151 mce
, end
, Ity_I8
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6153 curr
= mkUifU32(mce
, here
, curr
);
6156 tl_assert(toDo
== 0);
6159 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6160 all the inputs to the helper. Now we need to re-distribute the
6161 results to all destinations. */
6163 /* Outputs: the destination temporary, if there is one. */
6164 if (d
->tmp
!= IRTemp_INVALID
) {
6165 dst
= findShadowTmpV(mce
, d
->tmp
);
6166 tyDst
= typeOfIRTemp(mce
->sb
->tyenv
, d
->tmp
);
6167 assign( 'V', mce
, dst
, mkPCastTo( mce
, tyDst
, curr
) );
6170 /* Outputs: guest state that we write or modify. */
6171 for (i
= 0; i
< d
->nFxState
; i
++) {
6172 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6173 if (d
->fxState
[i
].fx
== Ifx_Read
)
6176 /* Enumerate the described state segments */
6177 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6178 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6179 gSz
= d
->fxState
[i
].size
;
6181 /* Ignore any sections marked as 'always defined'. */
6182 if (isAlwaysDefd(mce
, gOff
, gSz
))
6185 /* This state element is written or modified. So we need to
6186 consider it. If larger than 8 bytes, deal with it in
6189 tl_assert(gSz
>= 0);
6190 if (gSz
== 0) break;
6191 n
= gSz
<= 8 ? gSz
: 8;
6192 /* Write suitably-casted 'curr' to the state slice
6194 tyDst
= szToITy( n
);
6195 do_shadow_PUT( mce
, gOff
,
6196 NULL
, /* original atom */
6197 mkPCastTo( mce
, tyDst
, curr
), d
->guard
);
6204 /* Outputs: memory that we write or modify. Same comments about
6205 endianness as above apply. */
6206 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
6208 /* chew off 32-bit chunks */
6210 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6211 NULL
, /* original data */
6212 mkPCastTo( mce
, Ity_I32
, curr
),
6216 /* chew off 16-bit chunks */
6218 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6219 NULL
, /* original data */
6220 mkPCastTo( mce
, Ity_I16
, curr
),
6224 /* chew off the remaining 8-bit chunk, if any */
6226 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6227 NULL
, /* original data */
6228 mkPCastTo( mce
, Ity_I8
, curr
),
6232 tl_assert(toDo
== 0);
6238 /* We have an ABI hint telling us that [base .. base+len-1] is to
6239 become undefined ("writable"). Generate code to call a helper to
6240 notify the A/V bit machinery of this fact.
6243 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6247 void do_AbiHint ( MCEnv
* mce
, IRExpr
* base
, Int len
, IRExpr
* nia
)
6251 if (MC_(clo_mc_level
) == 3) {
6252 di
= unsafeIRDirty_0_N(
6254 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6255 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_w_o
) ),
6256 mkIRExprVec_3( base
, mkIRExpr_HWord( (UInt
)len
), nia
)
6259 /* We ignore the supplied nia, since it is irrelevant. */
6260 tl_assert(MC_(clo_mc_level
) == 2 || MC_(clo_mc_level
) == 1);
6261 /* Special-case the len==128 case, since that is for amd64-ELF,
6262 which is a very common target. */
6264 di
= unsafeIRDirty_0_N(
6266 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6267 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o
)),
6268 mkIRExprVec_1( base
)
6271 di
= unsafeIRDirty_0_N(
6273 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6274 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_no_o
) ),
6275 mkIRExprVec_2( base
, mkIRExpr_HWord( (UInt
)len
) )
6280 stmt( 'V', mce
, IRStmt_Dirty(di
) );
6284 /* ------ Dealing with IRCAS (big and complex) ------ */
6287 static IRAtom
* gen_load_b ( MCEnv
* mce
, Int szB
,
6288 IRAtom
* baseaddr
, Int offset
);
6289 static IRAtom
* gen_maxU32 ( MCEnv
* mce
, IRAtom
* b1
, IRAtom
* b2
);
6290 static void gen_store_b ( MCEnv
* mce
, Int szB
,
6291 IRAtom
* baseaddr
, Int offset
, IRAtom
* dataB
,
6294 static void do_shadow_CAS_single ( MCEnv
* mce
, IRCAS
* cas
);
6295 static void do_shadow_CAS_double ( MCEnv
* mce
, IRCAS
* cas
);
6298 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6299 IRExpr.Consts, else this asserts. If they are both Consts, it
6300 doesn't do anything. So that just leaves the RdTmp case.
6302 In which case: this assigns the shadow value SHADOW to the IR
6303 shadow temporary associated with ORIG. That is, ORIG, being an
6304 original temporary, will have a shadow temporary associated with
6305 it. However, in the case envisaged here, there will so far have
6306 been no IR emitted to actually write a shadow value into that
6307 temporary. What this routine does is to (emit IR to) copy the
6308 value in SHADOW into said temporary, so that after this call,
6309 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6312 Point is to allow callers to compute "by hand" a shadow value for
6313 ORIG, and force it to be associated with ORIG.
6315 How do we know that that shadow associated with ORIG has not so far
6316 been assigned to? Well, we don't per se know that, but supposing
6317 it had. Then this routine would create a second assignment to it,
6318 and later the IR sanity checker would barf. But that never
6321 static void bind_shadow_tmp_to_orig ( UChar how
,
6323 IRAtom
* orig
, IRAtom
* shadow
)
6325 tl_assert(isOriginalAtom(mce
, orig
));
6326 tl_assert(isShadowAtom(mce
, shadow
));
6327 switch (orig
->tag
) {
6329 tl_assert(shadow
->tag
== Iex_Const
);
6332 tl_assert(shadow
->tag
== Iex_RdTmp
);
6334 assign('V', mce
, findShadowTmpV(mce
,orig
->Iex
.RdTmp
.tmp
),
6337 tl_assert(how
== 'B');
6338 assign('B', mce
, findShadowTmpB(mce
,orig
->Iex
.RdTmp
.tmp
),
6349 void do_shadow_CAS ( MCEnv
* mce
, IRCAS
* cas
)
6351 /* Scheme is (both single- and double- cases):
6353 1. fetch data#,dataB (the proposed new value)
6355 2. fetch expd#,expdB (what we expect to see at the address)
6357 3. check definedness of address
6359 4. load old#,oldB from shadow memory; this also checks
6360 addressibility of the address
6364 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6366 7. if "expected == old" (as computed by (6))
6367 store data#,dataB to shadow memory
6369 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6370 'data' but 7 stores 'data#'. Hence it is possible for the
6371 shadow data to be incorrectly checked and/or updated:
6373 * 7 is at least gated correctly, since the 'expected == old'
6374 condition is derived from outputs of 5. However, the shadow
6375 write could happen too late: imagine after 5 we are
6376 descheduled, a different thread runs, writes a different
6377 (shadow) value at the address, and then we resume, hence
6378 overwriting the shadow value written by the other thread.
6380 Because the original memory access is atomic, there's no way to
6381 make both the original and shadow accesses into a single atomic
6382 thing, hence this is unavoidable.
6384 At least as Valgrind stands, I don't think it's a problem, since
6385 we're single threaded *and* we guarantee that there are no
6386 context switches during the execution of any specific superblock
6387 -- context switches can only happen at superblock boundaries.
6389 If Valgrind ever becomes MT in the future, then it might be more
6390 of a problem. A possible kludge would be to artificially
6391 associate with the location, a lock, which we must acquire and
6392 release around the transaction as a whole. Hmm, that probably
6393 would't work properly since it only guards us against other
6394 threads doing CASs on the same location, not against other
6395 threads doing normal reads and writes.
6397 ------------------------------------------------------------
6399 COMMENT_ON_CasCmpEQ:
6401 Note two things. Firstly, in the sequence above, we compute
6402 "expected == old", but we don't check definedness of it. Why
6403 not? Also, the x86 and amd64 front ends use
6404 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6405 determination (expected == old ?) for themselves, and we also
6406 don't check definedness for those primops; we just say that the
6407 result is defined. Why? Details follow.
6409 x86/amd64 contains various forms of locked insns:
6410 * lock prefix before all basic arithmetic insn;
6411 eg lock xorl %reg1,(%reg2)
6412 * atomic exchange reg-mem
6415 Rather than attempt to represent them all, which would be a
6416 royal PITA, I used a result from Maurice Herlihy
6417 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6418 demonstrates that compare-and-swap is a primitive more general
6419 than the other two, and so can be used to represent all of them.
6420 So the translation scheme for (eg) lock incl (%reg) is as
6426 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6428 The "atomically" is the CAS bit. The scheme is always the same:
6429 get old value from memory, compute new value, atomically stuff
6430 new value back in memory iff the old value has not changed (iow,
6431 no other thread modified it in the meantime). If it has changed
6432 then we've been out-raced and we have to start over.
6434 Now that's all very neat, but it has the bad side effect of
6435 introducing an explicit equality test into the translation.
6436 Consider the behaviour of said code on a memory location which
6437 is uninitialised. We will wind up doing a comparison on
6438 uninitialised data, and mc duly complains.
6440 What's difficult about this is, the common case is that the
6441 location is uncontended, and so we're usually comparing the same
6442 value (* %reg) with itself. So we shouldn't complain even if it
6443 is undefined. But mc doesn't know that.
6445 My solution is to mark the == in the IR specially, so as to tell
6446 mc that it almost certainly compares a value with itself, and we
6447 should just regard the result as always defined. Rather than
6448 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6449 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6451 So there's always the question of, can this give a false
6452 negative? eg, imagine that initially, * %reg is defined; and we
6453 read that; but then in the gap between the read and the CAS, a
6454 different thread writes an undefined (and different) value at
6455 the location. Then the CAS in this thread will fail and we will
6456 go back to "again:", but without knowing that the trip back
6457 there was based on an undefined comparison. No matter; at least
6458 the other thread won the race and the location is correctly
6459 marked as undefined. What if it wrote an uninitialised version
6460 of the same value that was there originally, though?
6462 etc etc. Seems like there's a small corner case in which we
6463 might lose the fact that something's defined -- we're out-raced
6464 in between the "old = * reg" and the "atomically {", _and_ the
6465 other thread is writing in an undefined version of what's
6466 already there. Well, that seems pretty unlikely.
6470 If we ever need to reinstate it .. code which generates a
6471 definedness test for "expected == old" was removed at r10432 of
6474 if (cas
->oldHi
== IRTemp_INVALID
) {
6475 do_shadow_CAS_single( mce
, cas
);
6477 do_shadow_CAS_double( mce
, cas
);
6482 static void do_shadow_CAS_single ( MCEnv
* mce
, IRCAS
* cas
)
6484 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6485 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6486 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6487 IRAtom
*expd_eq_old
= NULL
;
6491 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6494 tl_assert(cas
->oldHi
== IRTemp_INVALID
);
6495 tl_assert(cas
->expdHi
== NULL
);
6496 tl_assert(cas
->dataHi
== NULL
);
6498 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6500 case Ity_I8
: elemSzB
= 1; opCasCmpEQ
= Iop_CasCmpEQ8
; break;
6501 case Ity_I16
: elemSzB
= 2; opCasCmpEQ
= Iop_CasCmpEQ16
; break;
6502 case Ity_I32
: elemSzB
= 4; opCasCmpEQ
= Iop_CasCmpEQ32
; break;
6503 case Ity_I64
: elemSzB
= 8; opCasCmpEQ
= Iop_CasCmpEQ64
; break;
6504 default: tl_assert(0); /* IR defn disallows any other types */
6507 /* 1. fetch data# (the proposed new value) */
6508 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6510 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6511 tl_assert(isShadowAtom(mce
, vdataLo
));
6514 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6515 tl_assert(isShadowAtom(mce
, bdataLo
));
6518 /* 2. fetch expected# (what we expect to see at the address) */
6519 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6521 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6522 tl_assert(isShadowAtom(mce
, vexpdLo
));
6525 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6526 tl_assert(isShadowAtom(mce
, bexpdLo
));
6529 /* 3. check definedness of address */
6530 /* 4. fetch old# from shadow memory; this also checks
6531 addressibility of the address */
6537 cas
->end
, elemTy
, cas
->addr
, 0/*Addr bias*/,
6538 NULL
/*always happens*/
6540 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6543 = assignNew('B', mce
, Ity_I32
,
6544 gen_load_b(mce
, elemSzB
, cas
->addr
, 0/*addr bias*/));
6545 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6548 /* 5. the CAS itself */
6549 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6551 /* 6. compute "expected == old" */
6552 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6553 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6554 tree, but it's not copied from the input block. */
6556 = assignNew('C', mce
, Ity_I1
,
6557 binop(opCasCmpEQ
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6559 /* 7. if "expected == old"
6560 store data# to shadow memory */
6561 do_shadow_Store( mce
, cas
->end
, cas
->addr
, 0/*bias*/,
6562 NULL
/*data*/, vdataLo
/*vdata*/,
6563 expd_eq_old
/*guard for store*/ );
6565 gen_store_b( mce
, elemSzB
, cas
->addr
, 0/*offset*/,
6567 expd_eq_old
/*guard for store*/ );
6572 static void do_shadow_CAS_double ( MCEnv
* mce
, IRCAS
* cas
)
6574 IRAtom
*vdataHi
= NULL
, *bdataHi
= NULL
;
6575 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6576 IRAtom
*vexpdHi
= NULL
, *bexpdHi
= NULL
;
6577 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6578 IRAtom
*voldHi
= NULL
, *boldHi
= NULL
;
6579 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6580 IRAtom
*xHi
= NULL
, *xLo
= NULL
, *xHL
= NULL
;
6581 IRAtom
*expd_eq_old
= NULL
, *zero
= NULL
;
6582 IROp opCasCmpEQ
, opOr
, opXor
;
6583 Int elemSzB
, memOffsLo
, memOffsHi
;
6585 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6588 tl_assert(cas
->oldHi
!= IRTemp_INVALID
);
6589 tl_assert(cas
->expdHi
!= NULL
);
6590 tl_assert(cas
->dataHi
!= NULL
);
6592 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6595 opCasCmpEQ
= Iop_CasCmpEQ8
; opOr
= Iop_Or8
; opXor
= Iop_Xor8
;
6596 elemSzB
= 1; zero
= mkU8(0);
6599 opCasCmpEQ
= Iop_CasCmpEQ16
; opOr
= Iop_Or16
; opXor
= Iop_Xor16
;
6600 elemSzB
= 2; zero
= mkU16(0);
6603 opCasCmpEQ
= Iop_CasCmpEQ32
; opOr
= Iop_Or32
; opXor
= Iop_Xor32
;
6604 elemSzB
= 4; zero
= mkU32(0);
6607 opCasCmpEQ
= Iop_CasCmpEQ64
; opOr
= Iop_Or64
; opXor
= Iop_Xor64
;
6608 elemSzB
= 8; zero
= mkU64(0);
6611 tl_assert(0); /* IR defn disallows any other types */
6614 /* 1. fetch data# (the proposed new value) */
6615 tl_assert(isOriginalAtom(mce
, cas
->dataHi
));
6616 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6618 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataHi
, HuOth
));
6620 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6621 tl_assert(isShadowAtom(mce
, vdataHi
));
6622 tl_assert(isShadowAtom(mce
, vdataLo
));
6625 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataHi
));
6627 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6628 tl_assert(isShadowAtom(mce
, bdataHi
));
6629 tl_assert(isShadowAtom(mce
, bdataLo
));
6632 /* 2. fetch expected# (what we expect to see at the address) */
6633 tl_assert(isOriginalAtom(mce
, cas
->expdHi
));
6634 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6636 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdHi
, HuOth
));
6638 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6639 tl_assert(isShadowAtom(mce
, vexpdHi
));
6640 tl_assert(isShadowAtom(mce
, vexpdLo
));
6643 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdHi
));
6645 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6646 tl_assert(isShadowAtom(mce
, bexpdHi
));
6647 tl_assert(isShadowAtom(mce
, bexpdLo
));
6650 /* 3. check definedness of address */
6651 /* 4. fetch old# from shadow memory; this also checks
6652 addressibility of the address */
6653 if (cas
->end
== Iend_LE
) {
6655 memOffsHi
= elemSzB
;
6657 tl_assert(cas
->end
== Iend_BE
);
6658 memOffsLo
= elemSzB
;
6666 cas
->end
, elemTy
, cas
->addr
, memOffsHi
/*Addr bias*/,
6667 NULL
/*always happens*/
6674 cas
->end
, elemTy
, cas
->addr
, memOffsLo
/*Addr bias*/,
6675 NULL
/*always happens*/
6677 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldHi
), voldHi
);
6678 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6681 = assignNew('B', mce
, Ity_I32
,
6682 gen_load_b(mce
, elemSzB
, cas
->addr
,
6683 memOffsHi
/*addr bias*/));
6685 = assignNew('B', mce
, Ity_I32
,
6686 gen_load_b(mce
, elemSzB
, cas
->addr
,
6687 memOffsLo
/*addr bias*/));
6688 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldHi
), boldHi
);
6689 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6692 /* 5. the CAS itself */
6693 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6695 /* 6. compute "expected == old" */
6696 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6697 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6698 tree, but it's not copied from the input block. */
6700 xHi = oldHi ^ expdHi;
6701 xLo = oldLo ^ expdLo;
6703 expd_eq_old = xHL == 0;
6705 xHi
= assignNew('C', mce
, elemTy
,
6706 binop(opXor
, cas
->expdHi
, mkexpr(cas
->oldHi
)));
6707 xLo
= assignNew('C', mce
, elemTy
,
6708 binop(opXor
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6709 xHL
= assignNew('C', mce
, elemTy
,
6710 binop(opOr
, xHi
, xLo
));
6712 = assignNew('C', mce
, Ity_I1
,
6713 binop(opCasCmpEQ
, xHL
, zero
));
6715 /* 7. if "expected == old"
6716 store data# to shadow memory */
6717 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsHi
/*bias*/,
6718 NULL
/*data*/, vdataHi
/*vdata*/,
6719 expd_eq_old
/*guard for store*/ );
6720 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsLo
/*bias*/,
6721 NULL
/*data*/, vdataLo
/*vdata*/,
6722 expd_eq_old
/*guard for store*/ );
6724 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsHi
/*offset*/,
6726 expd_eq_old
/*guard for store*/ );
6727 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsLo
/*offset*/,
6729 expd_eq_old
/*guard for store*/ );
6734 /* ------ Dealing with LL/SC (not difficult) ------ */
6736 static void do_shadow_LLSC ( MCEnv
* mce
,
6740 IRExpr
* stStoredata
)
6742 /* In short: treat a load-linked like a normal load followed by an
6743 assignment of the loaded (shadow) data to the result temporary.
6744 Treat a store-conditional like a normal store, and mark the
6745 result temporary as defined. */
6746 IRType resTy
= typeOfIRTemp(mce
->sb
->tyenv
, stResult
);
6747 IRTemp resTmp
= findShadowTmpV(mce
, stResult
);
6749 tl_assert(isIRAtom(stAddr
));
6751 tl_assert(isIRAtom(stStoredata
));
6753 if (stStoredata
== NULL
) {
6755 /* Just treat this as a normal load, followed by an assignment of
6756 the value to .result. */
6758 tl_assert(resTy
== Ity_I64
|| resTy
== Ity_I32
6759 || resTy
== Ity_I16
|| resTy
== Ity_I8
);
6760 assign( 'V', mce
, resTmp
,
6762 mce
, stEnd
, resTy
, stAddr
, 0/*addr bias*/,
6763 NULL
/*always happens*/) );
6765 /* Store Conditional */
6767 IRType dataTy
= typeOfIRExpr(mce
->sb
->tyenv
,
6769 tl_assert(dataTy
== Ity_I64
|| dataTy
== Ity_I32
6770 || dataTy
== Ity_I16
|| dataTy
== Ity_I8
);
6771 do_shadow_Store( mce
, stEnd
,
6772 stAddr
, 0/* addr bias */,
6774 NULL
/* shadow data */,
6776 /* This is a store conditional, so it writes to .result a value
6777 indicating whether or not the store succeeded. Just claim
6778 this value is always defined. In the PowerPC interpretation
6779 of store-conditional, definedness of the success indication
6780 depends on whether the address of the store matches the
6781 reservation address. But we can't tell that here (and
6782 anyway, we're not being PowerPC-specific). At least we are
6783 guaranteed that the definedness of the store address, and its
6784 addressibility, will be checked as per normal. So it seems
6785 pretty safe to just say that the success indication is always
6788 In schemeS, for origin tracking, we must correspondingly set
6789 a no-origin value for the origin shadow of .result.
6791 tl_assert(resTy
== Ity_I1
);
6792 assign( 'V', mce
, resTmp
, definedOfType(resTy
) );
6797 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6799 static void do_shadow_StoreG ( MCEnv
* mce
, IRStoreG
* sg
)
6801 complainIfUndefined(mce
, sg
->guard
, NULL
);
6802 /* do_shadow_Store will generate code to check the definedness and
6803 validity of sg->addr, in the case where sg->guard evaluates to
6804 True at run-time. */
6805 do_shadow_Store( mce
, sg
->end
,
6806 sg
->addr
, 0/* addr bias */,
6808 NULL
/* shadow data */,
6812 static void do_shadow_LoadG ( MCEnv
* mce
, IRLoadG
* lg
)
6814 complainIfUndefined(mce
, lg
->guard
, NULL
);
6815 /* expr2vbits_Load_guarded_General will generate code to check the
6816 definedness and validity of lg->addr, in the case where
6817 lg->guard evaluates to True at run-time. */
6819 /* Look at the LoadG's built-in conversion operation, to determine
6820 the source (actual loaded data) type, and the equivalent IROp.
6821 NOTE that implicitly we are taking a widening operation to be
6822 applied to original atoms and producing one that applies to V
6823 bits. Since signed and unsigned widening are self-shadowing,
6824 this is a straight copy of the op (modulo swapping from the
6825 IRLoadGOp form to the IROp form). Note also therefore that this
6826 implicitly duplicates the logic to do with said widening ops in
6827 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
6828 IROp vwiden
= Iop_INVALID
;
6829 IRType loadedTy
= Ity_INVALID
;
6831 case ILGop_IdentV128
: loadedTy
= Ity_V128
; vwiden
= Iop_INVALID
; break;
6832 case ILGop_Ident64
: loadedTy
= Ity_I64
; vwiden
= Iop_INVALID
; break;
6833 case ILGop_Ident32
: loadedTy
= Ity_I32
; vwiden
= Iop_INVALID
; break;
6834 case ILGop_16Uto32
: loadedTy
= Ity_I16
; vwiden
= Iop_16Uto32
; break;
6835 case ILGop_16Sto32
: loadedTy
= Ity_I16
; vwiden
= Iop_16Sto32
; break;
6836 case ILGop_8Uto32
: loadedTy
= Ity_I8
; vwiden
= Iop_8Uto32
; break;
6837 case ILGop_8Sto32
: loadedTy
= Ity_I8
; vwiden
= Iop_8Sto32
; break;
6838 default: VG_(tool_panic
)("do_shadow_LoadG");
6842 = expr2vbits( mce
, lg
->alt
, HuOth
);
6844 = expr2vbits_Load_guarded_General(mce
, lg
->end
, loadedTy
,
6845 lg
->addr
, 0/*addr bias*/,
6846 lg
->guard
, vwiden
, vbits_alt
);
6847 /* And finally, bind the V bits to the destination temporary. */
6848 assign( 'V', mce
, findShadowTmpV(mce
, lg
->dst
), vbits_final
);
6852 /*------------------------------------------------------------*/
6853 /*--- Origin tracking stuff ---*/
6854 /*------------------------------------------------------------*/
6856 /* Almost identical to findShadowTmpV. */
6857 static IRTemp
findShadowTmpB ( MCEnv
* mce
, IRTemp orig
)
6860 /* VG_(indexXA) range-checks 'orig', hence no need to check
6862 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
6863 tl_assert(ent
->kind
== Orig
);
6864 if (ent
->shadowB
== IRTemp_INVALID
) {
6866 = newTemp( mce
, Ity_I32
, BSh
);
6867 /* newTemp may cause mce->tmpMap to resize, hence previous results
6868 from VG_(indexXA) are invalid. */
6869 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
6870 tl_assert(ent
->kind
== Orig
);
6871 tl_assert(ent
->shadowB
== IRTemp_INVALID
);
6872 ent
->shadowB
= tmpB
;
6874 return ent
->shadowB
;
6877 static IRAtom
* gen_maxU32 ( MCEnv
* mce
, IRAtom
* b1
, IRAtom
* b2
)
6879 return assignNew( 'B', mce
, Ity_I32
, binop(Iop_Max32U
, b1
, b2
) );
6883 /* Make a guarded origin load, with no special handling in the
6884 didn't-happen case. A GUARD of NULL is assumed to mean "always
6887 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6888 return the otag. The loaded size is SZB. If GUARD evaluates to
6889 False at run time then the returned otag is zero.
6891 static IRAtom
* gen_guarded_load_b ( MCEnv
* mce
, Int szB
,
6893 Int offset
, IRExpr
* guard
)
6899 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
6900 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
6901 IRAtom
* ea
= baseaddr
;
6903 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
6904 : mkU64( (Long
)(Int
)offset
);
6905 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
6907 bTmp
= newTemp(mce
, mce
->hWordTy
, BSh
);
6910 case 1: hFun
= (void*)&MC_(helperc_b_load1
);
6911 hName
= "MC_(helperc_b_load1)";
6913 case 2: hFun
= (void*)&MC_(helperc_b_load2
);
6914 hName
= "MC_(helperc_b_load2)";
6916 case 4: hFun
= (void*)&MC_(helperc_b_load4
);
6917 hName
= "MC_(helperc_b_load4)";
6919 case 8: hFun
= (void*)&MC_(helperc_b_load8
);
6920 hName
= "MC_(helperc_b_load8)";
6922 case 16: hFun
= (void*)&MC_(helperc_b_load16
);
6923 hName
= "MC_(helperc_b_load16)";
6925 case 32: hFun
= (void*)&MC_(helperc_b_load32
);
6926 hName
= "MC_(helperc_b_load32)";
6929 VG_(printf
)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB
);
6932 di
= unsafeIRDirty_1_N(
6933 bTmp
, 1/*regparms*/, hName
, VG_(fnptr_to_fnentry
)( hFun
),
6938 /* Ideally the didn't-happen return value here would be
6939 all-zeroes (unknown-origin), so it'd be harmless if it got
6940 used inadvertently. We slum it out with the IR-mandated
6941 default value (0b01 repeating, 0x55 etc) as that'll probably
6942 trump all legitimate otags via Max32, and it's pretty
6945 /* no need to mess with any annotations. This call accesses
6946 neither guest state nor guest memory. */
6947 stmt( 'B', mce
, IRStmt_Dirty(di
) );
6948 if (mce
->hWordTy
== Ity_I64
) {
6950 IRTemp bTmp32
= newTemp(mce
, Ity_I32
, BSh
);
6951 assign( 'B', mce
, bTmp32
, unop(Iop_64to32
, mkexpr(bTmp
)) );
6952 return mkexpr(bTmp32
);
6955 return mkexpr(bTmp
);
6960 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
6961 loaded size is SZB. The load is regarded as unconditional (always
6964 static IRAtom
* gen_load_b ( MCEnv
* mce
, Int szB
, IRAtom
* baseaddr
,
6967 return gen_guarded_load_b(mce
, szB
, baseaddr
, offset
, NULL
/*guard*/);
6971 /* The most general handler for guarded origin loads. A GUARD of NULL
6972 is assumed to mean "always True".
6974 Generate IR to do a shadow origin load from ADDR+BIAS and return
6975 the B bits. The loaded type is TY. If GUARD evaluates to False at
6976 run time then the returned B bits are simply BALT instead.
6979 IRAtom
* expr2ori_Load_guarded_General ( MCEnv
* mce
,
6981 IRAtom
* addr
, UInt bias
,
6982 IRAtom
* guard
, IRAtom
* balt
)
6984 /* If the guard evaluates to True, this will hold the loaded
6985 origin. If the guard evaluates to False, this will be zero,
6986 meaning "unknown origin", in which case we will have to replace
6987 it using an ITE below. */
6989 = assignNew('B', mce
, Ity_I32
,
6990 gen_guarded_load_b(mce
, sizeofIRType(ty
),
6991 addr
, bias
, guard
));
6992 /* These are the bits we will return if the load doesn't take
6996 /* Prepare the cond for the ITE. Convert a NULL cond into
6997 something that iropt knows how to fold out later. */
6999 = guard
== NULL
? mkU1(1) : guard
;
7000 /* And assemble the final result. */
7001 return assignNew('B', mce
, Ity_I32
, IRExpr_ITE(cond
, iftrue
, iffalse
));
7005 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7006 the store really happens; NULL means it unconditionally does. */
7007 static void gen_store_b ( MCEnv
* mce
, Int szB
,
7008 IRAtom
* baseaddr
, Int offset
, IRAtom
* dataB
,
7014 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
7015 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
7016 IRAtom
* ea
= baseaddr
;
7018 tl_assert(isOriginalAtom(mce
, guard
));
7019 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
7022 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
7023 : mkU64( (Long
)(Int
)offset
);
7024 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
7026 if (mce
->hWordTy
== Ity_I64
)
7027 dataB
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, dataB
));
7030 case 1: hFun
= (void*)&MC_(helperc_b_store1
);
7031 hName
= "MC_(helperc_b_store1)";
7033 case 2: hFun
= (void*)&MC_(helperc_b_store2
);
7034 hName
= "MC_(helperc_b_store2)";
7036 case 4: hFun
= (void*)&MC_(helperc_b_store4
);
7037 hName
= "MC_(helperc_b_store4)";
7039 case 8: hFun
= (void*)&MC_(helperc_b_store8
);
7040 hName
= "MC_(helperc_b_store8)";
7042 case 16: hFun
= (void*)&MC_(helperc_b_store16
);
7043 hName
= "MC_(helperc_b_store16)";
7045 case 32: hFun
= (void*)&MC_(helperc_b_store32
);
7046 hName
= "MC_(helperc_b_store32)";
7051 di
= unsafeIRDirty_0_N( 2/*regparms*/,
7052 hName
, VG_(fnptr_to_fnentry
)( hFun
),
7053 mkIRExprVec_2( ea
, dataB
)
7055 /* no need to mess with any annotations. This call accesses
7056 neither guest state nor guest memory. */
7057 if (guard
) di
->guard
= guard
;
7058 stmt( 'B', mce
, IRStmt_Dirty(di
) );
7061 static IRAtom
* narrowTo32 ( MCEnv
* mce
, IRAtom
* e
) {
7062 IRType eTy
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
7064 return assignNew( 'B', mce
, Ity_I32
, unop(Iop_64to32
, e
) );
7070 static IRAtom
* zWidenFrom32 ( MCEnv
* mce
, IRType dstTy
, IRAtom
* e
) {
7071 IRType eTy
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
7072 tl_assert(eTy
== Ity_I32
);
7073 if (dstTy
== Ity_I64
)
7074 return assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, e
) );
7079 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
)
7081 tl_assert(MC_(clo_mc_level
) == 3);
7086 IRRegArray
* descr_b
;
7087 IRAtom
*t1
, *t2
, *t3
, *t4
;
7088 IRRegArray
* descr
= e
->Iex
.GetI
.descr
;
7090 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7091 /* If this array is unshadowable for whatever reason, use the
7092 usual approximation. */
7093 if (equivIntTy
== Ity_INVALID
)
7095 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7096 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7097 descr_b
= mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7098 equivIntTy
, descr
->nElems
);
7099 /* Do a shadow indexed get of the same size, giving t1. Take
7100 the bottom 32 bits of it, giving t2. Compute into t3 the
7101 origin for the index (almost certainly zero, but there's
7102 no harm in being completely general here, since iropt will
7103 remove any useless code), and fold it in, giving a final
7105 t1
= assignNew( 'B', mce
, equivIntTy
,
7106 IRExpr_GetI( descr_b
, e
->Iex
.GetI
.ix
,
7107 e
->Iex
.GetI
.bias
));
7108 t2
= narrowTo32( mce
, t1
);
7109 t3
= schemeE( mce
, e
->Iex
.GetI
.ix
);
7110 t4
= gen_maxU32( mce
, t2
, t3
);
7116 IRExpr
** args
= e
->Iex
.CCall
.args
;
7117 IRAtom
* curr
= mkU32(0);
7118 for (i
= 0; args
[i
]; i
++) {
7120 tl_assert(isOriginalAtom(mce
, args
[i
]));
7121 /* Only take notice of this arg if the callee's
7122 mc-exclusion mask does not say it is to be excluded. */
7123 if (e
->Iex
.CCall
.cee
->mcx_mask
& (1<<i
)) {
7124 /* the arg is to be excluded from definedness checking.
7126 if (0) VG_(printf
)("excluding %s(%d)\n",
7127 e
->Iex
.CCall
.cee
->name
, i
);
7129 /* calculate the arg's definedness, and pessimistically
7131 here
= schemeE( mce
, args
[i
] );
7132 curr
= gen_maxU32( mce
, curr
, here
);
7139 dszB
= sizeofIRType(e
->Iex
.Load
.ty
);
7140 /* assert that the B value for the address is already
7141 available (somewhere) */
7142 tl_assert(isIRAtom(e
->Iex
.Load
.addr
));
7143 tl_assert(mce
->hWordTy
== Ity_I32
|| mce
->hWordTy
== Ity_I64
);
7144 return gen_load_b( mce
, dszB
, e
->Iex
.Load
.addr
, 0 );
7147 IRAtom
* b1
= schemeE( mce
, e
->Iex
.ITE
.cond
);
7148 IRAtom
* b3
= schemeE( mce
, e
->Iex
.ITE
.iftrue
);
7149 IRAtom
* b2
= schemeE( mce
, e
->Iex
.ITE
.iffalse
);
7150 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
));
7153 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Qop
.details
->arg1
);
7154 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Qop
.details
->arg2
);
7155 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Qop
.details
->arg3
);
7156 IRAtom
* b4
= schemeE( mce
, e
->Iex
.Qop
.details
->arg4
);
7157 return gen_maxU32( mce
, gen_maxU32( mce
, b1
, b2
),
7158 gen_maxU32( mce
, b3
, b4
) );
7161 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Triop
.details
->arg1
);
7162 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Triop
.details
->arg2
);
7163 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Triop
.details
->arg3
);
7164 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
) );
7167 switch (e
->Iex
.Binop
.op
) {
7168 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
7169 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
7170 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
7171 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
7172 /* Just say these all produce a defined result,
7173 regardless of their arguments. See
7174 COMMENT_ON_CasCmpEQ in this file. */
7177 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Binop
.arg1
);
7178 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Binop
.arg2
);
7179 return gen_maxU32( mce
, b1
, b2
);
7186 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Unop
.arg
);
7192 return mkexpr( findShadowTmpB( mce
, e
->Iex
.RdTmp
.tmp
));
7194 Int b_offset
= MC_(get_otrack_shadow_offset
)(
7196 sizeofIRType(e
->Iex
.Get
.ty
)
7198 tl_assert(b_offset
>= -1
7199 && b_offset
<= mce
->layout
->total_sizeB
-4);
7200 if (b_offset
>= 0) {
7201 /* FIXME: this isn't an atom! */
7202 return IRExpr_Get( b_offset
+ 2*mce
->layout
->total_sizeB
,
7208 VG_(printf
)("mc_translate.c: schemeE: unhandled: ");
7210 VG_(tool_panic
)("memcheck:schemeE");
7215 static void do_origins_Dirty ( MCEnv
* mce
, IRDirty
* d
)
7217 // This is a hacked version of do_shadow_Dirty
7218 Int i
, k
, n
, toDo
, gSz
, gOff
;
7219 IRAtom
*here
, *curr
;
7222 /* First check the guard. */
7223 curr
= schemeE( mce
, d
->guard
);
7225 /* Now round up all inputs and maxU32 over them. */
7227 /* Inputs: unmasked args
7228 Note: arguments are evaluated REGARDLESS of the guard expression */
7229 for (i
= 0; d
->args
[i
]; i
++) {
7230 IRAtom
* arg
= d
->args
[i
];
7231 if ( (d
->cee
->mcx_mask
& (1<<i
))
7232 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
7233 /* ignore this arg */
7235 here
= schemeE( mce
, arg
);
7236 curr
= gen_maxU32( mce
, curr
, here
);
7240 /* Inputs: guest state that we read. */
7241 for (i
= 0; i
< d
->nFxState
; i
++) {
7242 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7243 if (d
->fxState
[i
].fx
== Ifx_Write
)
7246 /* Enumerate the described state segments */
7247 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7248 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7249 gSz
= d
->fxState
[i
].size
;
7251 /* Ignore any sections marked as 'always defined'. */
7252 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
7254 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7259 /* This state element is read or modified. So we need to
7260 consider it. If larger than 4 bytes, deal with it in
7264 tl_assert(gSz
>= 0);
7265 if (gSz
== 0) break;
7266 n
= gSz
<= 4 ? gSz
: 4;
7267 /* update 'curr' with maxU32 of the state slice
7269 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7270 if (b_offset
!= -1) {
7271 /* Observe the guard expression. If it is false use 0, i.e.
7272 nothing is known about the origin */
7273 IRAtom
*cond
, *iffalse
, *iftrue
;
7275 cond
= assignNew( 'B', mce
, Ity_I1
, d
->guard
);
7277 iftrue
= assignNew( 'B', mce
, Ity_I32
,
7279 + 2*mce
->layout
->total_sizeB
,
7281 here
= assignNew( 'B', mce
, Ity_I32
,
7282 IRExpr_ITE(cond
, iftrue
, iffalse
));
7283 curr
= gen_maxU32( mce
, curr
, here
);
7291 /* Inputs: memory */
7293 if (d
->mFx
!= Ifx_None
) {
7294 /* Because we may do multiple shadow loads/stores from the same
7295 base address, it's best to do a single test of its
7296 definedness right now. Post-instrumentation optimisation
7297 should remove all but this test. */
7298 tl_assert(d
->mAddr
);
7299 here
= schemeE( mce
, d
->mAddr
);
7300 curr
= gen_maxU32( mce
, curr
, here
);
7303 /* Deal with memory inputs (reads or modifies) */
7304 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
7306 /* chew off 32-bit chunks. We don't care about the endianness
7307 since it's all going to be condensed down to a single bit,
7308 but nevertheless choose an endianness which is hopefully
7309 native to the platform. */
7311 here
= gen_guarded_load_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
,
7313 curr
= gen_maxU32( mce
, curr
, here
);
7316 /* handle possible 16-bit excess */
7318 here
= gen_guarded_load_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
,
7320 curr
= gen_maxU32( mce
, curr
, here
);
7323 /* chew off the remaining 8-bit chunk, if any */
7325 here
= gen_guarded_load_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
,
7327 curr
= gen_maxU32( mce
, curr
, here
);
7330 tl_assert(toDo
== 0);
7333 /* Whew! So curr is a 32-bit B-value which should give an origin
7334 of some use if any of the inputs to the helper are undefined.
7335 Now we need to re-distribute the results to all destinations. */
7337 /* Outputs: the destination temporary, if there is one. */
7338 if (d
->tmp
!= IRTemp_INVALID
) {
7339 dst
= findShadowTmpB(mce
, d
->tmp
);
7340 assign( 'V', mce
, dst
, curr
);
7343 /* Outputs: guest state that we write or modify. */
7344 for (i
= 0; i
< d
->nFxState
; i
++) {
7345 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7346 if (d
->fxState
[i
].fx
== Ifx_Read
)
7349 /* Enumerate the described state segments */
7350 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7351 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7352 gSz
= d
->fxState
[i
].size
;
7354 /* Ignore any sections marked as 'always defined'. */
7355 if (isAlwaysDefd(mce
, gOff
, gSz
))
7358 /* This state element is written or modified. So we need to
7359 consider it. If larger than 4 bytes, deal with it in
7363 tl_assert(gSz
>= 0);
7364 if (gSz
== 0) break;
7365 n
= gSz
<= 4 ? gSz
: 4;
7366 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7367 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7368 if (b_offset
!= -1) {
7370 /* If the guard expression evaluates to false we simply Put
7371 the value that is already stored in the guest state slot */
7372 IRAtom
*cond
, *iffalse
;
7374 cond
= assignNew('B', mce
, Ity_I1
,
7376 iffalse
= assignNew('B', mce
, Ity_I32
,
7377 IRExpr_Get(b_offset
+
7378 2*mce
->layout
->total_sizeB
,
7380 curr
= assignNew('V', mce
, Ity_I32
,
7381 IRExpr_ITE(cond
, curr
, iffalse
));
7383 stmt( 'B', mce
, IRStmt_Put(b_offset
7384 + 2*mce
->layout
->total_sizeB
,
7393 /* Outputs: memory that we write or modify. Same comments about
7394 endianness as above apply. */
7395 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
7397 /* chew off 32-bit chunks */
7399 gen_store_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
, curr
,
7403 /* handle possible 16-bit excess */
7405 gen_store_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
, curr
,
7409 /* chew off the remaining 8-bit chunk, if any */
7411 gen_store_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
, curr
,
7415 tl_assert(toDo
== 0);
7420 /* Generate IR for origin shadowing for a general guarded store. */
7421 static void do_origins_Store_guarded ( MCEnv
* mce
,
7429 /* assert that the B value for the address is already available
7430 (somewhere), since the call to schemeE will want to see it.
7431 XXXX how does this actually ensure that?? */
7432 tl_assert(isIRAtom(stAddr
));
7433 tl_assert(isIRAtom(stData
));
7434 dszB
= sizeofIRType( typeOfIRExpr(mce
->sb
->tyenv
, stData
) );
7435 dataB
= schemeE( mce
, stData
);
7436 gen_store_b( mce
, dszB
, stAddr
, 0/*offset*/, dataB
, guard
);
7440 /* Generate IR for origin shadowing for a plain store. */
7441 static void do_origins_Store_plain ( MCEnv
* mce
,
7446 do_origins_Store_guarded ( mce
, stEnd
, stAddr
, stData
,
7451 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7453 static void do_origins_StoreG ( MCEnv
* mce
, IRStoreG
* sg
)
7455 do_origins_Store_guarded( mce
, sg
->end
, sg
->addr
,
7456 sg
->data
, sg
->guard
);
7459 static void do_origins_LoadG ( MCEnv
* mce
, IRLoadG
* lg
)
7461 IRType loadedTy
= Ity_INVALID
;
7463 case ILGop_IdentV128
: loadedTy
= Ity_V128
; break;
7464 case ILGop_Ident64
: loadedTy
= Ity_I64
; break;
7465 case ILGop_Ident32
: loadedTy
= Ity_I32
; break;
7466 case ILGop_16Uto32
: loadedTy
= Ity_I16
; break;
7467 case ILGop_16Sto32
: loadedTy
= Ity_I16
; break;
7468 case ILGop_8Uto32
: loadedTy
= Ity_I8
; break;
7469 case ILGop_8Sto32
: loadedTy
= Ity_I8
; break;
7470 default: VG_(tool_panic
)("schemeS.IRLoadG");
7473 = schemeE( mce
,lg
->alt
);
7475 = expr2ori_Load_guarded_General(mce
, loadedTy
,
7476 lg
->addr
, 0/*addr bias*/,
7477 lg
->guard
, ori_alt
);
7478 /* And finally, bind the origin to the destination temporary. */
7479 assign( 'B', mce
, findShadowTmpB(mce
, lg
->dst
), ori_final
);
7483 static void schemeS ( MCEnv
* mce
, IRStmt
* st
)
7485 tl_assert(MC_(clo_mc_level
) == 3);
7490 /* The value-check instrumenter handles this - by arranging
7491 to pass the address of the next instruction to
7492 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7493 happen for origin tracking w.r.t. AbiHints. So there is
7494 nothing to do here. */
7498 IRPutI
*puti
= st
->Ist
.PutI
.details
;
7499 IRRegArray
* descr_b
;
7500 IRAtom
*t1
, *t2
, *t3
, *t4
;
7501 IRRegArray
* descr
= puti
->descr
;
7503 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7504 /* If this array is unshadowable for whatever reason,
7505 generate no code. */
7506 if (equivIntTy
== Ity_INVALID
)
7508 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7509 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7511 = mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7512 equivIntTy
, descr
->nElems
);
7513 /* Compute a value to Put - the conjoinment of the origin for
7514 the data to be Put-ted (obviously) and of the index value
7515 (not so obviously). */
7516 t1
= schemeE( mce
, puti
->data
);
7517 t2
= schemeE( mce
, puti
->ix
);
7518 t3
= gen_maxU32( mce
, t1
, t2
);
7519 t4
= zWidenFrom32( mce
, equivIntTy
, t3
);
7520 stmt( 'B', mce
, IRStmt_PutI( mkIRPutI(descr_b
, puti
->ix
,
7526 do_origins_Dirty( mce
, st
->Ist
.Dirty
.details
);
7530 do_origins_Store_plain( mce
, st
->Ist
.Store
.end
,
7532 st
->Ist
.Store
.data
);
7536 do_origins_StoreG( mce
, st
->Ist
.StoreG
.details
);
7540 do_origins_LoadG( mce
, st
->Ist
.LoadG
.details
);
7544 /* In short: treat a load-linked like a normal load followed
7545 by an assignment of the loaded (shadow) data the result
7546 temporary. Treat a store-conditional like a normal store,
7547 and mark the result temporary as defined. */
7548 if (st
->Ist
.LLSC
.storedata
== NULL
) {
7551 = typeOfIRTemp(mce
->sb
->tyenv
, st
->Ist
.LLSC
.result
);
7553 = IRExpr_Load(st
->Ist
.LLSC
.end
, resTy
, st
->Ist
.LLSC
.addr
);
7554 tl_assert(resTy
== Ity_I64
|| resTy
== Ity_I32
7555 || resTy
== Ity_I16
|| resTy
== Ity_I8
);
7556 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7557 schemeE(mce
, vanillaLoad
));
7559 /* Store conditional */
7560 do_origins_Store_plain( mce
, st
->Ist
.LLSC
.end
,
7562 st
->Ist
.LLSC
.storedata
);
7563 /* For the rationale behind this, see comments at the
7564 place where the V-shadow for .result is constructed, in
7565 do_shadow_LLSC. In short, we regard .result as
7567 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7575 = MC_(get_otrack_shadow_offset
)(
7577 sizeofIRType(typeOfIRExpr(mce
->sb
->tyenv
, st
->Ist
.Put
.data
))
7579 if (b_offset
>= 0) {
7580 /* FIXME: this isn't an atom! */
7581 stmt( 'B', mce
, IRStmt_Put(b_offset
+ 2*mce
->layout
->total_sizeB
,
7582 schemeE( mce
, st
->Ist
.Put
.data
)) );
7588 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.WrTmp
.tmp
),
7589 schemeE(mce
, st
->Ist
.WrTmp
.data
) );
7599 VG_(printf
)("mc_translate.c: schemeS: unhandled: ");
7601 VG_(tool_panic
)("memcheck:schemeS");
7606 /*------------------------------------------------------------*/
7607 /*--- Post-tree-build final tidying ---*/
7608 /*------------------------------------------------------------*/
7610 /* This exploits the observation that Memcheck often produces
7611 repeated conditional calls of the form
7613 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7615 with the same guard expression G guarding the same helper call.
7616 The second and subsequent calls are redundant. This usually
7617 results from instrumentation of guest code containing multiple
7618 memory references at different constant offsets from the same base
7619 register. After optimisation of the instrumentation, you get a
7620 test for the definedness of the base register for each memory
7621 reference, which is kinda pointless. MC_(final_tidy) therefore
7622 looks for such repeated calls and removes all but the first. */
7625 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7626 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7627 get almost all the benefits of this transformation whilst causing
7628 the slide-back case to happen just often enough to be verifiably
7629 correct. For posterity, the numbers are:
7633 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7634 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7635 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7636 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7637 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7638 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7639 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7640 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7641 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7642 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7643 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7644 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7645 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7649 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7650 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7651 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7652 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7653 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7654 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7655 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7656 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7657 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7658 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7659 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7660 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7661 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7664 /* Structs for recording which (helper, guard) pairs we have already
7667 #define N_TIDYING_PAIRS 16
7670 struct { void* entry
; IRExpr
* guard
; }
7675 Pair pairs
[N_TIDYING_PAIRS
+1/*for bounds checking*/];
7681 /* Return True if e1 and e2 definitely denote the same value (used to
7682 compare guards). Return False if unknown; False is the safe
7683 answer. Since guest registers and guest memory do not have the
7684 SSA property we must return False if any Gets or Loads appear in
7685 the expression. This implicitly assumes that e1 and e2 have the
7686 same IR type, which is always true here -- the type is Ity_I1. */
7688 static Bool
sameIRValue ( IRExpr
* e1
, IRExpr
* e2
)
7690 if (e1
->tag
!= e2
->tag
)
7694 return eqIRConst( e1
->Iex
.Const
.con
, e2
->Iex
.Const
.con
);
7696 return e1
->Iex
.Binop
.op
== e2
->Iex
.Binop
.op
7697 && sameIRValue(e1
->Iex
.Binop
.arg1
, e2
->Iex
.Binop
.arg1
)
7698 && sameIRValue(e1
->Iex
.Binop
.arg2
, e2
->Iex
.Binop
.arg2
);
7700 return e1
->Iex
.Unop
.op
== e2
->Iex
.Unop
.op
7701 && sameIRValue(e1
->Iex
.Unop
.arg
, e2
->Iex
.Unop
.arg
);
7703 return e1
->Iex
.RdTmp
.tmp
== e2
->Iex
.RdTmp
.tmp
;
7705 return sameIRValue( e1
->Iex
.ITE
.cond
, e2
->Iex
.ITE
.cond
)
7706 && sameIRValue( e1
->Iex
.ITE
.iftrue
, e2
->Iex
.ITE
.iftrue
)
7707 && sameIRValue( e1
->Iex
.ITE
.iffalse
, e2
->Iex
.ITE
.iffalse
);
7711 /* be lazy. Could define equality for these, but they never
7712 appear to be used. */
7717 /* be conservative - these may not give the same value each
7721 /* should never see this */
7724 VG_(printf
)("mc_translate.c: sameIRValue: unhandled: ");
7726 VG_(tool_panic
)("memcheck:sameIRValue");
7731 /* See if 'pairs' already has an entry for (entry, guard). Return
7732 True if so. If not, add an entry. */
7735 Bool
check_or_add ( Pairs
* tidyingEnv
, IRExpr
* guard
, void* entry
)
7737 UInt i
, n
= tidyingEnv
->pairsUsed
;
7738 tl_assert(n
<= N_TIDYING_PAIRS
);
7739 for (i
= 0; i
< n
; i
++) {
7740 if (tidyingEnv
->pairs
[i
].entry
== entry
7741 && sameIRValue(tidyingEnv
->pairs
[i
].guard
, guard
))
7744 /* (guard, entry) wasn't found in the array. Add it at the end.
7745 If the array is already full, slide the entries one slot
7746 backwards. This means we will lose to ability to detect
7747 duplicates from the pair in slot zero, but that happens so
7748 rarely that it's unlikely to have much effect on overall code
7749 quality. Also, this strategy loses the check for the oldest
7750 tracked exit (memory reference, basically) and so that is (I'd
7751 guess) least likely to be re-used after this point. */
7753 if (n
== N_TIDYING_PAIRS
) {
7754 for (i
= 1; i
< N_TIDYING_PAIRS
; i
++) {
7755 tidyingEnv
->pairs
[i
-1] = tidyingEnv
->pairs
[i
];
7757 tidyingEnv
->pairs
[N_TIDYING_PAIRS
-1].entry
= entry
;
7758 tidyingEnv
->pairs
[N_TIDYING_PAIRS
-1].guard
= guard
;
7760 tl_assert(n
< N_TIDYING_PAIRS
);
7761 tidyingEnv
->pairs
[n
].entry
= entry
;
7762 tidyingEnv
->pairs
[n
].guard
= guard
;
7764 tidyingEnv
->pairsUsed
= n
;
7769 static Bool
is_helperc_value_checkN_fail ( const HChar
* name
)
7771 /* This is expensive because it happens a lot. We are checking to
7772 see whether |name| is one of the following 8 strings:
7774 MC_(helperc_value_check8_fail_no_o)
7775 MC_(helperc_value_check4_fail_no_o)
7776 MC_(helperc_value_check0_fail_no_o)
7777 MC_(helperc_value_check1_fail_no_o)
7778 MC_(helperc_value_check8_fail_w_o)
7779 MC_(helperc_value_check0_fail_w_o)
7780 MC_(helperc_value_check1_fail_w_o)
7781 MC_(helperc_value_check4_fail_w_o)
7783 To speed it up, check the common prefix just once, rather than
7786 const HChar
* prefix
= "MC_(helperc_value_check";
7792 if (p
== 0) break; /* ran off the end of the prefix */
7793 /* We still have some prefix to use */
7794 if (n
== 0) return False
; /* have prefix, but name ran out */
7795 if (n
!= p
) return False
; /* have both pfx and name, but no match */
7800 /* Check the part after the prefix. */
7801 tl_assert(*prefix
== 0 && *name
!= 0);
7802 return 0==VG_(strcmp
)(name
, "8_fail_no_o)")
7803 || 0==VG_(strcmp
)(name
, "4_fail_no_o)")
7804 || 0==VG_(strcmp
)(name
, "0_fail_no_o)")
7805 || 0==VG_(strcmp
)(name
, "1_fail_no_o)")
7806 || 0==VG_(strcmp
)(name
, "8_fail_w_o)")
7807 || 0==VG_(strcmp
)(name
, "4_fail_w_o)")
7808 || 0==VG_(strcmp
)(name
, "0_fail_w_o)")
7809 || 0==VG_(strcmp
)(name
, "1_fail_w_o)");
7812 IRSB
* MC_(final_tidy
) ( IRSB
* sb_in
)
7819 Bool alreadyPresent
;
7822 pairs
.pairsUsed
= 0;
7824 pairs
.pairs
[N_TIDYING_PAIRS
].entry
= (void*)0x123;
7825 pairs
.pairs
[N_TIDYING_PAIRS
].guard
= (IRExpr
*)0x456;
7827 /* Scan forwards through the statements. Each time a call to one
7828 of the relevant helpers is seen, check if we have made a
7829 previous call to the same helper using the same guard
7830 expression, and if so, delete the call. */
7831 for (i
= 0; i
< sb_in
->stmts_used
; i
++) {
7832 st
= sb_in
->stmts
[i
];
7834 if (st
->tag
!= Ist_Dirty
)
7836 di
= st
->Ist
.Dirty
.details
;
7839 if (0) { ppIRExpr(guard
); VG_(printf
)("\n"); }
7841 if (!is_helperc_value_checkN_fail( cee
->name
))
7843 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
7844 guard 'guard'. Check if we have already seen a call to this
7845 function with the same guard. If so, delete it. If not,
7846 add it to the set of calls we do know about. */
7847 alreadyPresent
= check_or_add( &pairs
, guard
, cee
->addr
);
7848 if (alreadyPresent
) {
7849 sb_in
->stmts
[i
] = IRStmt_NoOp();
7850 if (0) VG_(printf
)("XX\n");
7854 tl_assert(pairs
.pairs
[N_TIDYING_PAIRS
].entry
== (void*)0x123);
7855 tl_assert(pairs
.pairs
[N_TIDYING_PAIRS
].guard
== (IRExpr
*)0x456);
7860 #undef N_TIDYING_PAIRS
7863 /*------------------------------------------------------------*/
7864 /*--- Startup assertion checking ---*/
7865 /*------------------------------------------------------------*/
7867 void MC_(do_instrumentation_startup_checks
)( void )
7869 /* Make a best-effort check to see that is_helperc_value_checkN_fail
7870 is working as we expect. */
7872 # define CHECK(_expected, _string) \
7873 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
7875 /* It should identify these 8, and no others, as targets. */
7876 CHECK(True
, "MC_(helperc_value_check8_fail_no_o)");
7877 CHECK(True
, "MC_(helperc_value_check4_fail_no_o)");
7878 CHECK(True
, "MC_(helperc_value_check0_fail_no_o)");
7879 CHECK(True
, "MC_(helperc_value_check1_fail_no_o)");
7880 CHECK(True
, "MC_(helperc_value_check8_fail_w_o)");
7881 CHECK(True
, "MC_(helperc_value_check0_fail_w_o)");
7882 CHECK(True
, "MC_(helperc_value_check1_fail_w_o)");
7883 CHECK(True
, "MC_(helperc_value_check4_fail_w_o)");
7885 /* Ad-hoc selection of other strings gathered via a quick test. */
7886 CHECK(False
, "amd64g_dirtyhelper_CPUID_avx2");
7887 CHECK(False
, "amd64g_dirtyhelper_RDTSC");
7888 CHECK(False
, "MC_(helperc_b_load1)");
7889 CHECK(False
, "MC_(helperc_b_load2)");
7890 CHECK(False
, "MC_(helperc_b_load4)");
7891 CHECK(False
, "MC_(helperc_b_load8)");
7892 CHECK(False
, "MC_(helperc_b_load16)");
7893 CHECK(False
, "MC_(helperc_b_load32)");
7894 CHECK(False
, "MC_(helperc_b_store1)");
7895 CHECK(False
, "MC_(helperc_b_store2)");
7896 CHECK(False
, "MC_(helperc_b_store4)");
7897 CHECK(False
, "MC_(helperc_b_store8)");
7898 CHECK(False
, "MC_(helperc_b_store16)");
7899 CHECK(False
, "MC_(helperc_b_store32)");
7900 CHECK(False
, "MC_(helperc_LOADV8)");
7901 CHECK(False
, "MC_(helperc_LOADV16le)");
7902 CHECK(False
, "MC_(helperc_LOADV32le)");
7903 CHECK(False
, "MC_(helperc_LOADV64le)");
7904 CHECK(False
, "MC_(helperc_LOADV128le)");
7905 CHECK(False
, "MC_(helperc_LOADV256le)");
7906 CHECK(False
, "MC_(helperc_STOREV16le)");
7907 CHECK(False
, "MC_(helperc_STOREV32le)");
7908 CHECK(False
, "MC_(helperc_STOREV64le)");
7909 CHECK(False
, "MC_(helperc_STOREV8)");
7910 CHECK(False
, "track_die_mem_stack_8");
7911 CHECK(False
, "track_new_mem_stack_8_w_ECU");
7912 CHECK(False
, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
7913 CHECK(False
, "VG_(unknown_SP_update_w_ECU)");
7919 /*------------------------------------------------------------*/
7920 /*--- Memcheck main ---*/
7921 /*------------------------------------------------------------*/
7923 static Bool
isBogusAtom ( IRAtom
* at
)
7925 if (at
->tag
== Iex_RdTmp
)
7927 tl_assert(at
->tag
== Iex_Const
);
7930 IRConst
* con
= at
->Iex
.Const
.con
;
7932 case Ico_U1
: return False
;
7933 case Ico_U8
: n
= (ULong
)con
->Ico
.U8
; break;
7934 case Ico_U16
: n
= (ULong
)con
->Ico
.U16
; break;
7935 case Ico_U32
: n
= (ULong
)con
->Ico
.U32
; break;
7936 case Ico_U64
: n
= (ULong
)con
->Ico
.U64
; break;
7937 case Ico_F32
: return False
;
7938 case Ico_F64
: return False
;
7939 case Ico_F32i
: return False
;
7940 case Ico_F64i
: return False
;
7941 case Ico_V128
: return False
;
7942 case Ico_V256
: return False
;
7943 default: ppIRExpr(at
); tl_assert(0);
7945 /* VG_(printf)("%llx\n", n); */
7947 if (LIKELY(n
<= 0x0000000000001000ULL
)) return False
;
7948 if (LIKELY(n
>= 0xFFFFFFFFFFFFF000ULL
)) return False
;
7949 /* The list of bogus atoms is: */
7950 return (/*32*/ n
== 0xFEFEFEFFULL
7951 /*32*/ || n
== 0x80808080ULL
7952 /*32*/ || n
== 0x7F7F7F7FULL
7953 /*32*/ || n
== 0x7EFEFEFFULL
7954 /*32*/ || n
== 0x81010100ULL
7955 /*64*/ || n
== 0xFFFFFFFFFEFEFEFFULL
7956 /*64*/ || n
== 0xFEFEFEFEFEFEFEFFULL
7957 /*64*/ || n
== 0x0000000000008080ULL
7958 /*64*/ || n
== 0x8080808080808080ULL
7959 /*64*/ || n
== 0x0101010101010101ULL
7964 /* Does 'st' mention any of the literals identified/listed in
7966 static inline Bool
containsBogusLiterals ( /*FLAT*/ IRStmt
* st
)
7974 e
= st
->Ist
.WrTmp
.data
;
7980 return isBogusAtom(e
);
7982 return isBogusAtom(e
->Iex
.Unop
.arg
)
7983 || e
->Iex
.Unop
.op
== Iop_GetMSBs8x16
;
7985 return isBogusAtom(e
->Iex
.GetI
.ix
);
7987 return isBogusAtom(e
->Iex
.Binop
.arg1
)
7988 || isBogusAtom(e
->Iex
.Binop
.arg2
);
7990 return isBogusAtom(e
->Iex
.Triop
.details
->arg1
)
7991 || isBogusAtom(e
->Iex
.Triop
.details
->arg2
)
7992 || isBogusAtom(e
->Iex
.Triop
.details
->arg3
);
7994 return isBogusAtom(e
->Iex
.Qop
.details
->arg1
)
7995 || isBogusAtom(e
->Iex
.Qop
.details
->arg2
)
7996 || isBogusAtom(e
->Iex
.Qop
.details
->arg3
)
7997 || isBogusAtom(e
->Iex
.Qop
.details
->arg4
);
7999 return isBogusAtom(e
->Iex
.ITE
.cond
)
8000 || isBogusAtom(e
->Iex
.ITE
.iftrue
)
8001 || isBogusAtom(e
->Iex
.ITE
.iffalse
);
8003 return isBogusAtom(e
->Iex
.Load
.addr
);
8005 for (i
= 0; e
->Iex
.CCall
.args
[i
]; i
++)
8006 if (isBogusAtom(e
->Iex
.CCall
.args
[i
]))
8013 d
= st
->Ist
.Dirty
.details
;
8014 for (i
= 0; d
->args
[i
]; i
++) {
8015 IRAtom
* atom
= d
->args
[i
];
8016 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom
))) {
8017 if (isBogusAtom(atom
))
8021 if (isBogusAtom(d
->guard
))
8023 if (d
->mAddr
&& isBogusAtom(d
->mAddr
))
8027 return isBogusAtom(st
->Ist
.Put
.data
);
8029 return isBogusAtom(st
->Ist
.PutI
.details
->ix
)
8030 || isBogusAtom(st
->Ist
.PutI
.details
->data
);
8032 return isBogusAtom(st
->Ist
.Store
.addr
)
8033 || isBogusAtom(st
->Ist
.Store
.data
);
8035 IRStoreG
* sg
= st
->Ist
.StoreG
.details
;
8036 return isBogusAtom(sg
->addr
) || isBogusAtom(sg
->data
)
8037 || isBogusAtom(sg
->guard
);
8040 IRLoadG
* lg
= st
->Ist
.LoadG
.details
;
8041 return isBogusAtom(lg
->addr
) || isBogusAtom(lg
->alt
)
8042 || isBogusAtom(lg
->guard
);
8045 return isBogusAtom(st
->Ist
.Exit
.guard
);
8047 return isBogusAtom(st
->Ist
.AbiHint
.base
)
8048 || isBogusAtom(st
->Ist
.AbiHint
.nia
);
8054 cas
= st
->Ist
.CAS
.details
;
8055 return isBogusAtom(cas
->addr
)
8056 || (cas
->expdHi
? isBogusAtom(cas
->expdHi
) : False
)
8057 || isBogusAtom(cas
->expdLo
)
8058 || (cas
->dataHi
? isBogusAtom(cas
->dataHi
) : False
)
8059 || isBogusAtom(cas
->dataLo
);
8061 return isBogusAtom(st
->Ist
.LLSC
.addr
)
8062 || (st
->Ist
.LLSC
.storedata
8063 ? isBogusAtom(st
->Ist
.LLSC
.storedata
)
8068 VG_(tool_panic
)("hasBogusLiterals");
8073 /* This is the pre-instrumentation analysis. It does a backwards pass over
8074 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8077 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8078 as a positive result from that is a strong indication that we need to
8079 expensively instrument add/sub in the block. We do both analyses in one
8080 pass, even though they are independent, so as to avoid the overhead of
8081 having to traverse the whole block twice.
8083 The usage pass proceeds as follows. Let max= be the max operation in the
8084 HowUsed lattice, hence
8086 X max= Y means X = max(X, Y)
8090 for t in original tmps . useEnv[t] = HuUnU
8092 for t used in the block's . next field
8093 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8095 for st iterating *backwards* in the block
8099 case "t1 = load(t2)" // case 1
8100 useEnv[t2] max= HuPCa
8102 case "t1 = add(t2, t3)" // case 2
8103 useEnv[t2] max= useEnv[t1]
8104 useEnv[t3] max= useEnv[t1]
8107 for t in st.usedTmps // case 3
8108 useEnv[t] max= HuOth
8109 // same as useEnv[t] = HuOth
8111 The general idea is that we accumulate, in useEnv[], information about
8112 how each tmp is used. That can be updated as we work further back
8113 through the block and find more uses of it, but its HowUsed value can
8114 only ascend the lattice, not descend.
8116 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8117 be used as a memory address, then its use is at least HuPCa. The point
8118 is that for a memory address we will add instrumentation to check if any
8119 bit of the address is undefined, which means that we won't need expensive
8120 V-bit propagation through an add expression that computed the address --
8121 cheap add instrumentation will be equivalent.
8123 Note in case (1) that if we have previously seen a non-memory-address use
8124 of the tmp, then its use will already be HuOth and will be unchanged by
8125 the max= operation. And if it turns out that the source of the tmp was
8126 an add, then we'll have to expensively instrument the add, because we
8127 can't prove that, for the previous non-memory-address use of the tmp,
8128 cheap and expensive instrumentation will be equivalent.
8130 In case 2, we propagate the usage-mode of the result of an add back
8131 through to its operands. Again, we use max= so as to take account of the
8132 fact that t2 or t3 might later in the block (viz, earlier in the
8133 iteration) have been used in a way that requires expensive add
8136 In case 3, we deal with all other tmp uses. We assume that we'll need a
8137 result that is as accurate as possible, so we max= HuOth into its use
8138 mode. Since HuOth is the top of the lattice, that's equivalent to just
8139 setting its use to HuOth.
8141 The net result of all this is that:
8143 tmps that are used either
8144 - only as a memory address, or
8145 - only as part of a tree of adds that computes a memory address,
8146 and has no other use
8147 are marked as HuPCa, and so we can instrument their generating Add
8148 nodes cheaply, which is the whole point of this analysis
8150 tmps that are used any other way at all are marked as HuOth
8152 tmps that are unused are marked as HuUnU. We don't expect to see any
8153 since we expect that the incoming IR has had all dead assignments
8154 removed by previous optimisation passes. Nevertheless the analysis is
8155 correct even in the presence of dead tmps.
8157 A final comment on dead tmps. In case 1 and case 2, we could actually
8158 conditionalise the updates thusly:
8160 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8162 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8163 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8165 In other words, if the assigned-to tmp |t1| is never used, then there's
8166 no point in propagating any use through to its operands. That won't
8167 change the final HuPCa-vs-HuOth results, which is what we care about.
8168 Given that we expect to get dead-code-free inputs, there's no point in
8169 adding this extra refinement.
8172 /* Helper for |preInstrumentationAnalysis|. */
8173 static inline void noteTmpUsesIn ( /*MOD*/HowUsed
* useEnv
,
8175 HowUsed newUse
, IRAtom
* at
)
8177 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8178 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8186 IRTemp t
= at
->Iex
.RdTmp
.tmp
;
8187 tl_assert(t
< tyenvUsed
); // "is an original tmp"
8188 // The "max" operation in the lattice
8189 if (newUse
> useEnv
[t
]) useEnv
[t
] = newUse
;
8193 // We should never get here -- it implies non-flat IR
8195 VG_(tool_panic
)("noteTmpUsesIn");
8202 static void preInstrumentationAnalysis ( /*OUT*/HowUsed
** useEnvP
,
8203 /*OUT*/Bool
* hasBogusLiteralsP
,
8206 const UInt nOrigTmps
= (UInt
)sb_in
->tyenv
->types_used
;
8208 // We've seen no bogus literals so far.
8211 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8212 HowUsed
* useEnv
= VG_(calloc
)("mc.preInstrumentationAnalysis.1",
8213 nOrigTmps
, sizeof(HowUsed
));
8215 // Firstly, roll in contributions from the final dst address.
8216 bogus
= isBogusAtom(sb_in
->next
);
8217 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, sb_in
->next
);
8219 // Now work backwards through the stmts.
8220 for (Int i
= sb_in
->stmts_used
-1; i
>= 0; i
--) {
8221 IRStmt
* st
= sb_in
->stmts
[i
];
8223 // Deal with literals.
8224 if (LIKELY(!bogus
)) {
8225 bogus
= containsBogusLiterals(st
);
8228 // Deal with tmp uses.
8231 IRTemp dst
= st
->Ist
.WrTmp
.tmp
;
8232 IRExpr
* rhs
= st
->Ist
.WrTmp
.data
;
8233 // This is the one place where we have to consider all possible
8234 // tags for |rhs|, and can't just assume it is a tmp or a const.
8237 // just propagate demand for |dst| into this tmp use.
8238 noteTmpUsesIn(useEnv
, nOrigTmps
, useEnv
[dst
], rhs
);
8241 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.Unop
.arg
);
8244 if (rhs
->Iex
.Binop
.op
== Iop_Add64
8245 || rhs
->Iex
.Binop
.op
== Iop_Add32
) {
8246 // propagate demand for |dst| through to the operands.
8247 noteTmpUsesIn(useEnv
, nOrigTmps
,
8248 useEnv
[dst
], rhs
->Iex
.Binop
.arg1
);
8249 noteTmpUsesIn(useEnv
, nOrigTmps
,
8250 useEnv
[dst
], rhs
->Iex
.Binop
.arg2
);
8252 // just say that the operands are used in some unknown way.
8253 noteTmpUsesIn(useEnv
, nOrigTmps
,
8254 HuOth
, rhs
->Iex
.Binop
.arg1
);
8255 noteTmpUsesIn(useEnv
, nOrigTmps
,
8256 HuOth
, rhs
->Iex
.Binop
.arg2
);
8260 // All operands are used in some unknown way.
8261 IRTriop
* tri
= rhs
->Iex
.Triop
.details
;
8262 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg1
);
8263 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg2
);
8264 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg3
);
8268 // All operands are used in some unknown way.
8269 IRQop
* qop
= rhs
->Iex
.Qop
.details
;
8270 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg1
);
8271 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg2
);
8272 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg3
);
8273 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg4
);
8277 // The address will be checked (== PCasted).
8278 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.Load
.addr
);
8281 // The condition is PCasted, the then- and else-values
8283 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.ITE
.cond
);
8284 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.ITE
.iftrue
);
8285 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.ITE
.iffalse
);
8288 // The args are used in unknown ways.
8289 for (IRExpr
** args
= rhs
->Iex
.CCall
.args
; *args
; args
++) {
8290 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, *args
);
8294 // The index will be checked/PCasted (see do_shadow_GETI)
8295 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.GetI
.ix
);
8303 VG_(tool_panic
)("preInstrumentationAnalysis:"
8304 " unhandled IRExpr");
8309 // The address will be checked (== PCasted). The data will be
8310 // used in some unknown way.
8311 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.Store
.addr
);
8312 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.Store
.data
);
8315 // The guard will be checked (== PCasted)
8316 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.Exit
.guard
);
8319 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.Put
.data
);
8322 IRPutI
* putI
= st
->Ist
.PutI
.details
;
8323 // The index will be checked/PCasted (see do_shadow_PUTI). The
8324 // data will be used in an unknown way.
8325 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, putI
->ix
);
8326 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, putI
->data
);
8330 IRDirty
* d
= st
->Ist
.Dirty
.details
;
8331 // The guard will be checked (== PCasted)
8332 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, d
->guard
);
8333 // The args will be used in unknown ways.
8334 for (IRExpr
** args
= d
->args
; *args
; args
++) {
8335 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, *args
);
8340 IRCAS
* cas
= st
->Ist
.CAS
.details
;
8341 // Address will be pcasted, everything else used as unknown
8342 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, cas
->addr
);
8343 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->expdLo
);
8344 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->dataLo
);
8346 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->expdHi
);
8348 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->dataHi
);
8352 // Both exprs are used in unknown ways. TODO: can we safely
8353 // just ignore AbiHints?
8354 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.AbiHint
.base
);
8355 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.AbiHint
.nia
);
8358 // We might be able to do better, and use HuPCa for the addr.
8359 // It's not immediately obvious that we can, because the address
8360 // is regarded as "used" only when the guard is true.
8361 IRStoreG
* sg
= st
->Ist
.StoreG
.details
;
8362 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->addr
);
8363 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->data
);
8364 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->guard
);
8368 // Per similar comments to Ist_StoreG .. not sure whether this
8369 // is really optimal.
8370 IRLoadG
* lg
= st
->Ist
.LoadG
.details
;
8371 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->addr
);
8372 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->alt
);
8373 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->guard
);
8377 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.LLSC
.addr
);
8378 if (st
->Ist
.LLSC
.storedata
)
8379 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.LLSC
.storedata
);
8388 VG_(tool_panic
)("preInstrumentationAnalysis: unhandled IRStmt");
8391 } // Now work backwards through the stmts.
8393 // Return the computed use env and the bogus-atom flag.
8394 tl_assert(*useEnvP
== NULL
);
8397 tl_assert(*hasBogusLiteralsP
== False
);
8398 *hasBogusLiteralsP
= bogus
;
8402 IRSB
* MC_(instrument
) ( VgCallbackClosure
* closure
,
8404 const VexGuestLayout
* layout
,
8405 const VexGuestExtents
* vge
,
8406 const VexArchInfo
* archinfo_host
,
8407 IRType gWordTy
, IRType hWordTy
)
8409 Bool verboze
= 0||False
;
8410 Int i
, j
, first_stmt
;
8415 if (gWordTy
!= hWordTy
) {
8416 /* We don't currently support this case. */
8417 VG_(tool_panic
)("host/guest word size mismatch");
8420 /* Check we're not completely nuts */
8421 tl_assert(sizeof(UWord
) == sizeof(void*));
8422 tl_assert(sizeof(Word
) == sizeof(void*));
8423 tl_assert(sizeof(Addr
) == sizeof(void*));
8424 tl_assert(sizeof(ULong
) == 8);
8425 tl_assert(sizeof(Long
) == 8);
8426 tl_assert(sizeof(UInt
) == 4);
8427 tl_assert(sizeof(Int
) == 4);
8429 tl_assert(MC_(clo_mc_level
) >= 1 && MC_(clo_mc_level
) <= 3);
8432 sb_out
= deepCopyIRSBExceptStmts(sb_in
);
8434 /* Set up the running environment. Both .sb and .tmpMap are
8435 modified as we go along. Note that tmps are added to both
8436 .sb->tyenv and .tmpMap together, so the valid index-set for
8437 those two arrays should always be identical. */
8438 VG_(memset
)(&mce
, 0, sizeof(mce
));
8440 mce
.trace
= verboze
;
8441 mce
.layout
= layout
;
8442 mce
.hWordTy
= hWordTy
;
8443 mce
.tmpHowUsed
= NULL
;
8445 /* BEGIN decide on expense levels for instrumentation. */
8447 /* Initially, select the cheap version of everything for which we have an
8449 DetailLevelByOp__set_all( &mce
.dlbo
, DLcheap
);
8451 /* Take account of the --expensive-definedness-checks= flag. */
8452 if (MC_(clo_expensive_definedness_checks
) == EdcNO
) {
8453 /* We just selected 'cheap for everything', so we don't need to do
8454 anything here. mce.tmpHowUsed remains NULL. */
8456 else if (MC_(clo_expensive_definedness_checks
) == EdcYES
) {
8457 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8458 DetailLevelByOp__set_all( &mce
.dlbo
, DLexpensive
);
8461 tl_assert(MC_(clo_expensive_definedness_checks
) == EdcAUTO
);
8462 /* We'll make our own selection, based on known per-target constraints
8463 and also on analysis of the block to be instrumented. First, set
8464 up default values for detail levels.
8466 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8467 5 and above. Enable accurate interpretation of the following.
8468 LLVM uses adds for some bitfield inserts, and we get a lot of false
8469 errors if the cheap interpretation is used, alas. Could solve this
8470 much better if we knew which of such adds came from x86/amd64 LEA
8471 instructions, since these are the only ones really needing the
8472 expensive interpretation, but that would require some way to tag
8473 them in the _toIR.c front ends, which is a lot of faffing around.
8474 So for now we use preInstrumentationAnalysis() to detect adds which
8475 are used only to construct memory addresses, which is an
8476 approximation to the above, and is self-contained.*/
8477 # if defined(VGA_x86)
8478 mce
.dlbo
.dl_Add32
= DLauto
;
8479 mce
.dlbo
.dl_CmpEQ32_CmpNE32
= DLexpensive
;
8480 # elif defined(VGA_amd64)
8481 mce
.dlbo
.dl_Add32
= DLexpensive
;
8482 mce
.dlbo
.dl_Add64
= DLauto
;
8483 mce
.dlbo
.dl_CmpEQ32_CmpNE32
= DLexpensive
;
8484 mce
.dlbo
.dl_CmpEQ64_CmpNE64
= DLexpensive
;
8485 # elif defined(VGA_ppc64le)
8486 // Needed by (at least) set_AV_CR6() in the front end.
8487 mce
.dlbo
.dl_CmpEQ64_CmpNE64
= DLexpensive
;
8490 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8492 Bool hasBogusLiterals
= False
;
8493 preInstrumentationAnalysis( &mce
.tmpHowUsed
, &hasBogusLiterals
, sb_in
);
8495 if (hasBogusLiterals
) {
8496 /* This happens very rarely. In this case just select expensive
8497 for everything, and throw away the tmp-use analysis results. */
8498 DetailLevelByOp__set_all( &mce
.dlbo
, DLexpensive
);
8499 VG_(free
)( mce
.tmpHowUsed
);
8500 mce
.tmpHowUsed
= NULL
;
8502 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8503 which will be used for some subset of Iop_{Add,Sub}{32,64},
8504 based on which ones are set to DLauto for this target. */
8508 DetailLevelByOp__check_sanity( &mce
.dlbo
);
8511 // Debug printing: which tmps have been identified as PCast-only use
8512 if (mce
.tmpHowUsed
) {
8513 VG_(printf
)("Cheapies: ");
8514 for (UInt q
= 0; q
< sb_in
->tyenv
->types_used
; q
++) {
8515 if (mce
.tmpHowUsed
[q
] == HuPCa
) {
8516 VG_(printf
)("t%u ", q
);
8522 // Debug printing: number of ops by detail level
8523 UChar nCheap
= DetailLevelByOp__count( &mce
.dlbo
, DLcheap
);
8524 UChar nAuto
= DetailLevelByOp__count( &mce
.dlbo
, DLauto
);
8525 UChar nExpensive
= DetailLevelByOp__count( &mce
.dlbo
, DLexpensive
);
8526 tl_assert(nCheap
+ nAuto
+ nExpensive
== 8);
8528 VG_(printf
)("%u,%u,%u ", nCheap
, nAuto
, nExpensive
);
8530 /* END decide on expense levels for instrumentation. */
8532 /* Initialise the running the tmp environment. */
8534 mce
.tmpMap
= VG_(newXA
)( VG_(malloc
), "mc.MC_(instrument).1", VG_(free
),
8535 sizeof(TempMapEnt
));
8536 VG_(hintSizeXA
) (mce
.tmpMap
, sb_in
->tyenv
->types_used
);
8537 for (i
= 0; i
< sb_in
->tyenv
->types_used
; i
++) {
8540 ent
.shadowV
= IRTemp_INVALID
;
8541 ent
.shadowB
= IRTemp_INVALID
;
8542 VG_(addToXA
)( mce
.tmpMap
, &ent
);
8544 tl_assert( VG_(sizeXA
)( mce
.tmpMap
) == sb_in
->tyenv
->types_used
);
8546 /* Finally, begin instrumentation. */
8547 /* Copy verbatim any IR preamble preceding the first IMark */
8549 tl_assert(mce
.sb
== sb_out
);
8550 tl_assert(mce
.sb
!= sb_in
);
8553 while (i
< sb_in
->stmts_used
&& sb_in
->stmts
[i
]->tag
!= Ist_IMark
) {
8555 st
= sb_in
->stmts
[i
];
8557 tl_assert(isFlatIRStmt(st
));
8559 stmt( 'C', &mce
, sb_in
->stmts
[i
] );
8563 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8564 cause the IR following the preamble to contain references to IR
8565 temporaries defined in the preamble. Because the preamble isn't
8566 instrumented, these temporaries don't have any shadows.
8567 Nevertheless uses of them following the preamble will cause
8568 memcheck to generate references to their shadows. End effect is
8569 to cause IR sanity check failures, due to references to
8570 non-existent shadows. This is only evident for the complex
8571 preambles used for function wrapping on TOC-afflicted platforms
8574 The following loop therefore scans the preamble looking for
8575 assignments to temporaries. For each one found it creates an
8576 assignment to the corresponding (V) shadow temp, marking it as
8577 'defined'. This is the same resulting IR as if the main
8578 instrumentation loop before had been applied to the statement
8581 Similarly, if origin tracking is enabled, we must generate an
8582 assignment for the corresponding origin (B) shadow, claiming
8583 no-origin, as appropriate for a defined value.
8585 for (j
= 0; j
< i
; j
++) {
8586 if (sb_in
->stmts
[j
]->tag
== Ist_WrTmp
) {
8587 /* findShadowTmpV checks its arg is an original tmp;
8588 no need to assert that here. */
8589 IRTemp tmp_o
= sb_in
->stmts
[j
]->Ist
.WrTmp
.tmp
;
8590 IRTemp tmp_v
= findShadowTmpV(&mce
, tmp_o
);
8591 IRType ty_v
= typeOfIRTemp(sb_out
->tyenv
, tmp_v
);
8592 assign( 'V', &mce
, tmp_v
, definedOfType( ty_v
) );
8593 if (MC_(clo_mc_level
) == 3) {
8594 IRTemp tmp_b
= findShadowTmpB(&mce
, tmp_o
);
8595 tl_assert(typeOfIRTemp(sb_out
->tyenv
, tmp_b
) == Ity_I32
);
8596 assign( 'B', &mce
, tmp_b
, mkU32(0)/* UNKNOWN ORIGIN */);
8599 VG_(printf
)("create shadow tmp(s) for preamble tmp [%d] ty ", j
);
8606 /* Iterate over the remaining stmts to generate instrumentation. */
8608 tl_assert(sb_in
->stmts_used
> 0);
8610 tl_assert(i
< sb_in
->stmts_used
);
8611 tl_assert(sb_in
->stmts
[i
]->tag
== Ist_IMark
);
8613 for (/* use current i*/; i
< sb_in
->stmts_used
; i
++) {
8615 st
= sb_in
->stmts
[i
];
8616 first_stmt
= sb_out
->stmts_used
;
8624 if (MC_(clo_mc_level
) == 3) {
8625 /* See comments on case Ist_CAS below. */
8626 if (st
->tag
!= Ist_CAS
)
8627 schemeS( &mce
, st
);
8630 /* Generate instrumentation code for each stmt ... */
8635 IRTemp dst
= st
->Ist
.WrTmp
.tmp
;
8636 tl_assert(dst
< (UInt
)sb_in
->tyenv
->types_used
);
8637 HowUsed hu
= mce
.tmpHowUsed
? mce
.tmpHowUsed
[dst
]
8638 : HuOth
/*we don't know, so play safe*/;
8639 assign( 'V', &mce
, findShadowTmpV(&mce
, st
->Ist
.WrTmp
.tmp
),
8640 expr2vbits( &mce
, st
->Ist
.WrTmp
.data
, hu
));
8645 do_shadow_PUT( &mce
,
8648 NULL
/* shadow atom */, NULL
/* guard */ );
8652 do_shadow_PUTI( &mce
, st
->Ist
.PutI
.details
);
8656 do_shadow_Store( &mce
, st
->Ist
.Store
.end
,
8657 st
->Ist
.Store
.addr
, 0/* addr bias */,
8659 NULL
/* shadow data */,
8664 do_shadow_StoreG( &mce
, st
->Ist
.StoreG
.details
);
8668 do_shadow_LoadG( &mce
, st
->Ist
.LoadG
.details
);
8672 complainIfUndefined( &mce
, st
->Ist
.Exit
.guard
, NULL
);
8683 do_shadow_Dirty( &mce
, st
->Ist
.Dirty
.details
);
8687 do_AbiHint( &mce
, st
->Ist
.AbiHint
.base
,
8688 st
->Ist
.AbiHint
.len
,
8689 st
->Ist
.AbiHint
.nia
);
8693 do_shadow_CAS( &mce
, st
->Ist
.CAS
.details
);
8694 /* Note, do_shadow_CAS copies the CAS itself to the output
8695 block, because it needs to add instrumentation both
8696 before and after it. Hence skip the copy below. Also
8697 skip the origin-tracking stuff (call to schemeS) above,
8698 since that's all tangled up with it too; do_shadow_CAS
8703 do_shadow_LLSC( &mce
,
8705 st
->Ist
.LLSC
.result
,
8707 st
->Ist
.LLSC
.storedata
);
8714 VG_(tool_panic
)("memcheck: unhandled IRStmt");
8716 } /* switch (st->tag) */
8719 for (j
= first_stmt
; j
< sb_out
->stmts_used
; j
++) {
8721 ppIRStmt(sb_out
->stmts
[j
]);
8727 /* ... and finally copy the stmt itself to the output. Except,
8728 skip the copy of IRCASs; see comments on case Ist_CAS
8730 if (st
->tag
!= Ist_CAS
)
8731 stmt('C', &mce
, st
);
8734 /* Now we need to complain if the jump target is undefined. */
8735 first_stmt
= sb_out
->stmts_used
;
8738 VG_(printf
)("sb_in->next = ");
8739 ppIRExpr(sb_in
->next
);
8740 VG_(printf
)("\n\n");
8743 complainIfUndefined( &mce
, sb_in
->next
, NULL
);
8746 for (j
= first_stmt
; j
< sb_out
->stmts_used
; j
++) {
8748 ppIRStmt(sb_out
->stmts
[j
]);
8754 /* If this fails, there's been some serious snafu with tmp management,
8755 that should be investigated. */
8756 tl_assert( VG_(sizeXA
)( mce
.tmpMap
) == mce
.sb
->tyenv
->types_used
);
8757 VG_(deleteXA
)( mce
.tmpMap
);
8759 if (mce
.tmpHowUsed
) {
8760 VG_(free
)( mce
.tmpHowUsed
);
8763 tl_assert(mce
.sb
== sb_out
);
8768 /*--------------------------------------------------------------------*/
8769 /*--- end mc_translate.c ---*/
8770 /*--------------------------------------------------------------------*/