2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
 111     historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
137 struct _MCEnv;
139 // See below for comments explaining what this is for.
140 typedef
141 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
142 HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
159 typedef
160 enum {
161 // Use the cheaper, less-exact variant.
162 DLcheap=4,
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
166 DLauto,
167 // Use the more expensive, more-exact variant.
168 DLexpensive
170 DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
 249    When .kind is VSh or BSh then the tmp holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
 341    initial IR optimisation, and we do not want to waste space in tables
342 tracking them.
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
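// Illustrative sketch of the remapping (hypothetical tmp numbers): suppose
// original tmp t5 is currently shadowed by t17.  After emitting a
// definedness test on t17 we want t5's shadow to become 'defined', but SSA
// forbids assigning to t17 again.  So a fresh shadow tmp, say t23, is
// allocated, the mapping is updated to t5 -> t23, and code is emitted to
// write the 'defined' constant into t23.  Later reads of t5's shadow see t23.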
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
 514    supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256). */
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
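// DifD uses AND on V bits: 0 means defined, so a result bit is defined if
// it is defined in either argument.  For example (8-bit case):
// DifD8(0b00001111, 0b11110000) = 0b00000000, i.e. fully defined, since
// every bit position is defined in at least one of the two arguments.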
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
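// UifU uses OR on V bits: 1 means undefined, so a result bit is undefined
// if it is undefined in either argument.  For example (8-bit case):
// UifU8(0b00000011, 0b10000000) = 0b10000011.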
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
 752 /* Unfortunately these are a lot more expensive than their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
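// Worked example (32-bit case): mkRight32 ORs the value with itself
// shifted right by 1, 2, 4, 8 and 16, so each 1 (undefined) bit is
// smeared into every less-significant position; an input of 0x00100000
// therefore becomes 0x001FFFFF.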
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
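// Worked example (8 bits, illustrative values): for data = 0b01010000
// with vbits = 0b00001111 (low nibble undefined), ImproveAND gives
// data | vbits = 0b01011111.  The 0 (improved) positions are exactly
// those where data is a *defined* 0 -- there the AND result is certainly
// 0, hence defined, whatever the other operand is.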
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
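// Worked example (8 bits, illustrative values): for data = 0b11000000
// with vbits = 0b00001111, ImproveOR gives ~data | vbits = 0b00111111.
// The 0 positions (bits 7 and 6) are where data is a *defined* 1 --
// there the OR result is certainly 1, hence defined, whatever the other
// operand is.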
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
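// Worked example (I32 -> I32): PCast of 0x00000100 (one undefined bit)
// gives 0xFFFFFFFF; PCast of 0x00000000 (all defined) gives 0x00000000.
// For widening destinations, e.g. I32 -> I64 or I32 -> V128, the value is
// first collapsed as above and then replicated (or sign-extended) to fill
// the wider type.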
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
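// Worked example (I64 case): an input with at least one undefined bit
// produces 0x0000000000000001 (only the lsb marked undefined); an
// all-defined input produces 0 (everything defined).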
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
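// Worked example (32 bits): for vbits = 0xFFFFFFFF (nothing defined),
// (0xFFFFFFFF - 0x7FFFFFFF) >>s 31  =  0x80000000 >>s 31  =  0xFFFFFFFF.
// For any vbits with at least one 0 (defined) bit, e.g. 0xFFFFFFFE,
// (0xFFFFFFFE - 0x7FFFFFFF) >>s 31  =  0x7FFFFFFF >>s 31  =  0x00000000.
// The point is that vbits - (vbits >>u 1) == ceil(vbits/2), whose top bit
// is set only when vbits is all-ones; the arithmetic shift then smears
// that top bit across the whole word.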
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144         VG_(tool_panic)("mkOCastAt");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec)
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194    which you can think of as an "optimistic cast" (OCast), the opposite of
1195    the normal "pessimistic cast" (PCast) family.  An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212        vec   = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
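// Worked example (8 bits, illustrative values): xx = 0b00000010,
// yy = 0b00000001, vxx = vyy = 0b11110000 (top nibbles undefined).
// naive = 0b11110000, Not(Xor(xx,yy)) = 0b11111100, so vec = 0b11111100,
// which is not all-ones, hence OCast(vec) = 0 and
// improved = DifD(naive, 0) = 0: the comparison result is defined,
// because the operands already differ in a bit that is defined in both,
// whereas the naive UifU-only scheme would have reported it undefined.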
1291 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1293 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1295 CmpORD32S(x,y) = 1<<3 if x <s y
1296 = 1<<2 if x >s y
1297 = 1<<1 if x == y
1299 and similarly the unsigned variant. The default interpretation is:
1301 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1302 & (7<<1)
1304 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1305 are zero and therefore defined (viz, zero).
1307 Also deal with a special case better:
1309 CmpORD32S(x,0)
1311 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1312 will be defined even if the rest of x isn't. In which case we do:
1314 CmpORD32S#(x,x#,0,{impliedly 0}#)
1315 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1316 | (x# >>u 31) << 3 -- LT# = x#[31]
1318 Analogous handling for CmpORD64{S,U}.
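// Sketch of the default case (32 bits): the shadow result is
//    PCast32(UifU32(x#, y#)) & 0x0000000E
// i.e. if any input bit is undefined then bits 3:1 of the result are all
// undefined, while the remaining bits, which the operation always sets to
// zero, stay defined.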
1320 static Bool isZeroU32 ( IRAtom* e )
1322 return
1323 toBool( e->tag == Iex_Const
1324 && e->Iex.Const.con->tag == Ico_U32
1325 && e->Iex.Const.con->Ico.U32 == 0 );
1328 static Bool isZeroU64 ( IRAtom* e )
1330 return
1331 toBool( e->tag == Iex_Const
1332 && e->Iex.Const.con->tag == Ico_U64
1333 && e->Iex.Const.con->Ico.U64 == 0 );
1336 static IRAtom* doCmpORD ( MCEnv* mce,
1337 IROp cmp_op,
1338 IRAtom* xxhash, IRAtom* yyhash,
1339 IRAtom* xx, IRAtom* yy )
1341 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1342 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1343 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1344 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1345 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1346 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1347 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1348 IRType ty = m64 ? Ity_I64 : Ity_I32;
1349 Int width = m64 ? 64 : 32;
1351 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1353 tl_assert(isShadowAtom(mce,xxhash));
1354 tl_assert(isShadowAtom(mce,yyhash));
1355 tl_assert(isOriginalAtom(mce,xx));
1356 tl_assert(isOriginalAtom(mce,yy));
1357 tl_assert(sameKindedAtoms(xxhash,xx));
1358 tl_assert(sameKindedAtoms(yyhash,yy));
1359 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1360 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1362 if (0) {
1363 ppIROp(cmp_op); VG_(printf)(" ");
1364 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1367 if (syned && isZero(yy)) {
1368 /* fancy interpretation */
1369 /* if yy is zero, then it must be fully defined (zero#). */
1370 tl_assert(isZero(yyhash));
1371 // This is still inaccurate, but I don't think it matters, since
1372 // nobody writes code of the form
1373 // "is <partially-undefined-value> signedly greater than zero?".
1374 // We therefore simply declare "x >s 0" to be undefined if any bit in
1375 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1376 // the highest order bit is a defined 1 then x is negative so it
1377 // doesn't matter whether the remaining bits are defined or not.
1378 IRAtom* t_0_gt_0_0
1379 = assignNew(
1380 'V', mce,ty,
1381 binop(
1382 opAND,
1383 mkPCastTo(mce,ty, xxhash),
1384 m64 ? mkU64(1<<2) : mkU32(1<<2)
1386 // For "x <s 0", we can just copy the definedness of the top bit of x
1387 // and we have a precise result.
1388 IRAtom* t_lt_0_0_0
1389 = assignNew(
1390 'V', mce,ty,
1391 binop(
1392 opSHL,
1393 assignNew(
1394 'V', mce,ty,
1395 binop(opSHR, xxhash, mkU8(width-1))),
1396 mkU8(3)
1398 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1399 IRAtom* t_0_0_eq_0
1400 = assignNew(
1401 'V', mce,ty,
1402 binop(
1403 opSHL,
1404 assignNew('V', mce,ty,
1405 unop(
1406 op1UtoWS,
1407 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1409 mkU8(1)
1411 return
1412 binop(
1413 opOR,
1414 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1415 t_0_0_eq_0
1417 } else {
1418 /* standard interpretation */
1419 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1420 return
1421 binop(
1422 opAND,
1423 mkPCastTo( mce,ty,
1424 mkUifU(mce,ty, xxhash,yyhash)),
1425 sevenLeft1
1431 /*------------------------------------------------------------*/
1432 /*--- Emit a test and complaint if something is undefined. ---*/
1433 /*------------------------------------------------------------*/
1435 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1438 /* Set the annotations on a dirty helper to indicate that the stack
1439 pointer and instruction pointers might be read. This is the
1440 behaviour of all 'emit-a-complaint' style functions we might
1441 call. */
1443 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1444 di->nFxState = 2;
1445 di->fxState[0].fx = Ifx_Read;
1446 di->fxState[0].offset = mce->layout->offset_SP;
1447 di->fxState[0].size = mce->layout->sizeof_SP;
1448 di->fxState[0].nRepeats = 0;
1449 di->fxState[0].repeatLen = 0;
1450 di->fxState[1].fx = Ifx_Read;
1451 di->fxState[1].offset = mce->layout->offset_IP;
1452 di->fxState[1].size = mce->layout->sizeof_IP;
1453 di->fxState[1].nRepeats = 0;
1454 di->fxState[1].repeatLen = 0;
1458 /* Check the supplied *original* |atom| for undefinedness, and emit a
1459 complaint if so. Once that happens, mark it as defined. This is
1460 possible because the atom is either a tmp or literal. If it's a
1461 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1462 be defined. In fact as mentioned above, we will have to allocate a
1463 new tmp to carry the new 'defined' shadow value, and update the
1464 original->tmp mapping accordingly; we cannot simply assign a new
1465 value to an existing shadow tmp as this breaks SSAness.
1467 The checks are performed, any resulting complaint emitted, and
1468 |atom|'s shadow temp set to 'defined', ONLY in the case that
1469 |guard| evaluates to True at run-time. If it evaluates to False
1470 then no action is performed. If |guard| is NULL (the usual case)
1471 then it is assumed to be always-true, and hence these actions are
1472 performed unconditionally.
1474 This routine does not generate code to check the definedness of
1475 |guard|. The caller is assumed to have taken care of that already.
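// Rough sketch of the IR this emits for a 32-bit |atom| without origin
// tracking (helper choice and temps vary with size and clo_mc_level):
//    t_v = <shadow of atom>
//    t_c = CmpNEZ32(t_v)                      -- PCast to Ity_I1
//    DIRTY t_c ::: MC_(helperc_value_check4_fail_no_o)()
//    <atom's shadow remapped to a fresh tmp holding the 'defined' value>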
1477 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1479 IRAtom* vatom;
1480 IRType ty;
1481 Int sz;
1482 IRDirty* di;
1483 IRAtom* cond;
1484 IRAtom* origin;
1485 void* fn;
1486 const HChar* nm;
1487 IRExpr** args;
1488 Int nargs;
1490 // Don't do V bit tests if we're not reporting undefined value errors.
1491 if (MC_(clo_mc_level) == 1)
1492 return;
1494 if (guard)
1495 tl_assert(isOriginalAtom(mce, guard));
1497 /* Since the original expression is atomic, there's no duplicated
1498 work generated by making multiple V-expressions for it. So we
1499 don't really care about the possibility that someone else may
1500      also create a V-interpretation for it. */
1501 tl_assert(isOriginalAtom(mce, atom));
1502 vatom = expr2vbits( mce, atom, HuOth );
1503 tl_assert(isShadowAtom(mce, vatom));
1504 tl_assert(sameKindedAtoms(atom, vatom));
1506 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1508 /* sz is only used for constructing the error message */
1509 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1511 cond = mkPCastTo( mce, Ity_I1, vatom );
1512 /* cond will be 0 if all defined, and 1 if any not defined. */
1514 /* Get the origin info for the value we are about to check. At
1515 least, if we are doing origin tracking. If not, use a dummy
1516 zero origin. */
1517 if (MC_(clo_mc_level) == 3) {
1518 origin = schemeE( mce, atom );
1519 if (mce->hWordTy == Ity_I64) {
1520 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1522 } else {
1523 origin = NULL;
1526 fn = NULL;
1527 nm = NULL;
1528 args = NULL;
1529 nargs = -1;
1531 switch (sz) {
1532 case 0:
1533 if (origin) {
1534 fn = &MC_(helperc_value_check0_fail_w_o);
1535 nm = "MC_(helperc_value_check0_fail_w_o)";
1536 args = mkIRExprVec_1(origin);
1537 nargs = 1;
1538 } else {
1539 fn = &MC_(helperc_value_check0_fail_no_o);
1540 nm = "MC_(helperc_value_check0_fail_no_o)";
1541 args = mkIRExprVec_0();
1542 nargs = 0;
1544 break;
1545 case 1:
1546 if (origin) {
1547 fn = &MC_(helperc_value_check1_fail_w_o);
1548 nm = "MC_(helperc_value_check1_fail_w_o)";
1549 args = mkIRExprVec_1(origin);
1550 nargs = 1;
1551 } else {
1552 fn = &MC_(helperc_value_check1_fail_no_o);
1553 nm = "MC_(helperc_value_check1_fail_no_o)";
1554 args = mkIRExprVec_0();
1555 nargs = 0;
1557 break;
1558 case 4:
1559 if (origin) {
1560 fn = &MC_(helperc_value_check4_fail_w_o);
1561 nm = "MC_(helperc_value_check4_fail_w_o)";
1562 args = mkIRExprVec_1(origin);
1563 nargs = 1;
1564 } else {
1565 fn = &MC_(helperc_value_check4_fail_no_o);
1566 nm = "MC_(helperc_value_check4_fail_no_o)";
1567 args = mkIRExprVec_0();
1568 nargs = 0;
1570 break;
1571 case 8:
1572 if (origin) {
1573 fn = &MC_(helperc_value_check8_fail_w_o);
1574 nm = "MC_(helperc_value_check8_fail_w_o)";
1575 args = mkIRExprVec_1(origin);
1576 nargs = 1;
1577 } else {
1578 fn = &MC_(helperc_value_check8_fail_no_o);
1579 nm = "MC_(helperc_value_check8_fail_no_o)";
1580 args = mkIRExprVec_0();
1581 nargs = 0;
1583 break;
1584 case 2:
1585 case 16:
1586 if (origin) {
1587 fn = &MC_(helperc_value_checkN_fail_w_o);
1588 nm = "MC_(helperc_value_checkN_fail_w_o)";
1589 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1590 nargs = 2;
1591 } else {
1592 fn = &MC_(helperc_value_checkN_fail_no_o);
1593 nm = "MC_(helperc_value_checkN_fail_no_o)";
1594 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1595 nargs = 1;
1597 break;
1598 default:
1599 VG_(tool_panic)("unexpected szB");
1602 tl_assert(fn);
1603 tl_assert(nm);
1604 tl_assert(args);
1605 tl_assert(nargs >= 0 && nargs <= 2);
1606 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1607 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1609 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1610 VG_(fnptr_to_fnentry)( fn ), args );
1611 di->guard = cond; // and cond is PCast-to-1(atom#)
1613 /* If the complaint is to be issued under a guard condition, AND
1614 that into the guard condition for the helper call. */
1615 if (guard) {
1616 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1617 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1618 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1619 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1622 setHelperAnns( mce, di );
1623 stmt( 'V', mce, IRStmt_Dirty(di));
1625 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1626 defined -- but only in the case where the guard evaluates to
1627 True at run-time. Do the update by setting the orig->shadow
1628 mapping for tmp to reflect the fact that this shadow is getting
1629 a new value. */
1630 tl_assert(isIRAtom(vatom));
1631 /* sameKindedAtoms ... */
1632 if (vatom->tag == Iex_RdTmp) {
1633 tl_assert(atom->tag == Iex_RdTmp);
1634 if (guard == NULL) {
1635 // guard is 'always True', hence update unconditionally
1636 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1637 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1638 definedOfType(ty));
1639 } else {
1640 // update the temp only conditionally. Do this by copying
1641 // its old value when the guard is False.
1642 // The old value ..
1643 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1644 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1645 IRAtom* new_tmpV
1646 = assignNew('V', mce, shadowTypeV(ty),
1647 IRExpr_ITE(guard, definedOfType(ty),
1648 mkexpr(old_tmpV)));
1649 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1655 /*------------------------------------------------------------*/
1656 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1657 /*------------------------------------------------------------*/
1659 /* Examine the always-defined sections declared in layout to see if
1660    the (offset,size) section is within one.  Note, it is an error to
1661 partially fall into such a region: (offset,size) should either be
1662 completely in such a region or completely not-in such a region.
1664 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1666 Int minoffD, maxoffD, i;
1667 Int minoff = offset;
1668 Int maxoff = minoff + size - 1;
1669 tl_assert((minoff & ~0xFFFF) == 0);
1670 tl_assert((maxoff & ~0xFFFF) == 0);
1672 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1673 minoffD = mce->layout->alwaysDefd[i].offset;
1674 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1675 tl_assert((minoffD & ~0xFFFF) == 0);
1676 tl_assert((maxoffD & ~0xFFFF) == 0);
1678 if (maxoff < minoffD || maxoffD < minoff)
1679 continue; /* no overlap */
1680 if (minoff >= minoffD && maxoff <= maxoffD)
1681 return True; /* completely contained in an always-defd section */
1683 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1685 return False; /* could not find any containing section */
1689 /* Generate into bb suitable actions to shadow this Put. If the state
1690 slice is marked 'always defined', do nothing. Otherwise, write the
1691 supplied V bits to the shadow state. We can pass in either an
1692 original atom or a V-atom, but not both. In the former case the
1693 relevant V-bits are then generated from the original.
1694 We assume here that the definedness of GUARD has already been checked.
1696 static
1697 void do_shadow_PUT ( MCEnv* mce, Int offset,
1698 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1700 IRType ty;
1702 // Don't do shadow PUTs if we're not doing undefined value checking.
1703 // Their absence lets Vex's optimiser remove all the shadow computation
1704 // that they depend on, which includes GETs of the shadow registers.
1705 if (MC_(clo_mc_level) == 1)
1706 return;
1708 if (atom) {
1709 tl_assert(!vatom);
1710 tl_assert(isOriginalAtom(mce, atom));
1711 vatom = expr2vbits( mce, atom, HuOth );
1712 } else {
1713 tl_assert(vatom);
1714 tl_assert(isShadowAtom(mce, vatom));
1717 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1718 tl_assert(ty != Ity_I1);
1719 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1720 /* later: no ... */
1721 /* emit code to emit a complaint if any of the vbits are 1. */
1722 /* complainIfUndefined(mce, atom); */
1723 } else {
1724 /* Do a plain shadow Put. */
1725 if (guard) {
1726 /* If the guard expression evaluates to false we simply Put the value
1727 that is already stored in the guest state slot */
1728 IRAtom *cond, *iffalse;
1730 cond = assignNew('V', mce, Ity_I1, guard);
1731 iffalse = assignNew('V', mce, ty,
1732 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1733 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1735 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
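/* Purely illustrative sketch (not tool code, and not compiled) of the
   guarded Put above: when the guard is false the slot is re-written
   with the V bits it already holds, so the shadow state is unchanged,
   which is what the IRExpr_ITE(cond, vatom, iffalse) achieves.  The
   names below (model_shadow_slot etc.) are hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_shadow_slot[64];      // models the shadow guest state
static void model_shadow_put32 ( int slot, uint32_t vnew, int guard )
{
   model_shadow_slot[slot] = guard ? vnew : model_shadow_slot[slot];
}
#endif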
1740 /* Generate into bb suitable actions to shadow this PutI. If the state
1741 slice is always defined, do nothing; otherwise write the V bits to it.
1743 static
1744 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1746 IRAtom* vatom;
1747 IRType ty, tyS;
1748 Int arrSize;
1749 IRRegArray* descr = puti->descr;
1750 IRAtom* ix = puti->ix;
1751 Int bias = puti->bias;
1752 IRAtom* atom = puti->data;
1754 // Don't do shadow PUTIs if we're not doing undefined value checking.
1755 // Their absence lets Vex's optimiser remove all the shadow computation
1756 // that they depend on, which includes GETIs of the shadow registers.
1757 if (MC_(clo_mc_level) == 1)
1758 return;
1760 tl_assert(isOriginalAtom(mce,atom));
1761 vatom = expr2vbits( mce, atom, HuOth );
1762 tl_assert(sameKindedAtoms(atom, vatom));
1763 ty = descr->elemTy;
1764 tyS = shadowTypeV(ty);
1765 arrSize = descr->nElems * sizeofIRType(ty);
1766 tl_assert(ty != Ity_I1);
1767 tl_assert(isOriginalAtom(mce,ix));
1768 complainIfUndefined(mce, ix, NULL);
1769 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1770 /* later: no ... */
1771 /* emit code to emit a complaint if any of the vbits are 1. */
1772 /* complainIfUndefined(mce, atom); */
1773 } else {
1774 /* Do a cloned version of the Put that refers to the shadow
1775 area. */
1776 IRRegArray* new_descr
1777 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1778 tyS, descr->nElems);
1779 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1784 /* Return an expression which contains the V bits corresponding to the
1785 given GET (passed in in pieces).
1787 static
1788 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1790 IRType tyS = shadowTypeV(ty);
1791 tl_assert(ty != Ity_I1);
1792 tl_assert(ty != Ity_I128);
1793 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1794 /* Always defined, return all zeroes of the relevant type */
1795 return definedOfType(tyS);
1796 } else {
1797 /* return a cloned version of the Get that refers to the shadow
1798 area. */
1799 /* FIXME: this isn't an atom! */
1800 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1805 /* Return an expression which contains the V bits corresponding to the
1806 given GETI (passed in in pieces).
1808 static
1809 IRExpr* shadow_GETI ( MCEnv* mce,
1810 IRRegArray* descr, IRAtom* ix, Int bias )
1812 IRType ty = descr->elemTy;
1813 IRType tyS = shadowTypeV(ty);
1814 Int arrSize = descr->nElems * sizeofIRType(ty);
1815 tl_assert(ty != Ity_I1);
1816 tl_assert(isOriginalAtom(mce,ix));
1817 complainIfUndefined(mce, ix, NULL);
1818 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1819 /* Always defined, return all zeroes of the relevant type */
1820 return definedOfType(tyS);
1821 } else {
1822 /* return a cloned version of the Get that refers to the shadow
1823 area. */
1824 IRRegArray* new_descr
1825 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1826 tyS, descr->nElems);
1827 return IRExpr_GetI( new_descr, ix, bias );
1832 /*------------------------------------------------------------*/
1833 /*--- Generating approximations for unknown operations, ---*/
1834 /*--- using lazy-propagate semantics ---*/
1835 /*------------------------------------------------------------*/
1837 /* Lazy propagation of undefinedness from two values, resulting in the
1838 specified shadow type.
1840 static
1841 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1843 IRAtom* at;
1844 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1845 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1846 tl_assert(isShadowAtom(mce,va1));
1847 tl_assert(isShadowAtom(mce,va2));
1849 /* The general case is inefficient because PCast is an expensive
1850 operation. Here are some special cases which use PCast only
1851 once rather than twice. */
1853 /* I64 x I64 -> I64 */
1854 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1855 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1856 at = mkUifU(mce, Ity_I64, va1, va2);
1857 at = mkPCastTo(mce, Ity_I64, at);
1858 return at;
1861 /* I64 x I64 -> I32 */
1862 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1863 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1864 at = mkUifU(mce, Ity_I64, va1, va2);
1865 at = mkPCastTo(mce, Ity_I32, at);
1866 return at;
1869 /* I32 x I32 -> I32 */
1870 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1871 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1872 at = mkUifU(mce, Ity_I32, va1, va2);
1873 at = mkPCastTo(mce, Ity_I32, at);
1874 return at;
1877 if (0) {
1878 VG_(printf)("mkLazy2 ");
1879 ppIRType(t1);
1880 VG_(printf)("_");
1881 ppIRType(t2);
1882 VG_(printf)("_");
1883 ppIRType(finalVty);
1884 VG_(printf)("\n");
1887 /* General case: force everything via 32-bit intermediaries. */
1888 at = mkPCastTo(mce, Ity_I32, va1);
1889 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1890 at = mkPCastTo(mce, finalVty, at);
1891 return at;
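/* Purely illustrative sketch (not tool code, and not compiled): the
   V-bit algebra used above, modelled with plain integers in which a
   set bit means "undefined".  UifU is bitwise OR and PCast smears
   "any bit undefined" across the whole destination.  The model_*
   names are hypothetical. */
#if 0
#include <stdint.h>
static uint64_t model_UifU64 ( uint64_t va, uint64_t vb )
{
   return va | vb;                          // undefined if either is
}
static uint32_t model_PCast64to32 ( uint64_t v )
{
   return v ? 0xFFFFFFFFu : 0;              // all-undefined or all-defined
}
// The I64 x I64 -> I32 special case above: one UifU, then a single PCast.
static uint32_t model_mkLazy2_I64_I64_to_I32 ( uint64_t va1, uint64_t va2 )
{
   return model_PCast64to32( model_UifU64(va1, va2) );
}
#endif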
1895 /* 3-arg version of the above. */
1896 static
1897 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1898 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1900 IRAtom* at;
1901 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1902 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1903 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1904 tl_assert(isShadowAtom(mce,va1));
1905 tl_assert(isShadowAtom(mce,va2));
1906 tl_assert(isShadowAtom(mce,va3));
1908 /* The general case is inefficient because PCast is an expensive
1909 operation. Here are some special cases which use PCast only
1910 twice rather than three times. */
1912 /* I32 x I64 x I64 -> I64 */
1913 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1914 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1915 && finalVty == Ity_I64) {
1916 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1917 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1918 mode indication which is fully defined, this should get
1919 folded out later. */
1920 at = mkPCastTo(mce, Ity_I64, va1);
1921 /* Now fold in 2nd and 3rd args. */
1922 at = mkUifU(mce, Ity_I64, at, va2);
1923 at = mkUifU(mce, Ity_I64, at, va3);
1924 /* and PCast once again. */
1925 at = mkPCastTo(mce, Ity_I64, at);
1926 return at;
1929 /* I32 x I8 x I64 -> I64 */
1930 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1931 && finalVty == Ity_I64) {
1932 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1933 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
1934 * rounding mode indication which is fully defined, this should
1935 * get folded out later.
1937 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1938 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1939 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
1940 at = mkUifU(mce, Ity_I64, at, va3);
1941 /* and PCast once again. */
1942 at = mkPCastTo(mce, Ity_I64, at);
1943 return at;
1946 /* I32 x I64 x I64 -> I32 */
1947 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1948 && finalVty == Ity_I32) {
1949 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1950 at = mkPCastTo(mce, Ity_I64, va1);
1951 at = mkUifU(mce, Ity_I64, at, va2);
1952 at = mkUifU(mce, Ity_I64, at, va3);
1953 at = mkPCastTo(mce, Ity_I32, at);
1954 return at;
1957 /* I32 x I32 x I32 -> I32 */
1958 /* 32-bit FP idiom, as (eg) happens on ARM */
1959 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1960 && finalVty == Ity_I32) {
1961 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1962 at = va1;
1963 at = mkUifU(mce, Ity_I32, at, va2);
1964 at = mkUifU(mce, Ity_I32, at, va3);
1965 at = mkPCastTo(mce, Ity_I32, at);
1966 return at;
1969 /* I32 x I16 x I16 -> I16 */
1970 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
1971 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
1972 && finalVty == Ity_I16) {
1973 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
1974 at = mkPCastTo(mce, Ity_I16, va1);
1975 at = mkUifU(mce, Ity_I16, at, va2);
1976 at = mkUifU(mce, Ity_I16, at, va3);
1977 at = mkPCastTo(mce, Ity_I16, at);
1978 return at;
1981 /* I32 x I128 x I128 -> I128 */
1982 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1983 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1984 && finalVty == Ity_I128) {
1985 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1986 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1987 mode indication which is fully defined, this should get
1988 folded out later. */
1989 at = mkPCastTo(mce, Ity_I128, va1);
1990 /* Now fold in 2nd and 3rd args. */
1991 at = mkUifU(mce, Ity_I128, at, va2);
1992 at = mkUifU(mce, Ity_I128, at, va3);
1993 /* and PCast once again. */
1994 at = mkPCastTo(mce, Ity_I128, at);
1995 return at;
1998 /* I32 x I8 x I128 -> I128 */
1999 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2000 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2001 && finalVty == Ity_I128) {
2002 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2003 /* Use I64 as an intermediate type, which means PCasting all 3
2004 args to I64 to start with. 1st arg is typically a rounding
2005 mode indication which is fully defined, so we hope that it
2006 will get folded out later. */
2007 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2008 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2009 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2010 /* Now UifU all three together. */
2011 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2012 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2013 /* and PCast once again. */
2014 at = mkPCastTo(mce, Ity_I128, at);
2015 return at;
2017 if (1) {
2018 VG_(printf)("mkLazy3: ");
2019 ppIRType(t1);
2020 VG_(printf)(" x ");
2021 ppIRType(t2);
2022 VG_(printf)(" x ");
2023 ppIRType(t3);
2024 VG_(printf)(" -> ");
2025 ppIRType(finalVty);
2026 VG_(printf)("\n");
2029 tl_assert(0);
2030 /* General case: force everything via 32-bit intermediaries. */
2032 at = mkPCastTo(mce, Ity_I32, va1);
2033 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2034 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2035 at = mkPCastTo(mce, finalVty, at);
2036 return at;
2041 /* 4-arg version of the above. */
2042 static
2043 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2044 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2046 IRAtom* at;
2047 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2048 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2049 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2050 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2051 tl_assert(isShadowAtom(mce,va1));
2052 tl_assert(isShadowAtom(mce,va2));
2053 tl_assert(isShadowAtom(mce,va3));
2054 tl_assert(isShadowAtom(mce,va4));
2056 /* The general case is inefficient because PCast is an expensive
2057 operation. Here are some special cases which use PCast only
2058 twice rather than four times. */
2060 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2062 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2063 && finalVty == Ity_I128) {
2064 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2065 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2066 mode indication which is fully defined, this should get
2067 folded out later. */
2068 at = mkPCastTo(mce, Ity_I128, va1);
2069 /* Now fold in 2nd, 3rd, 4th args. */
2070 at = mkUifU(mce, Ity_I128, at, va2);
2071 at = mkUifU(mce, Ity_I128, at, va3);
2072 at = mkUifU(mce, Ity_I128, at, va4);
2073 /* and PCast once again. */
2074 at = mkPCastTo(mce, Ity_I128, at);
2075 return at;
2078 /* I32 x I64 x I64 x I64 -> I64 */
2079 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2080 && finalVty == Ity_I64) {
2081 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2082 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2083 mode indication which is fully defined, this should get
2084 folded out later. */
2085 at = mkPCastTo(mce, Ity_I64, va1);
2086 /* Now fold in 2nd, 3rd, 4th args. */
2087 at = mkUifU(mce, Ity_I64, at, va2);
2088 at = mkUifU(mce, Ity_I64, at, va3);
2089 at = mkUifU(mce, Ity_I64, at, va4);
2090 /* and PCast once again. */
2091 at = mkPCastTo(mce, Ity_I64, at);
2092 return at;
2094 /* I32 x I32 x I32 x I32 -> I32 */
2095 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2096 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2097 && finalVty == Ity_I32) {
2098 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2099 at = va1;
2100 /* Now fold in 2nd, 3rd, 4th args. */
2101 at = mkUifU(mce, Ity_I32, at, va2);
2102 at = mkUifU(mce, Ity_I32, at, va3);
2103 at = mkUifU(mce, Ity_I32, at, va4);
2104 at = mkPCastTo(mce, Ity_I32, at);
2105 return at;
2108 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2109 && finalVty == Ity_I32) {
2110 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2111 at = mkPCastTo(mce, Ity_I8, va1);
2112 /* Now fold in 2nd, 3rd, 4th args. */
2113 at = mkUifU(mce, Ity_I8, at, va2);
2114 at = mkUifU(mce, Ity_I8, at, va3);
2115 at = mkUifU(mce, Ity_I8, at, va4);
2116 at = mkPCastTo(mce, Ity_I32, at);
2117 return at;
2120 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2121 && finalVty == Ity_I64) {
2122 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2123 at = mkPCastTo(mce, Ity_I8, va1);
2124 /* Now fold in 2nd, 3rd, 4th args. */
2125 at = mkUifU(mce, Ity_I8, at, va2);
2126 at = mkUifU(mce, Ity_I8, at, va3);
2127 at = mkUifU(mce, Ity_I8, at, va4);
2128 at = mkPCastTo(mce, Ity_I64, at);
2129 return at;
2132 if (1) {
2133 VG_(printf)("mkLazy4: ");
2134 ppIRType(t1);
2135 VG_(printf)(" x ");
2136 ppIRType(t2);
2137 VG_(printf)(" x ");
2138 ppIRType(t3);
2139 VG_(printf)(" x ");
2140 ppIRType(t4);
2141 VG_(printf)(" -> ");
2142 ppIRType(finalVty);
2143 VG_(printf)("\n");
2146 tl_assert(0);
2150 /* Do the lazy propagation game from a null-terminated vector of
2151 atoms. This is presumably the arguments to a helper call, so the
2152 IRCallee info is also supplied in order that we can know which
2153 arguments should be ignored (via the .mcx_mask field).
2155 static
2156 IRAtom* mkLazyN ( MCEnv* mce,
2157 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2159 Int i;
2160 IRAtom* here;
2161 IRAtom* curr;
2162 IRType mergeTy;
2163 Bool mergeTy64 = True;
2165 /* Decide on the type of the merge intermediary. If all relevant
2166 args are I64, then it's I64. In all other circumstances, use
2167 I32. */
2168 for (i = 0; exprvec[i]; i++) {
2169 tl_assert(i < 32);
2170 tl_assert(isOriginalAtom(mce, exprvec[i]));
2171 if (cee->mcx_mask & (1<<i))
2172 continue;
2173 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2174 mergeTy64 = False;
2177 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2178 curr = definedOfType(mergeTy);
2180 for (i = 0; exprvec[i]; i++) {
2181 tl_assert(i < 32);
2182 tl_assert(isOriginalAtom(mce, exprvec[i]));
2183 /* Only take notice of this arg if the callee's mc-exclusion
2184 mask does not say it is to be excluded. */
2185 if (cee->mcx_mask & (1<<i)) {
2186 /* the arg is to be excluded from definedness checking. Do
2187 nothing. */
2188 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2189 } else {
2190 /* calculate the arg's definedness, and pessimistically merge
2191 it in. */
2192 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2193 curr = mergeTy64
2194 ? mkUifU64(mce, here, curr)
2195 : mkUifU32(mce, here, curr);
2198 return mkPCastTo(mce, finalVtype, curr );
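/* Purely illustrative sketch (not tool code, and not compiled) of the
   mcx_mask convention used above: bit i of the callee's mcx_mask means
   "do not check argument i".  V bits are modelled as plain words in
   which a set bit means "undefined"; the model_ name is hypothetical. */
#if 0
#include <stdint.h>
static uint64_t model_mkLazyN ( const uint64_t* vargs, int nargs,
                                uint32_t mcx_mask )
{
   uint64_t curr = 0;                       // definedOfType: all defined
   for (int i = 0; i < nargs; i++) {
      if (mcx_mask & (1u << i))
         continue;                          // excluded from checking
      curr |= (vargs[i] ? ~0ULL : 0);       // PCast the arg, then UifU (OR)
   }
   return curr ? ~0ULL : 0;                 // final PCast to the result type
}
#endif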
2202 /*------------------------------------------------------------*/
2203 /*--- Generating expensive sequences for exact carry-chain ---*/
2204 /*--- propagation in add/sub and related operations. ---*/
2205 /*------------------------------------------------------------*/
2207 static
2208 IRAtom* expensiveAddSub ( MCEnv* mce,
2209 Bool add,
2210 IRType ty,
2211 IRAtom* qaa, IRAtom* qbb,
2212 IRAtom* aa, IRAtom* bb )
2214 IRAtom *a_min, *b_min, *a_max, *b_max;
2215 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2217 tl_assert(isShadowAtom(mce,qaa));
2218 tl_assert(isShadowAtom(mce,qbb));
2219 tl_assert(isOriginalAtom(mce,aa));
2220 tl_assert(isOriginalAtom(mce,bb));
2221 tl_assert(sameKindedAtoms(qaa,aa));
2222 tl_assert(sameKindedAtoms(qbb,bb));
2224 switch (ty) {
2225 case Ity_I32:
2226 opAND = Iop_And32;
2227 opOR = Iop_Or32;
2228 opXOR = Iop_Xor32;
2229 opNOT = Iop_Not32;
2230 opADD = Iop_Add32;
2231 opSUB = Iop_Sub32;
2232 break;
2233 case Ity_I64:
2234 opAND = Iop_And64;
2235 opOR = Iop_Or64;
2236 opXOR = Iop_Xor64;
2237 opNOT = Iop_Not64;
2238 opADD = Iop_Add64;
2239 opSUB = Iop_Sub64;
2240 break;
2241 default:
2242 VG_(tool_panic)("expensiveAddSub");
2245 // a_min = aa & ~qaa
2246 a_min = assignNew('V', mce,ty,
2247 binop(opAND, aa,
2248 assignNew('V', mce,ty, unop(opNOT, qaa))));
2250 // b_min = bb & ~qbb
2251 b_min = assignNew('V', mce,ty,
2252 binop(opAND, bb,
2253 assignNew('V', mce,ty, unop(opNOT, qbb))));
2255 // a_max = aa | qaa
2256 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2258 // b_max = bb | qbb
2259 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2261 if (add) {
2262 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2263 return
2264 assignNew('V', mce,ty,
2265 binop( opOR,
2266 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2267 assignNew('V', mce,ty,
2268 binop( opXOR,
2269 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2270 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2275 } else {
2276 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2277 return
2278 assignNew('V', mce,ty,
2279 binop( opOR,
2280 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2281 assignNew('V', mce,ty,
2282 binop( opXOR,
2283 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2284 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
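/* Purely illustrative, standalone check (not tool code) of the
   interval trick above for addition, on 32-bit values with a set V bit
   meaning "undefined".  a_min/a_max bracket the values aa could take
   given its undefined bits; result bits where the two extreme sums
   agree cannot depend on the undefined inputs.  Hypothetical names. */
#if 0
#include <stdint.h>
#include <stdio.h>
static uint32_t model_add32_vbits ( uint32_t aa, uint32_t qaa,
                                    uint32_t bb, uint32_t qbb )
{
   uint32_t a_min = aa & ~qaa, b_min = bb & ~qbb;
   uint32_t a_max = aa |  qaa, b_max = bb |  qbb;
   return (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max));
}
int main ( void )
{
   // 0x0F (fully defined) plus a value whose bit 0 is undefined: the
   // possible carry ripples, so bits 0..4 are flagged (prints 0000001f).
   printf("%08x\n", model_add32_vbits(0x0F, 0, 0x00, 0x01));
   return 0;
}
#endif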
2294 static
2295 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2296 IRAtom* atom, IRAtom* vatom )
2298 IRType ty;
2299 IROp xorOp, subOp, andOp;
2300 IRExpr *one;
2301 IRAtom *improver, *improved;
2302 tl_assert(isShadowAtom(mce,vatom));
2303 tl_assert(isOriginalAtom(mce,atom));
2304 tl_assert(sameKindedAtoms(atom,vatom));
2306 switch (czop) {
2307 case Iop_Ctz32: case Iop_CtzNat32:
2308 ty = Ity_I32;
2309 xorOp = Iop_Xor32;
2310 subOp = Iop_Sub32;
2311 andOp = Iop_And32;
2312 one = mkU32(1);
2313 break;
2314 case Iop_Ctz64: case Iop_CtzNat64:
2315 ty = Ity_I64;
2316 xorOp = Iop_Xor64;
2317 subOp = Iop_Sub64;
2318 andOp = Iop_And64;
2319 one = mkU64(1);
2320 break;
2321 default:
2322 ppIROp(czop);
2323 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2326 // improver = atom ^ (atom - 1)
2328 // That is, improver has its low ctz(atom)+1 bits equal to one;
2329 // higher bits (if any) equal to zero. So it's exactly the right
2330 // mask to use to remove the irrelevant undefined input bits.
2331 /* Here are some examples:
2332 atom = U...U 1 0...0
2333 atom-1 = U...U 0 1...1
2334 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2335 actually influence the result
2336 A boundary case
2337 atom = 0...0
2338 atom-1 = 1...1
2339 ^ed = 11111, also a correct mask for the input: all input bits
2340 are relevant
2341 Another boundary case
2342 atom = 1..1 1
2343 atom-1 = 1..1 0
2344 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2345 is relevant
2346 Now with misc U bits interspersed:
2347 atom = U...U 1 0 U...U 0 1 0...0
2348 atom-1 = U...U 1 0 U...U 0 0 1...1
2349 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2350 (Per re-check/analysis of 14 Nov 2018)
2352 improver = assignNew('V', mce,ty,
2353 binop(xorOp,
2354 atom,
2355 assignNew('V', mce, ty,
2356 binop(subOp, atom, one))));
2358 // improved = vatom & improver
2360 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2361 // bits as "defined".
2362 improved = assignNew('V', mce, ty,
2363 binop(andOp, vatom, improver));
2365 // Return pessimizing cast of improved.
2366 return mkPCastTo(mce, ty, improved);
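/* Purely illustrative sketch (not tool code, and not compiled) of the
   ctz improver above, on 32 bits with a set V bit meaning "undefined":
   only the rightmost 1-bit of the input and the zeroes below it can
   influence ctz, so V bits above that point are masked away before the
   final pessimising cast.  The model_ name is hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_ctz32_vbits ( uint32_t atom, uint32_t vatom )
{
   uint32_t improver = atom ^ (atom - 1);   // low ctz(atom)+1 bits set
   uint32_t improved = vatom & improver;    // drop irrelevant V bits
   return improved ? 0xFFFFFFFFu : 0;       // PCast to the result
}
#endif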
2369 static
2370 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2371 IRAtom* atom, IRAtom* vatom )
2373 IRType ty;
2374 IROp shrOp, notOp, andOp;
2375 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2376 IRAtom *improver, *improved;
2377 tl_assert(isShadowAtom(mce,vatom));
2378 tl_assert(isOriginalAtom(mce,atom));
2379 tl_assert(sameKindedAtoms(atom,vatom));
2381 switch (czop) {
2382 case Iop_Clz32: case Iop_ClzNat32:
2383 ty = Ity_I32;
2384 shrOp = Iop_Shr32;
2385 notOp = Iop_Not32;
2386 andOp = Iop_And32;
2387 mkRight = mkRight32;
2388 break;
2389 case Iop_Clz64: case Iop_ClzNat64:
2390 ty = Ity_I64;
2391 shrOp = Iop_Shr64;
2392 notOp = Iop_Not64;
2393 andOp = Iop_And64;
2394 mkRight = mkRight64;
2395 break;
2396 default:
2397 ppIROp(czop);
2398 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2401 // This is in principle very similar to how expensiveCountTrailingZeroes
2402 // works. That function computed an "improver", which it used to mask
2403 // off all but the rightmost 1-bit and the zeroes to the right of it,
2404 // hence removing irrelevant bits from the input. Here, we play the
2405 // exact same game but with the left-vs-right roles interchanged.
2406 // Unfortunately calculation of the improver in this case is
2407 // significantly more expensive.
2409 // improver = ~(RIGHT(atom) >>u 1)
2411 // That is, improver has its upper clz(atom)+1 bits equal to one;
2412 // lower bits (if any) equal to zero. So it's exactly the right
2413 // mask to use to remove the irrelevant undefined input bits.
2414 /* Here are some examples:
2415 atom = 0...0 1 U...U
2416 R(atom) = 0...0 1 1...1
2417 R(atom) >>u 1 = 0...0 0 1...1
2418 ~(R(atom) >>u 1) = 1...1 1 0...0
2419 which correctly describes which bits of |atom|
2420 actually influence the result
2421 A boundary case
2422 atom = 0...0
2423 R(atom) = 0...0
2424 R(atom) >>u 1 = 0...0
2425 ~(R(atom) >>u 1) = 1...1
2426 also a correct mask for the input: all input bits
2427 are relevant
2428 Another boundary case
2429 atom = 1 1..1
2430 R(atom) = 1 1..1
2431 R(atom) >>u 1 = 0 1..1
2432 ~(R(atom) >>u 1) = 1 0..0
2433 also a correct mask: only the leftmost input bit
2434 is relevant
2435 Now with misc U bits interspersed:
2436 atom = 0...0 1 U...U 0 1 U...U
2437 R(atom) = 0...0 1 1...1 1 1 1...1
2438 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2439 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2440 (Per initial implementation of 15 Nov 2018)
2442 improver = mkRight(mce, atom);
2443 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2444 improver = assignNew('V', mce, ty, unop(notOp, improver));
2446 // improved = vatom & improver
2448 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2449 // bits as "defined".
2450 improved = assignNew('V', mce, ty,
2451 binop(andOp, vatom, improver));
2453 // Return pessimizing cast of improved.
2454 return mkPCastTo(mce, ty, improved);
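/* Purely illustrative sketch (not tool code, and not compiled) of the
   clz improver above.  RIGHT(x) is modelled here by smearing the
   topmost 1-bit of x rightwards, which is the behaviour the worked
   examples above rely on.  The model_ names are hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_right32 ( uint32_t x )
{
   x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
   x |= x >> 8;  x |= x >> 16;
   return x;                                // topmost 1 smeared down to bit 0
}
static uint32_t model_clz32_vbits ( uint32_t atom, uint32_t vatom )
{
   uint32_t improver = ~(model_right32(atom) >> 1);
   uint32_t improved = vatom & improver;    // keep only relevant V bits
   return improved ? 0xFFFFFFFFu : 0;       // PCast to the result
}
#endif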
2458 /*------------------------------------------------------------*/
2459 /*--- Scalar shifts. ---*/
2460 /*------------------------------------------------------------*/
2462 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2463 idea is to shift the definedness bits by the original shift amount.
2464 This introduces 0s ("defined") in new positions for left shifts and
2465 unsigned right shifts, and copies the top definedness bit for
2466 signed right shifts. So, conveniently, applying the original shift
2467 operator to the definedness bits for the left arg is exactly the
2468 right thing to do:
2470 (qaa << bb)
2472 However if the shift amount is undefined then the whole result
2473 is undefined. Hence need:
2475 (qaa << bb) `UifU` PCast(qbb)
2477 If the shift amount bb is a literal then qbb will say 'all defined'
2478 and the UifU and PCast will get folded out by post-instrumentation
2479 optimisation.
2481 static IRAtom* scalarShift ( MCEnv* mce,
2482 IRType ty,
2483 IROp original_op,
2484 IRAtom* qaa, IRAtom* qbb,
2485 IRAtom* aa, IRAtom* bb )
2487 tl_assert(isShadowAtom(mce,qaa));
2488 tl_assert(isShadowAtom(mce,qbb));
2489 tl_assert(isOriginalAtom(mce,aa));
2490 tl_assert(isOriginalAtom(mce,bb));
2491 tl_assert(sameKindedAtoms(qaa,aa));
2492 tl_assert(sameKindedAtoms(qbb,bb));
2493 return
2494 assignNew(
2495 'V', mce, ty,
2496 mkUifU( mce, ty,
2497 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2498 mkPCastTo(mce, ty, qbb)
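/* Purely illustrative sketch (not tool code, and not compiled) of the
   scalar shift rule above, for a 32-bit left shift with a set V bit
   meaning "undefined".  Assumes bb < 32.  Hypothetical name. */
#if 0
#include <stdint.h>
static uint32_t model_shl32_vbits ( uint32_t qaa, uint8_t bb, uint8_t qbb )
{
   uint32_t shifted   = qaa << bb;               // (qaa << bb)
   uint32_t pcast_qbb = qbb ? 0xFFFFFFFFu : 0;   // PCast(qbb)
   return shifted | pcast_qbb;                   // ... `UifU` PCast(qbb)
}
#endif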
2504 /*------------------------------------------------------------*/
2505 /*--- Helpers for dealing with vector primops. ---*/
2506 /*------------------------------------------------------------*/
2508 /* Vector pessimisation -- pessimise within each lane individually. */
2510 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2512 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2515 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2517 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2520 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2522 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2525 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2527 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2530 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2532 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2535 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2537 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2540 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2542 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2545 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2547 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2550 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2552 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2555 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2557 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2560 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2562 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2565 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2567 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2570 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2572 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2575 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2577 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2581 /* Here's a simple scheme capable of handling ops derived from SSE1
2582 code and while only generating ops that can be efficiently
2583 implemented in SSE1. */
2585 /* All-lanes versions are straightforward:
2587 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2589 unary32Fx4(x) ==> PCast32x4(x#)
2591 Lowest-lane-only versions are more complex:
2593 binary32F0x4(x,y) ==> SetV128lo32(
2594 x#,
2595 PCast32(V128to32(UifUV128(x#,y#)))
2598 This is perhaps not so obvious. In particular, it's faster to
2599 do a V128-bit UifU and then take the bottom 32 bits than the more
2600 obvious scheme of taking the bottom 32 bits of each operand
2601 and doing a 32-bit UifU. Basically this is because UifU is fast and
2602 chopping lanes off vector values is slow.
2604 Finally:
2606 unary32F0x4(x) ==> SetV128lo32(
2607 x#,
2608 PCast32(V128to32(x#))
2611 Where:
2613 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2614 PCast32x4(v#) = CmpNEZ32x4(v#)
2617 static
2618 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2620 IRAtom* at;
2621 tl_assert(isShadowAtom(mce, vatomX));
2622 tl_assert(isShadowAtom(mce, vatomY));
2623 at = mkUifUV128(mce, vatomX, vatomY);
2624 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2625 return at;
2628 static
2629 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2631 IRAtom* at;
2632 tl_assert(isShadowAtom(mce, vatomX));
2633 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2634 return at;
2637 static
2638 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2640 IRAtom* at;
2641 tl_assert(isShadowAtom(mce, vatomX));
2642 tl_assert(isShadowAtom(mce, vatomY));
2643 at = mkUifUV128(mce, vatomX, vatomY);
2644 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2645 at = mkPCastTo(mce, Ity_I32, at);
2646 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2647 return at;
2650 static
2651 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2653 IRAtom* at;
2654 tl_assert(isShadowAtom(mce, vatomX));
2655 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2656 at = mkPCastTo(mce, Ity_I32, at);
2657 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2658 return at;
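/* Purely illustrative sketch (not tool code, and not compiled) of the
   lowest-lane-only scheme above: the whole 128-bit shadows are UifU'd,
   the bottom 32 bits are PCast'd, and the result is x's shadow with
   just its bottom lane replaced.  Types and names are hypothetical. */
#if 0
#include <stdint.h>
typedef struct { uint32_t lane[4]; } ModelV128;  // lane[0] == bits 31:0
static ModelV128 model_binary32F0x4_vbits ( ModelV128 vx, ModelV128 vy )
{
   uint32_t lo = vx.lane[0] | vy.lane[0];    // V128to32(UifUV128(x#,y#))
   vx.lane[0]  = lo ? 0xFFFFFFFFu : 0;       // PCast32, then SetV128lo32
   return vx;                                // upper lanes: x# unchanged
}
#endif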
2661 /* --- ... and ... 64Fx2 versions of the same ... --- */
2663 static
2664 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2666 IRAtom* at;
2667 tl_assert(isShadowAtom(mce, vatomX));
2668 tl_assert(isShadowAtom(mce, vatomY));
2669 at = mkUifUV128(mce, vatomX, vatomY);
2670 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2671 return at;
2674 static
2675 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2677 IRAtom* at;
2678 tl_assert(isShadowAtom(mce, vatomX));
2679 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2680 return at;
2683 static
2684 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2686 IRAtom* at;
2687 tl_assert(isShadowAtom(mce, vatomX));
2688 tl_assert(isShadowAtom(mce, vatomY));
2689 at = mkUifUV128(mce, vatomX, vatomY);
2690 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2691 at = mkPCastTo(mce, Ity_I64, at);
2692 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2693 return at;
2696 static
2697 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2699 IRAtom* at;
2700 tl_assert(isShadowAtom(mce, vatomX));
2701 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2702 at = mkPCastTo(mce, Ity_I64, at);
2703 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2704 return at;
2707 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2709 static
2710 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2712 IRAtom* at;
2713 tl_assert(isShadowAtom(mce, vatomX));
2714 tl_assert(isShadowAtom(mce, vatomY));
2715 at = mkUifUV128(mce, vatomX, vatomY);
2716 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2717 return at;
2720 static
2721 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2723 IRAtom* at;
2724 tl_assert(isShadowAtom(mce, vatomX));
2725 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2726 return at;
2729 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2730 implemented.
2733 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2735 static
2736 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2738 IRAtom* at;
2739 tl_assert(isShadowAtom(mce, vatomX));
2740 tl_assert(isShadowAtom(mce, vatomY));
2741 at = mkUifU64(mce, vatomX, vatomY);
2742 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2743 return at;
2746 static
2747 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2749 IRAtom* at;
2750 tl_assert(isShadowAtom(mce, vatomX));
2751 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2752 return at;
2755 /* --- ... and ... 64Fx4 versions of the same ... --- */
2757 static
2758 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV256(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2765 return at;
2768 static
2769 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2771 IRAtom* at;
2772 tl_assert(isShadowAtom(mce, vatomX));
2773 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2774 return at;
2777 /* --- ... and ... 32Fx8 versions of the same ... --- */
2779 static
2780 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2782 IRAtom* at;
2783 tl_assert(isShadowAtom(mce, vatomX));
2784 tl_assert(isShadowAtom(mce, vatomY));
2785 at = mkUifUV256(mce, vatomX, vatomY);
2786 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2787 return at;
2790 static
2791 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2793 IRAtom* at;
2794 tl_assert(isShadowAtom(mce, vatomX));
2795 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2796 return at;
2799 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2801 static
2802 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2803 IRAtom* vatomX, IRAtom* vatomY )
2805 /* This is the same as binary64Fx2, except that we subsequently
2806 pessimise vRM (definedness of the rounding mode), widen to 128
2807 bits and UifU it into the result. As with the scalar cases, if
2808 the RM is a constant then it is defined and so this extra bit
2809 will get constant-folded out later. */
2810 // "do" the vector args
2811 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2812 // PCast the RM, and widen it to 128 bits
2813 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2814 // Roll it into the result
2815 t1 = mkUifUV128(mce, t1, t2);
2816 return t1;
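/* Purely illustrative sketch (not tool code, and not compiled) of the
   rounding-mode handling above: the lanes are treated as in
   binary64Fx2, and any undefinedness in the rounding mode makes the
   whole result undefined via the widening PCast plus UifU.  The types
   and names below are hypothetical. */
#if 0
#include <stdint.h>
typedef struct { uint64_t lane[2]; } ModelV128x64;
static ModelV128x64 model_binary64Fx2_w_rm_vbits ( uint32_t vRM,
                                                   ModelV128x64 vx,
                                                   ModelV128x64 vy )
{
   uint64_t rm_smear = vRM ? ~0ULL : 0;      // PCastTo(Ity_V128, vRM), per lane
   for (int i = 0; i < 2; i++) {
      uint64_t lane = vx.lane[i] | vy.lane[i];        // UifUV128
      vx.lane[i] = (lane ? ~0ULL : 0) | rm_smear;     // PCast64x2, UifU the RM
   }
   return vx;
}
#endif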
2819 /* --- ... and ... 32Fx4 versions of the same --- */
2821 static
2822 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2823 IRAtom* vatomX, IRAtom* vatomY )
2825 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2826 // PCast the RM, and widen it to 128 bits
2827 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2828 // Roll it into the result
2829 t1 = mkUifUV128(mce, t1, t2);
2830 return t1;
2833 /* --- ... and ... 64Fx4 versions of the same --- */
2835 static
2836 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2837 IRAtom* vatomX, IRAtom* vatomY )
2839 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2840 // PCast the RM, and widen it to 256 bits
2841 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2842 // Roll it into the result
2843 t1 = mkUifUV256(mce, t1, t2);
2844 return t1;
2847 /* --- ... and ... 16Fx8 versions of the same --- */
2849 static
2850 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2851 IRAtom* vatomX, IRAtom* vatomY )
2853 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2854 // PCast the RM, and widen it to 128 bits
2855 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2856 // Roll it into the result
2857 t1 = mkUifUV128(mce, t1, t2);
2858 return t1;
2861 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2862 implemented.
2865 /* --- ... and ... 32Fx8 versions of the same --- */
2867 static
2868 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2869 IRAtom* vatomX, IRAtom* vatomY )
2871 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2872 // PCast the RM, and widen it to 256 bits
2873 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2874 // Roll it into the result
2875 t1 = mkUifUV256(mce, t1, t2);
2876 return t1;
2879 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2881 static
2882 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2884 /* Same scheme as binary64Fx2_w_rm. */
2885 // "do" the vector arg
2886 IRAtom* t1 = unary64Fx2(mce, vatomX);
2887 // PCast the RM, and widen it to 128 bits
2888 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2889 // Roll it into the result
2890 t1 = mkUifUV128(mce, t1, t2);
2891 return t1;
2894 /* --- ... and ... 32Fx4 versions of the same --- */
2896 static
2897 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2899 /* Same scheme as binary32Fx4_w_rm. */
2900 IRAtom* t1 = unary32Fx4(mce, vatomX);
2901 // PCast the RM, and widen it to 128 bits
2902 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2903 // Roll it into the result
2904 t1 = mkUifUV128(mce, t1, t2);
2905 return t1;
2908 /* --- ... and ... 16Fx8 versions of the same --- */
2910 static
2911 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2913 /* Same scheme as binary16Fx8_w_rm. */
2914 IRAtom* t1 = unary16Fx8(mce, vatomX);
2915 // PCast the RM, and widen it to 128 bits
2916 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2917 // Roll it into the result
2918 t1 = mkUifUV128(mce, t1, t2);
2919 return t1;
2922 /* --- ... and ... 32Fx8 versions of the same --- */
2924 static
2925 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2927 /* Same scheme as binary32Fx8_w_rm. */
2928 IRAtom* t1 = unary32Fx8(mce, vatomX);
2929 // PCast the RM, and widen it to 256 bits
2930 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2931 // Roll it into the result
2932 t1 = mkUifUV256(mce, t1, t2);
2933 return t1;
2937 /* --- --- Vector saturated narrowing --- --- */
2939 /* We used to do something very clever here, but on closer inspection
2940 (2011-Jun-15), and in particular bug #279698, it turns out to be
2941 wrong. Part of the problem came from the fact that for a long
2942 time, the IR primops to do with saturated narrowing were
2943 underspecified and managed to confuse multiple cases which needed
2944 to be separate: the op names had a signedness qualifier, but in
2945 fact the source and destination signednesses needed to be specified
2946 independently, so the op names really need two independent
2947 signedness specifiers.
2949 As of 2011-Jun-15 (ish) the underspecification was sorted out
2950 properly. The incorrect instrumentation remained, though. That
2951 has now (2011-Oct-22) been fixed.
2953 What we now do is simple:
2955 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2956 number of lanes, X is the source lane width and signedness, and Y
2957 is the destination lane width and signedness. In all cases the
2958 destination lane width is half the source lane width, so the names
2959 have a bit of redundancy, but are at least easy to read.
2961 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2962 to unsigned 16s.
2964 Let Vanilla(OP) be a function that takes OP, one of these
2965 saturating narrowing ops, and produces the same "shaped" narrowing
2966 op which is not saturating, but merely dumps the most significant
2967 bits. "same shape" means that the lane numbers and widths are the
2968 same as with OP.
2970 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2971 = Iop_NarrowBin32to16x8,
2972 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2973 dumping the top half of each lane.
2975 So, with that in place, the scheme is simple, and it is simple to
2976 pessimise each lane individually and then apply Vanilla(OP) so as
2977 to get the result in the right "shape". If the original OP is
2978 QNarrowBinXtoYxZ then we produce
2980 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2982 or for the case when OP is unary (Iop_QNarrowUn*)
2984 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2986 static
2987 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2989 switch (qnarrowOp) {
2990 /* Binary: (128, 128) -> 128 */
2991 case Iop_QNarrowBin16Sto8Ux16:
2992 case Iop_QNarrowBin16Sto8Sx16:
2993 case Iop_QNarrowBin16Uto8Ux16:
2994 case Iop_QNarrowBin64Sto32Sx4:
2995 case Iop_QNarrowBin64Uto32Ux4:
2996 return Iop_NarrowBin16to8x16;
2997 case Iop_QNarrowBin32Sto16Ux8:
2998 case Iop_QNarrowBin32Sto16Sx8:
2999 case Iop_QNarrowBin32Uto16Ux8:
3000 return Iop_NarrowBin32to16x8;
3001 /* Binary: (64, 64) -> 64 */
3002 case Iop_QNarrowBin32Sto16Sx4:
3003 return Iop_NarrowBin32to16x4;
3004 case Iop_QNarrowBin16Sto8Ux8:
3005 case Iop_QNarrowBin16Sto8Sx8:
3006 return Iop_NarrowBin16to8x8;
3007 /* Unary: 128 -> 64 */
3008 case Iop_QNarrowUn64Uto32Ux2:
3009 case Iop_QNarrowUn64Sto32Sx2:
3010 case Iop_QNarrowUn64Sto32Ux2:
3011 return Iop_NarrowUn64to32x2;
3012 case Iop_QNarrowUn32Uto16Ux4:
3013 case Iop_QNarrowUn32Sto16Sx4:
3014 case Iop_QNarrowUn32Sto16Ux4:
3015 case Iop_F32toF16x4_DEP:
3016 return Iop_NarrowUn32to16x4;
3017 case Iop_QNarrowUn16Uto8Ux8:
3018 case Iop_QNarrowUn16Sto8Sx8:
3019 case Iop_QNarrowUn16Sto8Ux8:
3020 return Iop_NarrowUn16to8x8;
3021 default:
3022 ppIROp(qnarrowOp);
3023 VG_(tool_panic)("vanillaNarrowingOpOfShape");
3027 static
3028 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3029 IRAtom* vatom1, IRAtom* vatom2)
3031 IRAtom *at1, *at2, *at3;
3032 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3033 switch (narrow_op) {
3034 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
3035 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
3036 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3037 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3038 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3039 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3040 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3041 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3042 default: VG_(tool_panic)("vectorNarrowBinV128");
3044 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3045 tl_assert(isShadowAtom(mce,vatom1));
3046 tl_assert(isShadowAtom(mce,vatom2));
3047 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3048 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3049 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3050 return at3;
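/* Purely illustrative sketch (not tool code, and not compiled) of the
   saturated-narrowing scheme above, for the 32-to-16, 8-lane shape:
   pessimise each 32-bit source lane of both shadow operands, then
   narrow by keeping the low 16 bits of each lane, as Vanilla(OP)
   would.  Which operand feeds which half of the result mirrors the
   vanilla op and does not matter here, since every lane is already
   PCast'd.  Hypothetical names. */
#if 0
#include <stdint.h>
static void model_qnarrowbin32to16x8_vbits ( const uint32_t vsrc1[4],
                                             const uint32_t vsrc2[4],
                                             uint16_t vdst[8] )
{
   for (int i = 0; i < 4; i++) {
      vdst[i]     = vsrc1[i] ? 0xFFFF : 0;   // PCast32x4 lane, low half kept
      vdst[i + 4] = vsrc2[i] ? 0xFFFF : 0;
   }
}
#endif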
3053 static
3054 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3055 IRAtom* vatom1, IRAtom* vatom2)
3057 IRAtom *at1, *at2, *at3;
3058 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3059 switch (narrow_op) {
3060 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3061 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3062 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3063 default: VG_(tool_panic)("vectorNarrowBin64");
3065 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3066 tl_assert(isShadowAtom(mce,vatom1));
3067 tl_assert(isShadowAtom(mce,vatom2));
3068 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3069 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3070 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3071 return at3;
3074 static
3075 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3076 IRAtom* vatom1)
3078 IRAtom *at1, *at2;
3079 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3080 tl_assert(isShadowAtom(mce,vatom1));
3081 /* For vanilla narrowing (non-saturating), we can just apply
3082 the op directly to the V bits. */
3083 switch (narrow_op) {
3084 case Iop_NarrowUn16to8x8:
3085 case Iop_NarrowUn32to16x4:
3086 case Iop_NarrowUn64to32x2:
3087 case Iop_F32toF16x4_DEP:
3088 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3089 return at1;
3090 default:
3091 break; /* Do Plan B */
3093 /* Plan B: for ops that involve a saturation operation on the args,
3094 we must PCast before the vanilla narrow. */
3095 switch (narrow_op) {
3096 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3097 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3098 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3099 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3100 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3101 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3102 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3103 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3104 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3105 default: VG_(tool_panic)("vectorNarrowUnV128");
3107 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3108 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3109 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3110 return at2;
3113 static
3114 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3115 IRAtom* vatom1)
3117 IRAtom *at1, *at2;
3118 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3119 switch (longen_op) {
3120 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3121 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3122 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3123 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3124 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3125 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3126 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3127 default: VG_(tool_panic)("vectorWidenI64");
3129 tl_assert(isShadowAtom(mce,vatom1));
3130 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3131 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3132 return at2;
3136 /* --- --- Vector integer arithmetic --- --- */
3138 /* Simple ... UifU the args and per-lane pessimise the results. */
3140 /* --- V256-bit versions --- */
3142 static
3143 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3145 IRAtom* at;
3146 at = mkUifUV256(mce, vatom1, vatom2);
3147 at = mkPCast8x32(mce, at);
3148 return at;
3151 static
3152 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3154 IRAtom* at;
3155 at = mkUifUV256(mce, vatom1, vatom2);
3156 at = mkPCast16x16(mce, at);
3157 return at;
3160 static
3161 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3163 IRAtom* at;
3164 at = mkUifUV256(mce, vatom1, vatom2);
3165 at = mkPCast32x8(mce, at);
3166 return at;
3169 static
3170 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3172 IRAtom* at;
3173 at = mkUifUV256(mce, vatom1, vatom2);
3174 at = mkPCast64x4(mce, at);
3175 return at;
3178 /* --- V128-bit versions --- */
3180 static
3181 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3183 IRAtom* at;
3184 at = mkUifUV128(mce, vatom1, vatom2);
3185 at = mkPCast8x16(mce, at);
3186 return at;
3189 static
3190 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3192 IRAtom* at;
3193 at = mkUifUV128(mce, vatom1, vatom2);
3194 at = mkPCast16x8(mce, at);
3195 return at;
3198 static
3199 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3201 IRAtom* at;
3202 at = mkUifUV128(mce, vatom1, vatom2);
3203 at = mkPCast32x4(mce, at);
3204 return at;
3207 static
3208 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3210 IRAtom* at;
3211 at = mkUifUV128(mce, vatom1, vatom2);
3212 at = mkPCast64x2(mce, at);
3213 return at;
3216 static
3217 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3219 IRAtom* at;
3220 at = mkUifUV128(mce, vatom1, vatom2);
3221 at = mkPCast128x1(mce, at);
3222 return at;
3225 /* --- 64-bit versions --- */
3227 static
3228 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3230 IRAtom* at;
3231 at = mkUifU64(mce, vatom1, vatom2);
3232 at = mkPCast8x8(mce, at);
3233 return at;
3236 static
3237 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3239 IRAtom* at;
3240 at = mkUifU64(mce, vatom1, vatom2);
3241 at = mkPCast16x4(mce, at);
3242 return at;
3245 static
3246 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3248 IRAtom* at;
3249 at = mkUifU64(mce, vatom1, vatom2);
3250 at = mkPCast32x2(mce, at);
3251 return at;
3254 static
3255 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3257 IRAtom* at;
3258 at = mkUifU64(mce, vatom1, vatom2);
3259 at = mkPCastTo(mce, Ity_I64, at);
3260 return at;
3263 /* --- 32-bit versions --- */
3265 static
3266 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3268 IRAtom* at;
3269 at = mkUifU32(mce, vatom1, vatom2);
3270 at = mkPCast8x4(mce, at);
3271 return at;
3274 static
3275 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3277 IRAtom* at;
3278 at = mkUifU32(mce, vatom1, vatom2);
3279 at = mkPCast16x2(mce, at);
3280 return at;
3284 /*------------------------------------------------------------*/
3285 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3286 /*------------------------------------------------------------*/
3288 static
3289 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3290 IROp op,
3291 IRAtom* atom1, IRAtom* atom2,
3292 IRAtom* atom3, IRAtom* atom4 )
3294 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3295 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3296 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3297 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3299 tl_assert(isOriginalAtom(mce,atom1));
3300 tl_assert(isOriginalAtom(mce,atom2));
3301 tl_assert(isOriginalAtom(mce,atom3));
3302 tl_assert(isOriginalAtom(mce,atom4));
3303 tl_assert(isShadowAtom(mce,vatom1));
3304 tl_assert(isShadowAtom(mce,vatom2));
3305 tl_assert(isShadowAtom(mce,vatom3));
3306 tl_assert(isShadowAtom(mce,vatom4));
3307 tl_assert(sameKindedAtoms(atom1,vatom1));
3308 tl_assert(sameKindedAtoms(atom2,vatom2));
3309 tl_assert(sameKindedAtoms(atom3,vatom3));
3310 tl_assert(sameKindedAtoms(atom4,vatom4));
3311 switch (op) {
3312 case Iop_MAddF64:
3313 case Iop_MAddF64r32:
3314 case Iop_MSubF64:
3315 case Iop_MSubF64r32:
3316 /* I32(rm) x F64 x F64 x F64 -> F64 */
3317 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3319 case Iop_MAddF32:
3320 case Iop_MSubF32:
3321 /* I32(rm) x F32 x F32 x F32 -> F32 */
3322 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3324 case Iop_MAddF128:
3325 case Iop_MSubF128:
3326 case Iop_NegMAddF128:
3327 case Iop_NegMSubF128:
3328 /* I32(rm) x F128 x F128 x F128 -> F128 */
3329 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3331 /* V256-bit data-steering */
3332 case Iop_64x4toV256:
3333 return assignNew('V', mce, Ity_V256,
3334 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3336 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3337 case Iop_Rotx32:
3338 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3339 case Iop_Rotx64:
3340 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3341 default:
3342 ppIROp(op);
3343 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3348 static
3349 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3350 IROp op,
3351 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3353 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3354 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3355 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3357 tl_assert(isOriginalAtom(mce,atom1));
3358 tl_assert(isOriginalAtom(mce,atom2));
3359 tl_assert(isOriginalAtom(mce,atom3));
3360 tl_assert(isShadowAtom(mce,vatom1));
3361 tl_assert(isShadowAtom(mce,vatom2));
3362 tl_assert(isShadowAtom(mce,vatom3));
3363 tl_assert(sameKindedAtoms(atom1,vatom1));
3364 tl_assert(sameKindedAtoms(atom2,vatom2));
3365 tl_assert(sameKindedAtoms(atom3,vatom3));
3366 switch (op) {
3367 case Iop_AddF128:
3368 case Iop_SubF128:
3369 case Iop_MulF128:
3370 case Iop_DivF128:
3371 case Iop_AddD128:
3372 case Iop_SubD128:
3373 case Iop_MulD128:
3374 case Iop_DivD128:
3375 case Iop_QuantizeD128:
3376 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3377 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3378 case Iop_AddF64:
3379 case Iop_AddD64:
3380 case Iop_AddF64r32:
3381 case Iop_SubF64:
3382 case Iop_SubD64:
3383 case Iop_SubF64r32:
3384 case Iop_MulF64:
3385 case Iop_MulD64:
3386 case Iop_MulF64r32:
3387 case Iop_DivF64:
3388 case Iop_DivD64:
3389 case Iop_DivF64r32:
3390 case Iop_ScaleF64:
3391 case Iop_Yl2xF64:
3392 case Iop_Yl2xp1F64:
3393 case Iop_AtanF64:
3394 case Iop_PRemF64:
3395 case Iop_PRem1F64:
3396 case Iop_QuantizeD64:
3397 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3398 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3399 case Iop_PRemC3210F64:
3400 case Iop_PRem1C3210F64:
3401 /* I32(rm) x F64 x F64 -> I32 */
3402 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3403 case Iop_AddF32:
3404 case Iop_SubF32:
3405 case Iop_MulF32:
3406 case Iop_DivF32:
3407 /* I32(rm) x F32 x F32 -> F32 */
3408 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3409 case Iop_AddF16:
3410 case Iop_SubF16:
3411 /* I32(rm) x F16 x F16 -> F16 */
3412 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3413 case Iop_SignificanceRoundD64:
3414 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3415 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3416 case Iop_SignificanceRoundD128:
3417 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3418 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3419 case Iop_SliceV128:
3420 /* (V128, V128, I8) -> V128 */
3421 complainIfUndefined(mce, atom3, NULL);
3422 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3423 case Iop_Slice64:
3424 /* (I64, I64, I8) -> I64 */
3425 complainIfUndefined(mce, atom3, NULL);
3426 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3427 case Iop_SetElem8x8:
3428 case Iop_SetElem16x4:
3429 case Iop_SetElem32x2:
3430 complainIfUndefined(mce, atom2, NULL);
3431 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3433 case Iop_SetElem8x16:
3434 case Iop_SetElem16x8:
3435 case Iop_SetElem32x4:
3436 case Iop_SetElem64x2:
3437 complainIfUndefined(mce, atom2, NULL);
3438 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3440 /* Int 128-bit Integer three arg */
3441 case Iop_2xMultU64Add128CarryOut:
3442 case Iop_Perm8x16x2:
3443 /* (V128, V128, V128) -> V128 */
3444 complainIfUndefined(mce, atom3, NULL);
3445 return mkUifUV128(
3446 mce,
3447 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3448 mkPCast8x16(mce, vatom3)
3451 /* Vector FP with rounding mode as the first arg */
3452 case Iop_Add64Fx2:
3453 case Iop_Sub64Fx2:
3454 case Iop_Mul64Fx2:
3455 case Iop_Div64Fx2:
3456 case Iop_Scale2_64Fx2:
3457 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3459 case Iop_Add32Fx4:
3460 case Iop_Sub32Fx4:
3461 case Iop_Mul32Fx4:
3462 case Iop_Div32Fx4:
3463 case Iop_Scale2_32Fx4:
3464 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3466 case Iop_Add64Fx4:
3467 case Iop_Sub64Fx4:
3468 case Iop_Mul64Fx4:
3469 case Iop_Div64Fx4:
3470 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3472 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3473 IR is implemented.
3475 case Iop_Add16Fx8:
3476 case Iop_Sub16Fx8:
3477 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3479 case Iop_Add32Fx8:
3480 case Iop_Sub32Fx8:
3481 case Iop_Mul32Fx8:
3482 case Iop_Div32Fx8:
3483 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3485 case Iop_F32x4_2toQ16x8:
3486 return assignNew('V', mce, Ity_V128,
3487 binop(Iop_PackEvenLanes16x8,
3488 unary32Fx4_w_rm(mce, vatom1, vatom2),
3489 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3490 case Iop_F64x2_2toQ32x4:
3491 return assignNew('V', mce, Ity_V128,
3492 binop(Iop_PackEvenLanes32x4,
3493 unary64Fx2_w_rm(mce, vatom1, vatom2),
3494 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3496 default:
3497 ppIROp(op);
3498 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3503 static
3504 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3505 IROp op,
3506 IRAtom* atom1, IRAtom* atom2,
3507 HowUsed hu/*use HuOth if unknown*/ )
3509 IRType and_or_ty = Ity_INVALID;
3510 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3511 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3512 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3514 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3515 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3517 tl_assert(isOriginalAtom(mce,atom1));
3518 tl_assert(isOriginalAtom(mce,atom2));
3519 tl_assert(isShadowAtom(mce,vatom1));
3520 tl_assert(isShadowAtom(mce,vatom2));
3521 tl_assert(sameKindedAtoms(atom1,vatom1));
3522 tl_assert(sameKindedAtoms(atom2,vatom2));
3523 switch (op) {
3525 /* 32-bit SIMD */
3527 case Iop_Add16x2:
3528 case Iop_HAdd16Ux2:
3529 case Iop_HAdd16Sx2:
3530 case Iop_Sub16x2:
3531 case Iop_HSub16Ux2:
3532 case Iop_HSub16Sx2:
3533 case Iop_QAdd16Sx2:
3534 case Iop_QSub16Sx2:
3535 case Iop_QSub16Ux2:
3536 case Iop_QAdd16Ux2:
3537 return binary16Ix2(mce, vatom1, vatom2);
3539 case Iop_Add8x4:
3540 case Iop_HAdd8Ux4:
3541 case Iop_HAdd8Sx4:
3542 case Iop_Sub8x4:
3543 case Iop_HSub8Ux4:
3544 case Iop_HSub8Sx4:
3545 case Iop_QSub8Ux4:
3546 case Iop_QAdd8Ux4:
3547 case Iop_QSub8Sx4:
3548 case Iop_QAdd8Sx4:
3549 return binary8Ix4(mce, vatom1, vatom2);
3551 /* 64-bit SIMD */
3553 case Iop_ShrN8x8:
3554 case Iop_ShrN16x4:
3555 case Iop_ShrN32x2:
3556 case Iop_SarN8x8:
3557 case Iop_SarN16x4:
3558 case Iop_SarN32x2:
3559 case Iop_ShlN16x4:
3560 case Iop_ShlN32x2:
3561 case Iop_ShlN8x8:
3562 /* Same scheme as with all other shifts. */
3563 complainIfUndefined(mce, atom2, NULL);
3564 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3566 case Iop_QNarrowBin32Sto16Sx4:
3567 case Iop_QNarrowBin16Sto8Sx8:
3568 case Iop_QNarrowBin16Sto8Ux8:
3569 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3571 case Iop_Min8Ux8:
3572 case Iop_Min8Sx8:
3573 case Iop_Max8Ux8:
3574 case Iop_Max8Sx8:
3575 case Iop_Avg8Ux8:
3576 case Iop_QSub8Sx8:
3577 case Iop_QSub8Ux8:
3578 case Iop_Sub8x8:
3579 case Iop_CmpGT8Sx8:
3580 case Iop_CmpGT8Ux8:
3581 case Iop_CmpEQ8x8:
3582 case Iop_QAdd8Sx8:
3583 case Iop_QAdd8Ux8:
3584 case Iop_QSal8x8:
3585 case Iop_QShl8x8:
3586 case Iop_Add8x8:
3587 case Iop_Mul8x8:
3588 case Iop_PolynomialMul8x8:
3589 return binary8Ix8(mce, vatom1, vatom2);
3591 case Iop_Min16Sx4:
3592 case Iop_Min16Ux4:
3593 case Iop_Max16Sx4:
3594 case Iop_Max16Ux4:
3595 case Iop_Avg16Ux4:
3596 case Iop_QSub16Ux4:
3597 case Iop_QSub16Sx4:
3598 case Iop_Sub16x4:
3599 case Iop_Mul16x4:
3600 case Iop_MulHi16Sx4:
3601 case Iop_MulHi16Ux4:
3602 case Iop_CmpGT16Sx4:
3603 case Iop_CmpGT16Ux4:
3604 case Iop_CmpEQ16x4:
3605 case Iop_QAdd16Sx4:
3606 case Iop_QAdd16Ux4:
3607 case Iop_QSal16x4:
3608 case Iop_QShl16x4:
3609 case Iop_Add16x4:
3610 case Iop_QDMulHi16Sx4:
3611 case Iop_QRDMulHi16Sx4:
3612 return binary16Ix4(mce, vatom1, vatom2);
3614 case Iop_Sub32x2:
3615 case Iop_Mul32x2:
3616 case Iop_Max32Sx2:
3617 case Iop_Max32Ux2:
3618 case Iop_Min32Sx2:
3619 case Iop_Min32Ux2:
3620 case Iop_CmpGT32Sx2:
3621 case Iop_CmpGT32Ux2:
3622 case Iop_CmpEQ32x2:
3623 case Iop_Add32x2:
3624 case Iop_QAdd32Ux2:
3625 case Iop_QAdd32Sx2:
3626 case Iop_QSub32Ux2:
3627 case Iop_QSub32Sx2:
3628 case Iop_QSal32x2:
3629 case Iop_QShl32x2:
3630 case Iop_QDMulHi32Sx2:
3631 case Iop_QRDMulHi32Sx2:
3632 return binary32Ix2(mce, vatom1, vatom2);
3634 case Iop_QSub64Ux1:
3635 case Iop_QSub64Sx1:
3636 case Iop_QAdd64Ux1:
3637 case Iop_QAdd64Sx1:
3638 case Iop_QSal64x1:
3639 case Iop_QShl64x1:
3640 case Iop_Sal64x1:
3641 return binary64Ix1(mce, vatom1, vatom2);
3643 case Iop_QShlNsatSU8x8:
3644 case Iop_QShlNsatUU8x8:
3645 case Iop_QShlNsatSS8x8:
3646 complainIfUndefined(mce, atom2, NULL);
3647 return mkPCast8x8(mce, vatom1);
3649 case Iop_QShlNsatSU16x4:
3650 case Iop_QShlNsatUU16x4:
3651 case Iop_QShlNsatSS16x4:
3652 complainIfUndefined(mce, atom2, NULL);
3653 return mkPCast16x4(mce, vatom1);
3655 case Iop_QShlNsatSU32x2:
3656 case Iop_QShlNsatUU32x2:
3657 case Iop_QShlNsatSS32x2:
3658 complainIfUndefined(mce, atom2, NULL);
3659 return mkPCast32x2(mce, vatom1);
3661 case Iop_QShlNsatSU64x1:
3662 case Iop_QShlNsatUU64x1:
3663 case Iop_QShlNsatSS64x1:
3664 complainIfUndefined(mce, atom2, NULL);
3665 return mkPCast32x2(mce, vatom1);
3667 case Iop_PwMax32Sx2:
3668 case Iop_PwMax32Ux2:
3669 case Iop_PwMin32Sx2:
3670 case Iop_PwMin32Ux2:
3671 case Iop_PwMax32Fx2:
3672 case Iop_PwMin32Fx2:
3673 return assignNew('V', mce, Ity_I64,
3674 binop(Iop_PwMax32Ux2,
3675 mkPCast32x2(mce, vatom1),
3676 mkPCast32x2(mce, vatom2)));
3678 case Iop_PwMax16Sx4:
3679 case Iop_PwMax16Ux4:
3680 case Iop_PwMin16Sx4:
3681 case Iop_PwMin16Ux4:
3682 return assignNew('V', mce, Ity_I64,
3683 binop(Iop_PwMax16Ux4,
3684 mkPCast16x4(mce, vatom1),
3685 mkPCast16x4(mce, vatom2)));
3687 case Iop_PwMax8Sx8:
3688 case Iop_PwMax8Ux8:
3689 case Iop_PwMin8Sx8:
3690 case Iop_PwMin8Ux8:
3691 return assignNew('V', mce, Ity_I64,
3692 binop(Iop_PwMax8Ux8,
3693 mkPCast8x8(mce, vatom1),
3694 mkPCast8x8(mce, vatom2)));
3696 case Iop_PwAdd32x2:
3697 case Iop_PwAdd32Fx2:
3698 return mkPCast32x2(mce,
3699 assignNew('V', mce, Ity_I64,
3700 binop(Iop_PwAdd32x2,
3701 mkPCast32x2(mce, vatom1),
3702 mkPCast32x2(mce, vatom2))));
3704 case Iop_PwAdd16x4:
3705 return mkPCast16x4(mce,
3706 assignNew('V', mce, Ity_I64,
3707 binop(op, mkPCast16x4(mce, vatom1),
3708 mkPCast16x4(mce, vatom2))));
3710 case Iop_PwAdd8x8:
3711 return mkPCast8x8(mce,
3712 assignNew('V', mce, Ity_I64,
3713 binop(op, mkPCast8x8(mce, vatom1),
3714 mkPCast8x8(mce, vatom2))));
3716 case Iop_Shl8x8:
3717 case Iop_Shr8x8:
3718 case Iop_Sar8x8:
3719 case Iop_Sal8x8:
3720 return mkUifU64(mce,
3721 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3722 mkPCast8x8(mce,vatom2)
3725 case Iop_Shl16x4:
3726 case Iop_Shr16x4:
3727 case Iop_Sar16x4:
3728 case Iop_Sal16x4:
3729 return mkUifU64(mce,
3730 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3731 mkPCast16x4(mce,vatom2)
3734 case Iop_Shl32x2:
3735 case Iop_Shr32x2:
3736 case Iop_Sar32x2:
3737 case Iop_Sal32x2:
3738 return mkUifU64(mce,
3739 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3740 mkPCast32x2(mce,vatom2)
3743 /* 64-bit data-steering */
3744 case Iop_InterleaveLO32x2:
3745 case Iop_InterleaveLO16x4:
3746 case Iop_InterleaveLO8x8:
3747 case Iop_InterleaveHI32x2:
3748 case Iop_InterleaveHI16x4:
3749 case Iop_InterleaveHI8x8:
3750 case Iop_CatOddLanes8x8:
3751 case Iop_CatEvenLanes8x8:
3752 case Iop_CatOddLanes16x4:
3753 case Iop_CatEvenLanes16x4:
3754 case Iop_InterleaveOddLanes8x8:
3755 case Iop_InterleaveEvenLanes8x8:
3756 case Iop_InterleaveOddLanes16x4:
3757 case Iop_InterleaveEvenLanes16x4:
3758 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3760 case Iop_GetElem8x8:
3761 complainIfUndefined(mce, atom2, NULL);
3762 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3763 case Iop_GetElem16x4:
3764 complainIfUndefined(mce, atom2, NULL);
3765 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3766 case Iop_GetElem32x2:
3767 complainIfUndefined(mce, atom2, NULL);
3768 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3770 /* Perm8x8: rearrange values in left arg using steering values from
3771 right arg. So rearrange the vbits in the same way but pessimise wrt
3772 steering values. We assume that unused bits in the steering value
3773       are defined zeros, so we can safely PCast within each lane of the
3774 steering value without having to take precautions to avoid a
3775 dependency on those unused bits.
3777 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3778 each lane, if bit 7 of the steering value is zero, then we'll steer
3779 the shadow value exactly as per Perm8x8. If that bit is one, then
3780 the operation will set the resulting (concrete) value to zero. That
3781 means it is defined, and should have a shadow value of zero. Hence
3782 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3783 as Perm8x8) and then pessimise against the steering values. */
3784 case Iop_Perm8x8:
3785 case Iop_PermOrZero8x8:
3786 return mkUifU64(
3787 mce,
3788 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3789 mkPCast8x8(mce, vatom2)
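      /* Editorial sketch (illustration only): a scalar model of the shadow
         value built just above for Iop_Perm8x8, with 1 V-bits meaning
         "undefined".  For each output lane i in 0..7:

            steer = getLane8(atom2, i) & 7;              // concrete steering
            vres  = getLane8(vatom1, steer)              // self-shadowed perm
                    | (getLane8(vatom2, i) ? 0xFF : 0);  // PCast8x8 of steering shadow

         getLane8 is a hypothetical accessor used only for this sketch; the
         real computation is the mkUifU64/mkPCast8x8 combination above. */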
3792 /* V128-bit SIMD */
3794 case Iop_I32StoF32x4:
3795 case Iop_F32toI32Sx4:
3796 case Iop_Sqrt16Fx8:
3797 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3798 case Iop_Sqrt32Fx4:
3799 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3800 case Iop_Sqrt64Fx2:
3801 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3803 case Iop_ShrN8x16:
3804 case Iop_ShrN16x8:
3805 case Iop_ShrN32x4:
3806 case Iop_ShrN64x2:
3807 case Iop_SarN8x16:
3808 case Iop_SarN16x8:
3809 case Iop_SarN32x4:
3810 case Iop_SarN64x2:
3811 case Iop_ShlN8x16:
3812 case Iop_ShlN16x8:
3813 case Iop_ShlN32x4:
3814 case Iop_ShlN64x2:
3815 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3816 this is wrong now, scalar shifts are done properly lazily.
3817 Vector shifts should be fixed too. */
3818 complainIfUndefined(mce, atom2, NULL);
3819 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
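      /* Editorial sketch of the lazier treatment alluded to in the note
         above, modelled on the scalarShift helper used for the integer
         shifts later in this function: rather than complaining eagerly
         about an undefined shift amount, shift the V bits by the concrete
         amount and then fold the shift amount's undefinedness into every
         bit of the result, i.e. something like

            return mkUifUV128(
                      mce,
                      assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                      mkPCastTo(mce, Ity_V128, vatom2));

         This only sketches the idea; it is not a change made here. */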
3821 /* V x V shifts/rotates are done using the standard lazy scheme. */
3822 /* For the non-rounding variants of bi-di vector x vector
3823 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3824 But note that this is overly pessimistic, because in fact only
3825 the bottom 8 bits of each lane of the second argument are taken
3826 into account when shifting. So really we ought to ignore
3827 undefinedness in bits 8 and above of each lane in the
3828 second argument. */
3829 case Iop_Shl8x16:
3830 case Iop_Shr8x16:
3831 case Iop_Sar8x16:
3832 case Iop_Sal8x16:
3833 case Iop_Rol8x16:
3834 case Iop_Sh8Sx16:
3835 case Iop_Sh8Ux16:
3836 return mkUifUV128(mce,
3837 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3838 mkPCast8x16(mce,vatom2)
3841 case Iop_Shl16x8:
3842 case Iop_Shr16x8:
3843 case Iop_Sar16x8:
3844 case Iop_Sal16x8:
3845 case Iop_Rol16x8:
3846 case Iop_Sh16Sx8:
3847 case Iop_Sh16Ux8:
3848 return mkUifUV128(mce,
3849 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3850 mkPCast16x8(mce,vatom2)
3853 case Iop_Shl32x4:
3854 case Iop_Shr32x4:
3855 case Iop_Sar32x4:
3856 case Iop_Sal32x4:
3857 case Iop_Rol32x4:
3858 case Iop_Sh32Sx4:
3859 case Iop_Sh32Ux4:
3860 return mkUifUV128(mce,
3861 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3862 mkPCast32x4(mce,vatom2)
3865 case Iop_Shl64x2:
3866 case Iop_Shr64x2:
3867 case Iop_Sar64x2:
3868 case Iop_Sal64x2:
3869 case Iop_Rol64x2:
3870 case Iop_Sh64Sx2:
3871 case Iop_Sh64Ux2:
3872 return mkUifUV128(mce,
3873 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3874 mkPCast64x2(mce,vatom2)
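      /* Editorial sketch of the refinement suggested above (not applied):
         per lane, only the low 8 bits of the shift vector matter, so a
         tighter scalar model for, say, the 64x2 variants would be

            vres = getLane64(shiftedV, i)                          // op on vatom1
                   | ((getLane64(vatom2, i) & 0xFF) ? ~0ULL : 0);  // low byte only

         where getLane64 and shiftedV are illustrative names.  The code
         above instead pessimises against the whole lane of vatom2. */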
3877 /* For the rounding variants of bi-di vector x vector shifts, the
3878 rounding adjustment can cause undefinedness to propagate through
3879 the entire lane, in the worst case. Too complex to handle
3880 properly .. just UifU the arguments and then PCast them.
3881 Suboptimal but safe. */
3882 case Iop_Rsh8Sx16:
3883 case Iop_Rsh8Ux16:
3884 return binary8Ix16(mce, vatom1, vatom2);
3885 case Iop_Rsh16Sx8:
3886 case Iop_Rsh16Ux8:
3887 return binary16Ix8(mce, vatom1, vatom2);
3888 case Iop_Rsh32Sx4:
3889 case Iop_Rsh32Ux4:
3890 return binary32Ix4(mce, vatom1, vatom2);
3891 case Iop_Rsh64Sx2:
3892 case Iop_Rsh64Ux2:
3893 return binary64Ix2(mce, vatom1, vatom2);
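      /* Editorial note: the binaryNIxM helpers used just above all follow
         the same "UifU then per-lane PCast" shape; binary64Ix2, for
         instance, is conceptually

            IRAtom* at = mkUifUV128(mce, vatom1, vatom2);  // merge undefinedness
            return mkPCast64x2(mce, at);                   // smear across each lane

         (a reminder of the scheme only, not a redefinition of the helper). */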
3895 case Iop_F32ToFixed32Ux4_RZ:
3896 case Iop_F32ToFixed32Sx4_RZ:
3897 case Iop_Fixed32UToF32x4_RN:
3898 case Iop_Fixed32SToF32x4_RN:
3899 complainIfUndefined(mce, atom2, NULL);
3900 return mkPCast32x4(mce, vatom1);
3902 case Iop_F32ToFixed32Ux2_RZ:
3903 case Iop_F32ToFixed32Sx2_RZ:
3904 case Iop_Fixed32UToF32x2_RN:
3905 case Iop_Fixed32SToF32x2_RN:
3906 complainIfUndefined(mce, atom2, NULL);
3907 return mkPCast32x2(mce, vatom1);
3909 case Iop_QSub8Ux16:
3910 case Iop_QSub8Sx16:
3911 case Iop_Sub8x16:
3912 case Iop_Min8Ux16:
3913 case Iop_Min8Sx16:
3914 case Iop_Max8Ux16:
3915 case Iop_Max8Sx16:
3916 case Iop_CmpGT8Sx16:
3917 case Iop_CmpGT8Ux16:
3918 case Iop_CmpEQ8x16:
3919 case Iop_Avg8Ux16:
3920 case Iop_Avg8Sx16:
3921 case Iop_QAdd8Ux16:
3922 case Iop_QAdd8Sx16:
3923 case Iop_QAddExtUSsatSS8x16:
3924 case Iop_QAddExtSUsatUU8x16:
3925 case Iop_QSal8x16:
3926 case Iop_QShl8x16:
3927 case Iop_Add8x16:
3928 case Iop_Mul8x16:
3929 case Iop_MulHi8Sx16:
3930 case Iop_MulHi8Ux16:
3931 case Iop_PolynomialMul8x16:
3932 case Iop_PolynomialMulAdd8x16:
3933 return binary8Ix16(mce, vatom1, vatom2);
3935 case Iop_QSub16Ux8:
3936 case Iop_QSub16Sx8:
3937 case Iop_Sub16x8:
3938 case Iop_Mul16x8:
3939 case Iop_MulHi16Sx8:
3940 case Iop_MulHi16Ux8:
3941 case Iop_Min16Sx8:
3942 case Iop_Min16Ux8:
3943 case Iop_Max16Sx8:
3944 case Iop_Max16Ux8:
3945 case Iop_CmpGT16Sx8:
3946 case Iop_CmpGT16Ux8:
3947 case Iop_CmpEQ16x8:
3948 case Iop_Avg16Ux8:
3949 case Iop_Avg16Sx8:
3950 case Iop_QAdd16Ux8:
3951 case Iop_QAdd16Sx8:
3952 case Iop_QAddExtUSsatSS16x8:
3953 case Iop_QAddExtSUsatUU16x8:
3954 case Iop_QSal16x8:
3955 case Iop_QShl16x8:
3956 case Iop_Add16x8:
3957 case Iop_QDMulHi16Sx8:
3958 case Iop_QRDMulHi16Sx8:
3959 case Iop_PolynomialMulAdd16x8:
3960 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
3961 16-bit chunk of the output is formed from corresponding 16-bit chunks
3962          of the input args, so we can treat it like any other binary 16x8
3963 operation. That's despite it having '8x16' in its name. */
3964 case Iop_PwExtUSMulQAdd8x16:
3965 return binary16Ix8(mce, vatom1, vatom2);
3967 case Iop_Sub32x4:
3968 case Iop_CmpGT32Sx4:
3969 case Iop_CmpGT32Ux4:
3970 case Iop_CmpEQ32x4:
3971 case Iop_QAdd32Sx4:
3972 case Iop_QAdd32Ux4:
3973 case Iop_QSub32Sx4:
3974 case Iop_QSub32Ux4:
3975 case Iop_QAddExtUSsatSS32x4:
3976 case Iop_QAddExtSUsatUU32x4:
3977 case Iop_QSal32x4:
3978 case Iop_QShl32x4:
3979 case Iop_Avg32Ux4:
3980 case Iop_Avg32Sx4:
3981 case Iop_Add32x4:
3982 case Iop_Max32Ux4:
3983 case Iop_Max32Sx4:
3984 case Iop_Min32Ux4:
3985 case Iop_Min32Sx4:
3986 case Iop_Mul32x4:
3987 case Iop_MulHi32Sx4:
3988 case Iop_MulHi32Ux4:
3989 case Iop_QDMulHi32Sx4:
3990 case Iop_QRDMulHi32Sx4:
3991 case Iop_PolynomialMulAdd32x4:
3992 return binary32Ix4(mce, vatom1, vatom2);
3994 case Iop_Sub64x2:
3995 case Iop_Add64x2:
3996 case Iop_Avg64Ux2:
3997 case Iop_Avg64Sx2:
3998 case Iop_Max64Sx2:
3999 case Iop_Max64Ux2:
4000 case Iop_Min64Sx2:
4001 case Iop_Min64Ux2:
4002 case Iop_CmpEQ64x2:
4003 case Iop_CmpGT64Sx2:
4004 case Iop_CmpGT64Ux2:
4005 case Iop_QSal64x2:
4006 case Iop_QShl64x2:
4007 case Iop_QAdd64Ux2:
4008 case Iop_QAdd64Sx2:
4009 case Iop_QSub64Ux2:
4010 case Iop_QSub64Sx2:
4011 case Iop_QAddExtUSsatSS64x2:
4012 case Iop_QAddExtSUsatUU64x2:
4013 case Iop_PolynomialMulAdd64x2:
4014 case Iop_CipherV128:
4015 case Iop_CipherLV128:
4016 case Iop_NCipherV128:
4017 case Iop_NCipherLV128:
4018 case Iop_MulI128by10E:
4019 case Iop_MulI128by10ECarry:
4020 return binary64Ix2(mce, vatom1, vatom2);
4022 case Iop_Add128x1:
4023 case Iop_Sub128x1:
4024 case Iop_CmpNEZ128x1:
4025 return binary128Ix1(mce, vatom1, vatom2);
4027 case Iop_DivU128:
4028 case Iop_DivS128:
4029 case Iop_DivU128E:
4030 case Iop_DivS128E:
4031 case Iop_ModU128:
4032 case Iop_ModS128:
4033 /* I128 x I128 -> I128 */
4034 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4036 case Iop_QNarrowBin64Sto32Sx4:
4037 case Iop_QNarrowBin64Uto32Ux4:
4038 case Iop_QNarrowBin32Sto16Sx8:
4039 case Iop_QNarrowBin32Uto16Ux8:
4040 case Iop_QNarrowBin32Sto16Ux8:
4041 case Iop_QNarrowBin16Sto8Sx16:
4042 case Iop_QNarrowBin16Uto8Ux16:
4043 case Iop_QNarrowBin16Sto8Ux16:
4044 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4046 case Iop_Min64Fx2:
4047 case Iop_Max64Fx2:
4048 case Iop_CmpLT64Fx2:
4049 case Iop_CmpLE64Fx2:
4050 case Iop_CmpEQ64Fx2:
4051 case Iop_CmpUN64Fx2:
4052 case Iop_RecipStep64Fx2:
4053 case Iop_RSqrtStep64Fx2:
4054 return binary64Fx2(mce, vatom1, vatom2);
4056 case Iop_CmpLT16Fx8:
4057 case Iop_CmpLE16Fx8:
4058 case Iop_CmpEQ16Fx8:
4059 return binary16Fx8(mce, vatom1, vatom2);
4061 case Iop_Sub64F0x2:
4062 case Iop_Mul64F0x2:
4063 case Iop_Min64F0x2:
4064 case Iop_Max64F0x2:
4065 case Iop_Div64F0x2:
4066 case Iop_CmpLT64F0x2:
4067 case Iop_CmpLE64F0x2:
4068 case Iop_CmpEQ64F0x2:
4069 case Iop_CmpUN64F0x2:
4070 case Iop_Add64F0x2:
4071 return binary64F0x2(mce, vatom1, vatom2);
4073 case Iop_Min32Fx4:
4074 case Iop_Max32Fx4:
4075 case Iop_CmpLT32Fx4:
4076 case Iop_CmpLE32Fx4:
4077 case Iop_CmpEQ32Fx4:
4078 case Iop_CmpUN32Fx4:
4079 case Iop_CmpGT32Fx4:
4080 case Iop_CmpGE32Fx4:
4081 case Iop_RecipStep32Fx4:
4082 case Iop_RSqrtStep32Fx4:
4083 return binary32Fx4(mce, vatom1, vatom2);
4085 case Iop_Sub32Fx2:
4086 case Iop_Mul32Fx2:
4087 case Iop_Min32Fx2:
4088 case Iop_Max32Fx2:
4089 case Iop_CmpEQ32Fx2:
4090 case Iop_CmpGT32Fx2:
4091 case Iop_CmpGE32Fx2:
4092 case Iop_Add32Fx2:
4093 case Iop_RecipStep32Fx2:
4094 case Iop_RSqrtStep32Fx2:
4095 return binary32Fx2(mce, vatom1, vatom2);
4097 case Iop_Sub32F0x4:
4098 case Iop_Mul32F0x4:
4099 case Iop_Min32F0x4:
4100 case Iop_Max32F0x4:
4101 case Iop_Div32F0x4:
4102 case Iop_CmpLT32F0x4:
4103 case Iop_CmpLE32F0x4:
4104 case Iop_CmpEQ32F0x4:
4105 case Iop_CmpUN32F0x4:
4106 case Iop_Add32F0x4:
4107 return binary32F0x4(mce, vatom1, vatom2);
4109 case Iop_QShlNsatSU8x16:
4110 case Iop_QShlNsatUU8x16:
4111 case Iop_QShlNsatSS8x16:
4112 complainIfUndefined(mce, atom2, NULL);
4113 return mkPCast8x16(mce, vatom1);
4115 case Iop_QShlNsatSU16x8:
4116 case Iop_QShlNsatUU16x8:
4117 case Iop_QShlNsatSS16x8:
4118 complainIfUndefined(mce, atom2, NULL);
4119 return mkPCast16x8(mce, vatom1);
4121 case Iop_QShlNsatSU32x4:
4122 case Iop_QShlNsatUU32x4:
4123 case Iop_QShlNsatSS32x4:
4124 complainIfUndefined(mce, atom2, NULL);
4125 return mkPCast32x4(mce, vatom1);
4127 case Iop_QShlNsatSU64x2:
4128 case Iop_QShlNsatUU64x2:
4129 case Iop_QShlNsatSS64x2:
4130 complainIfUndefined(mce, atom2, NULL);
4131 return mkPCast32x4(mce, vatom1);
4133 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4134 To make this simpler, do the following:
4135 * complain if the shift amount (the I8) is undefined
4136 * pcast each lane at the wide width
4137 * truncate each lane to half width
4138 * pcast the resulting 64-bit value to a single bit and use
4139 that as the least significant bit of the upper half of the
4140 result. */
4141 case Iop_QandQShrNnarrow64Uto32Ux2:
4142 case Iop_QandQSarNnarrow64Sto32Sx2:
4143 case Iop_QandQSarNnarrow64Sto32Ux2:
4144 case Iop_QandQRShrNnarrow64Uto32Ux2:
4145 case Iop_QandQRSarNnarrow64Sto32Sx2:
4146 case Iop_QandQRSarNnarrow64Sto32Ux2:
4147 case Iop_QandQShrNnarrow32Uto16Ux4:
4148 case Iop_QandQSarNnarrow32Sto16Sx4:
4149 case Iop_QandQSarNnarrow32Sto16Ux4:
4150 case Iop_QandQRShrNnarrow32Uto16Ux4:
4151 case Iop_QandQRSarNnarrow32Sto16Sx4:
4152 case Iop_QandQRSarNnarrow32Sto16Ux4:
4153 case Iop_QandQShrNnarrow16Uto8Ux8:
4154 case Iop_QandQSarNnarrow16Sto8Sx8:
4155 case Iop_QandQSarNnarrow16Sto8Ux8:
4156 case Iop_QandQRShrNnarrow16Uto8Ux8:
4157 case Iop_QandQRSarNnarrow16Sto8Sx8:
4158 case Iop_QandQRSarNnarrow16Sto8Ux8:
4160 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4161 IROp opNarrow = Iop_INVALID;
4162 switch (op) {
4163 case Iop_QandQShrNnarrow64Uto32Ux2:
4164 case Iop_QandQSarNnarrow64Sto32Sx2:
4165 case Iop_QandQSarNnarrow64Sto32Ux2:
4166 case Iop_QandQRShrNnarrow64Uto32Ux2:
4167 case Iop_QandQRSarNnarrow64Sto32Sx2:
4168 case Iop_QandQRSarNnarrow64Sto32Ux2:
4169 fnPessim = mkPCast64x2;
4170 opNarrow = Iop_NarrowUn64to32x2;
4171 break;
4172 case Iop_QandQShrNnarrow32Uto16Ux4:
4173 case Iop_QandQSarNnarrow32Sto16Sx4:
4174 case Iop_QandQSarNnarrow32Sto16Ux4:
4175 case Iop_QandQRShrNnarrow32Uto16Ux4:
4176 case Iop_QandQRSarNnarrow32Sto16Sx4:
4177 case Iop_QandQRSarNnarrow32Sto16Ux4:
4178 fnPessim = mkPCast32x4;
4179 opNarrow = Iop_NarrowUn32to16x4;
4180 break;
4181 case Iop_QandQShrNnarrow16Uto8Ux8:
4182 case Iop_QandQSarNnarrow16Sto8Sx8:
4183 case Iop_QandQSarNnarrow16Sto8Ux8:
4184 case Iop_QandQRShrNnarrow16Uto8Ux8:
4185 case Iop_QandQRSarNnarrow16Sto8Sx8:
4186 case Iop_QandQRSarNnarrow16Sto8Ux8:
4187 fnPessim = mkPCast16x8;
4188 opNarrow = Iop_NarrowUn16to8x8;
4189 break;
4190 default:
4191 tl_assert(0);
4193 complainIfUndefined(mce, atom2, NULL);
4194 // Pessimised shift result
4195 IRAtom* shV
4196 = fnPessim(mce, vatom1);
4197 // Narrowed, pessimised shift result
4198 IRAtom* shVnarrowed
4199 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4200 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4201 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4202 // and assemble the result
4203 return assignNew('V', mce, Ity_V128,
4204 binop(Iop_64HLtoV128, qV, shVnarrowed));
4207 case Iop_Mull32Sx2:
4208 case Iop_Mull32Ux2:
4209 case Iop_QDMull32Sx2:
4210 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4211 mkUifU64(mce, vatom1, vatom2));
4213 case Iop_Mull16Sx4:
4214 case Iop_Mull16Ux4:
4215 case Iop_QDMull16Sx4:
4216 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4217 mkUifU64(mce, vatom1, vatom2));
4219 case Iop_Mull8Sx8:
4220 case Iop_Mull8Ux8:
4221 case Iop_PolynomialMull8x8:
4222 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4223 mkUifU64(mce, vatom1, vatom2));
4225 case Iop_PwAdd32x4:
4226 return mkPCast32x4(mce,
4227 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4228 mkPCast32x4(mce, vatom2))));
4230 case Iop_PwAdd16x8:
4231 return mkPCast16x8(mce,
4232 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4233 mkPCast16x8(mce, vatom2))));
4235 case Iop_PwAdd8x16:
4236 return mkPCast8x16(mce,
4237 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4238 mkPCast8x16(mce, vatom2))));
4240 /* V128-bit data-steering */
4241 case Iop_SetV128lo32:
4242 case Iop_SetV128lo64:
4243 case Iop_64HLtoV128:
4244 case Iop_InterleaveLO64x2:
4245 case Iop_InterleaveLO32x4:
4246 case Iop_InterleaveLO16x8:
4247 case Iop_InterleaveLO8x16:
4248 case Iop_InterleaveHI64x2:
4249 case Iop_InterleaveHI32x4:
4250 case Iop_InterleaveHI16x8:
4251 case Iop_InterleaveHI8x16:
4252 case Iop_CatOddLanes8x16:
4253 case Iop_CatOddLanes16x8:
4254 case Iop_CatOddLanes32x4:
4255 case Iop_CatEvenLanes8x16:
4256 case Iop_CatEvenLanes16x8:
4257 case Iop_CatEvenLanes32x4:
4258 case Iop_InterleaveOddLanes8x16:
4259 case Iop_InterleaveOddLanes16x8:
4260 case Iop_InterleaveOddLanes32x4:
4261 case Iop_InterleaveEvenLanes8x16:
4262 case Iop_InterleaveEvenLanes16x8:
4263 case Iop_InterleaveEvenLanes32x4:
4264 case Iop_PackOddLanes8x16:
4265 case Iop_PackOddLanes16x8:
4266 case Iop_PackOddLanes32x4:
4267 case Iop_PackEvenLanes8x16:
4268 case Iop_PackEvenLanes16x8:
4269 case Iop_PackEvenLanes32x4:
4270 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4272 case Iop_GetElem8x16:
4273 complainIfUndefined(mce, atom2, NULL);
4274 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4275 case Iop_GetElem16x8:
4276 complainIfUndefined(mce, atom2, NULL);
4277 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4278 case Iop_GetElem32x4:
4279 complainIfUndefined(mce, atom2, NULL);
4280 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4281 case Iop_GetElem64x2:
4282 complainIfUndefined(mce, atom2, NULL);
4283 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4285 /* Perm8x16: rearrange values in left arg using steering values
4286 from right arg. So rearrange the vbits in the same way but
4287 pessimise wrt steering values. Perm32x4 ditto. */
4288 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4289 case Iop_Perm8x16:
4290 case Iop_PermOrZero8x16:
4291 return mkUifUV128(
4292 mce,
4293 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4294 mkPCast8x16(mce, vatom2)
4296 case Iop_Perm32x4:
4297 return mkUifUV128(
4298 mce,
4299 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4300 mkPCast32x4(mce, vatom2)
4303 /* These two take the lower half of each 16-bit lane, sign/zero
4304 extend it to 32, and multiply together, producing a 32x4
4305 result (and implicitly ignoring half the operand bits). So
4306 treat it as a bunch of independent 16x8 operations, but then
4307 do 32-bit shifts left-right to copy the lower half results
4308 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4309 into the upper half of each result lane. */
4310 case Iop_MullEven16Ux8:
4311 case Iop_MullEven16Sx8: {
4312 IRAtom* at;
4313 at = binary16Ix8(mce,vatom1,vatom2);
4314 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4315 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4316 return at;
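      /* Editorial worked example for the Shl/Sar trick above: suppose, for
         one 32-bit result lane, binary16Ix8 yields 0xFFFF (undefined) in the
         low 16-bit half and 0x0000 in the high half.  ShlN32x4 by 16 gives
         0xFFFF0000, and the arithmetic SarN32x4 by 16 then smears the sign
         bit back down, giving 0xFFFFFFFF: the whole lane is marked undefined
         and the ignored high-half shadow is discarded, as intended. */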
4319 /* Same deal as Iop_MullEven16{S,U}x8 */
4320 case Iop_MullEven8Ux16:
4321 case Iop_MullEven8Sx16: {
4322 IRAtom* at;
4323 at = binary8Ix16(mce,vatom1,vatom2);
4324 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4325 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4326 return at;
4329 /* Same deal as Iop_MullEven16{S,U}x8 */
4330 case Iop_MullEven32Ux4:
4331 case Iop_MullEven32Sx4: {
4332 IRAtom* at;
4333 at = binary32Ix4(mce,vatom1,vatom2);
4334 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4335 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4336 return at;
4339 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4340 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4341       Simply apply the same op to the V bits, since this is really no
4342       more than a data steering operation. */
4343 case Iop_NarrowBin32to16x8:
4344 case Iop_NarrowBin16to8x16:
4345 case Iop_NarrowBin64to32x4:
4346 return assignNew('V', mce, Ity_V128,
4347 binop(op, vatom1, vatom2));
4349 case Iop_ShrV128:
4350 case Iop_SarV128:
4351 case Iop_ShlV128:
4352 case Iop_I128StoBCD128:
4353 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4354 this is wrong now, scalar shifts are done properly lazily.
4355 Vector shifts should be fixed too. */
4356 complainIfUndefined(mce, atom2, NULL);
4357 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4359 case Iop_I128UtoF128: /* I128 -> F128 */
4360 case Iop_I128StoF128: /* I128 -> F128 */
4361 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4363 case Iop_BCDAdd:
4364 case Iop_BCDSub:
4365 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4367 /* SHA Iops */
4368 case Iop_SHA256:
4369 case Iop_SHA512:
4370 complainIfUndefined(mce, atom2, NULL);
4371 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4373 /* I128-bit data-steering */
4374 case Iop_64HLto128:
4375 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4377 /* V256-bit SIMD */
4379 case Iop_Max64Fx4:
4380 case Iop_Min64Fx4:
4381 return binary64Fx4(mce, vatom1, vatom2);
4383 case Iop_Max32Fx8:
4384 case Iop_Min32Fx8:
4385 return binary32Fx8(mce, vatom1, vatom2);
4387 /* V256-bit data-steering */
4388 case Iop_V128HLtoV256:
4389 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4391 /* Scalar floating point */
4393 case Iop_F32toI64S:
4394 case Iop_F32toI64U:
4395 /* I32(rm) x F32 -> I64 */
4396 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4398 case Iop_I64StoF32:
4399 /* I32(rm) x I64 -> F32 */
4400 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4402 case Iop_RoundF64toInt:
4403 case Iop_RoundF64toF32:
4404 case Iop_F64toI64S:
4405 case Iop_F64toI64U:
4406 case Iop_I64StoF64:
4407 case Iop_I64UtoF64:
4408 case Iop_SinF64:
4409 case Iop_CosF64:
4410 case Iop_TanF64:
4411 case Iop_2xm1F64:
4412 case Iop_SqrtF64:
4413 case Iop_RecpExpF64:
4414 /* I32(rm) x I64/F64 -> I64/F64 */
4415 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4417 case Iop_ShlD64:
4418 case Iop_ShrD64:
4419 case Iop_RoundD64toInt:
4420 /* I32(rm) x D64 -> D64 */
4421 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4423 case Iop_ShlD128:
4424 case Iop_ShrD128:
4425 case Iop_RoundD128toInt:
4426 /* I32(rm) x D128 -> D128 */
4427 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4429 case Iop_RoundF128toInt:
4430 /* I32(rm) x F128 -> F128 */
4431 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4433 case Iop_D64toI64S:
4434 case Iop_D64toI64U:
4435 case Iop_I64StoD64:
4436 case Iop_I64UtoD64:
4437 /* I32(rm) x I64/D64 -> D64/I64 */
4438 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4440 case Iop_F32toD32:
4441 case Iop_F64toD32:
4442 case Iop_F128toD32:
4443 case Iop_D32toF32:
4444 case Iop_D64toF32:
4445 case Iop_D128toF32:
4446 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4447 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4449 case Iop_F32toD64:
4450 case Iop_F64toD64:
4451 case Iop_F128toD64:
4452 case Iop_D32toF64:
4453 case Iop_D64toF64:
4454 case Iop_D128toF64:
4455 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4456 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4458 case Iop_F32toD128:
4459 case Iop_F64toD128:
4460 case Iop_F128toD128:
4461 case Iop_D32toF128:
4462 case Iop_D64toF128:
4463 case Iop_D128toF128:
4464 case Iop_I128StoD128:
4465 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4466 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4468 case Iop_SqrtF16:
4469 /* I32(rm) x F16 -> F16 */
4470 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4472 case Iop_RoundF32toInt:
4473 case Iop_SqrtF32:
4474 case Iop_RecpExpF32:
4475 /* I32(rm) x I32/F32 -> I32/F32 */
4476 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4478 case Iop_SqrtF128:
4479 /* I32(rm) x F128 -> F128 */
4480 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4482 case Iop_I32StoF32:
4483 case Iop_I32UtoF32:
4484 case Iop_F32toI32S:
4485 case Iop_F32toI32U:
4486 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4487 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4489 case Iop_F64toF16:
4490 case Iop_F32toF16:
4491 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4492 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4494 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4495 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4496 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4497 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4498 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4499 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4501 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4502 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4503 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4504 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4506 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4507 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4508 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4509 case Iop_D128toD64: /* IRRoundingMode(I64) x D128 -> D64 */
4510 case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64 */
4511 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4512 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4514 case Iop_F64HLtoF128:
4515 case Iop_D64HLtoD128:
4516 return assignNew('V', mce, Ity_I128,
4517 binop(Iop_64HLto128, vatom1, vatom2));
4519 case Iop_F64toI32U:
4520 case Iop_F64toI32S:
4521 case Iop_F64toF32:
4522 case Iop_I64UtoF32:
4523 case Iop_D64toI32U:
4524 case Iop_D64toI32S:
4525 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4526 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4528 case Iop_D64toD32:
4529 /* First arg is I32 (rounding mode), second is D64 (data). */
4530 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4532 case Iop_F64toI16S:
4533 /* First arg is I32 (rounding mode), second is F64 (data). */
4534 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4536 case Iop_InsertExpD64:
4537 /* I64 x I64 -> D64 */
4538 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4540 case Iop_InsertExpD128:
4541 /* I64 x I128 -> D128 */
4542 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4544 case Iop_CmpF16:
4545 case Iop_CmpF32:
4546 case Iop_CmpF64:
4547 case Iop_CmpF128:
4548 case Iop_CmpD64:
4549 case Iop_CmpD128:
4550 case Iop_CmpExpD64:
4551 case Iop_CmpExpD128:
4552 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4554 case Iop_MaxNumF32:
4555 case Iop_MinNumF32:
4556 /* F32 x F32 -> F32 */
4557 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4559 case Iop_MaxNumF64:
4560 case Iop_MinNumF64:
4561 /* F64 x F64 -> F64 */
4562 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4564 /* non-FP after here */
4566 case Iop_DivModU64to32:
4567 case Iop_DivModS64to32:
4568 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4570 case Iop_DivModU128to64:
4571 case Iop_DivModS128to64:
4572 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4574 case Iop_8HLto16:
4575 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4576 case Iop_16HLto32:
4577 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4578 case Iop_32HLto64:
4579 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4581 case Iop_DivModU64to64:
4582 case Iop_DivModS64to64: {
4583 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4584 return assignNew('V', mce, Ity_I128,
4585 binop(Iop_64HLto128, vTmp64, vTmp64));
4588 case Iop_MullS64:
4589 case Iop_MullU64: {
4590 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4591 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4592 return assignNew('V', mce, Ity_I128,
4593 binop(Iop_64HLto128, vHi64, vLo64));
4596 case Iop_DivModU32to32:
4597 case Iop_DivModS32to32: {
4598 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4599 return assignNew('V', mce, Ity_I64,
4600 binop(Iop_32HLto64, vTmp32, vTmp32));
4603 case Iop_MullS32:
4604 case Iop_MullU32: {
4605 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4606 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4607 return assignNew('V', mce, Ity_I64,
4608 binop(Iop_32HLto64, vHi32, vLo32));
4611 case Iop_MullS16:
4612 case Iop_MullU16: {
4613 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4614 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4615 return assignNew('V', mce, Ity_I32,
4616 binop(Iop_16HLto32, vHi16, vLo16));
4619 case Iop_MullS8:
4620 case Iop_MullU8: {
4621 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4622 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4623 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4626 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4627 case Iop_DivS32:
4628 case Iop_DivU32:
4629 case Iop_DivU32E:
4630 case Iop_DivS32E:
4631 case Iop_QAdd32S: /* could probably do better */
4632 case Iop_QSub32S: /* could probably do better */
4633 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4635 case Iop_DivS64:
4636 case Iop_DivU64:
4637 case Iop_DivS64E:
4638 case Iop_DivU64E:
4639 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4641 case Iop_Add32:
4642 if (mce->dlbo.dl_Add32 == DLexpensive
4643 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4644 return expensiveAddSub(mce,True,Ity_I32,
4645 vatom1,vatom2, atom1,atom2);
4646 } else {
4647 goto cheap_AddSub32;
4649 case Iop_Sub32:
4650 if (mce->dlbo.dl_Sub32 == DLexpensive
4651 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4652 return expensiveAddSub(mce,False,Ity_I32,
4653 vatom1,vatom2, atom1,atom2);
4654 } else {
4655 goto cheap_AddSub32;
4658 cheap_AddSub32:
4659 case Iop_Mul32:
4660 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4662 case Iop_CmpORD32S:
4663 case Iop_CmpORD32U:
4664 case Iop_CmpORD64S:
4665 case Iop_CmpORD64U:
4666 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4668 case Iop_Add64:
4669 if (mce->dlbo.dl_Add64 == DLexpensive
4670 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4671 return expensiveAddSub(mce,True,Ity_I64,
4672 vatom1,vatom2, atom1,atom2);
4673 } else {
4674 goto cheap_AddSub64;
4676 case Iop_Sub64:
4677 if (mce->dlbo.dl_Sub64 == DLexpensive
4678 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4679 return expensiveAddSub(mce,False,Ity_I64,
4680 vatom1,vatom2, atom1,atom2);
4681 } else {
4682 goto cheap_AddSub64;
4685 cheap_AddSub64:
4686 case Iop_Mul64:
4687 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4689 case Iop_Mul16:
4690 case Iop_Add16:
4691 case Iop_Sub16:
4692 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4694 case Iop_Mul8:
4695 case Iop_Sub8:
4696 case Iop_Add8:
4697 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4699 ////---- CmpXX64
4700 case Iop_CmpEQ64: case Iop_CmpNE64:
4701 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4702 goto expensive_cmp64;
4703 else
4704 goto cheap_cmp64;
4706 expensive_cmp64:
4707 case Iop_ExpCmpNE64:
4708 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4710 cheap_cmp64:
4711 case Iop_CmpLE64S: case Iop_CmpLE64U:
4712 case Iop_CmpLT64U: case Iop_CmpLT64S:
4713 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4715 ////---- CmpXX32
4716 case Iop_CmpEQ32: case Iop_CmpNE32:
4717 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4718 goto expensive_cmp32;
4719 else
4720 goto cheap_cmp32;
4722 expensive_cmp32:
4723 case Iop_ExpCmpNE32:
4724 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4726 cheap_cmp32:
4727 case Iop_CmpLE32S: case Iop_CmpLE32U:
4728 case Iop_CmpLT32U: case Iop_CmpLT32S:
4729 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4731 ////---- CmpXX16
4732 case Iop_CmpEQ16: case Iop_CmpNE16:
4733 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4734 goto expensive_cmp16;
4735 else
4736 goto cheap_cmp16;
4738 expensive_cmp16:
4739 case Iop_ExpCmpNE16:
4740 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4742 cheap_cmp16:
4743 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4745 ////---- CmpXX8
4746 case Iop_CmpEQ8: case Iop_CmpNE8:
4747 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4748 goto expensive_cmp8;
4749 else
4750 goto cheap_cmp8;
4752 expensive_cmp8:
4753 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4755 cheap_cmp8:
4756 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4758 ////---- end CmpXX{64,32,16,8}
4760 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4761 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4762 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4763 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4764 /* Just say these all produce a defined result, regardless
4765 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4766 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4768 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4769 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4771 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4772 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4774 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4775 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4777 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4778 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4780 case Iop_AndV256:
4781 uifu = mkUifUV256; difd = mkDifDV256;
4782 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4783 case Iop_AndV128:
4784 uifu = mkUifUV128; difd = mkDifDV128;
4785 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4786 case Iop_And64:
4787 uifu = mkUifU64; difd = mkDifD64;
4788 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4789 case Iop_And32:
4790 uifu = mkUifU32; difd = mkDifD32;
4791 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4792 case Iop_And16:
4793 uifu = mkUifU16; difd = mkDifD16;
4794 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4795 case Iop_And8:
4796 uifu = mkUifU8; difd = mkDifD8;
4797 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4798 case Iop_And1:
4799 uifu = mkUifU1; difd = mkDifD1;
4800 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4802 case Iop_OrV256:
4803 uifu = mkUifUV256; difd = mkDifDV256;
4804 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4805 case Iop_OrV128:
4806 uifu = mkUifUV128; difd = mkDifDV128;
4807 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4808 case Iop_Or64:
4809 uifu = mkUifU64; difd = mkDifD64;
4810 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4811 case Iop_Or32:
4812 uifu = mkUifU32; difd = mkDifD32;
4813 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4814 case Iop_Or16:
4815 uifu = mkUifU16; difd = mkDifD16;
4816 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4817 case Iop_Or8:
4818 uifu = mkUifU8; difd = mkDifD8;
4819 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4820 case Iop_Or1:
4821 uifu = mkUifU1; difd = mkDifD1;
4822 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4824 do_And_Or:
4825 return
4826 assignNew(
4827 'V', mce,
4828 and_or_ty,
4829 difd(mce, uifu(mce, vatom1, vatom2),
4830 difd(mce, improve(mce, atom1, vatom1),
4831 improve(mce, atom2, vatom2) ) ) );
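      /* Editorial sketch: a bit-level restatement of the And/Or scheme just
         above, with 1 V-bits meaning "undefined".  For the And ops:

            vres = (v1 | v2)      // naive: undefined if either input is
                   & (a1 | v1)    // ...unless arg1's bit is a *defined* 0
                   & (a2 | v2);   // ...unless arg2's bit is a *defined* 0

         For the Or ops the two "improvement" terms become (~a1 | v1) and
         (~a2 | v2): a defined 1 in either operand forces that result bit to
         be defined.  Here v1/v2 and a1/a2 are shorthand for the shadow and
         original atoms; this merely restates the uifu/difd/improve
         expression built above. */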
4837 case Iop_Xor8:
4838 return mkUifU8(mce, vatom1, vatom2);
4839 case Iop_Xor16:
4840 return mkUifU16(mce, vatom1, vatom2);
4841 case Iop_Xor32:
4842 return mkUifU32(mce, vatom1, vatom2);
4843 case Iop_Xor64:
4844 return mkUifU64(mce, vatom1, vatom2);
4845 case Iop_XorV128:
4846 return mkUifUV128(mce, vatom1, vatom2);
4847 case Iop_XorV256:
4848 return mkUifUV256(mce, vatom1, vatom2);
4850 /* V256-bit SIMD */
4852 case Iop_ShrN16x16:
4853 case Iop_ShrN32x8:
4854 case Iop_ShrN64x4:
4855 case Iop_SarN16x16:
4856 case Iop_SarN32x8:
4857 case Iop_ShlN16x16:
4858 case Iop_ShlN32x8:
4859 case Iop_ShlN64x4:
4860 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4861 this is wrong now, scalar shifts are done properly lazily.
4862 Vector shifts should be fixed too. */
4863 complainIfUndefined(mce, atom2, NULL);
4864 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4866 case Iop_QSub8Ux32:
4867 case Iop_QSub8Sx32:
4868 case Iop_Sub8x32:
4869 case Iop_Min8Ux32:
4870 case Iop_Min8Sx32:
4871 case Iop_Max8Ux32:
4872 case Iop_Max8Sx32:
4873 case Iop_CmpGT8Sx32:
4874 case Iop_CmpEQ8x32:
4875 case Iop_Avg8Ux32:
4876 case Iop_QAdd8Ux32:
4877 case Iop_QAdd8Sx32:
4878 case Iop_Add8x32:
4879 return binary8Ix32(mce, vatom1, vatom2);
4881 case Iop_QSub16Ux16:
4882 case Iop_QSub16Sx16:
4883 case Iop_Sub16x16:
4884 case Iop_Mul16x16:
4885 case Iop_MulHi16Sx16:
4886 case Iop_MulHi16Ux16:
4887 case Iop_Min16Sx16:
4888 case Iop_Min16Ux16:
4889 case Iop_Max16Sx16:
4890 case Iop_Max16Ux16:
4891 case Iop_CmpGT16Sx16:
4892 case Iop_CmpEQ16x16:
4893 case Iop_Avg16Ux16:
4894 case Iop_QAdd16Ux16:
4895 case Iop_QAdd16Sx16:
4896 case Iop_Add16x16:
4897 return binary16Ix16(mce, vatom1, vatom2);
4899 case Iop_Sub32x8:
4900 case Iop_CmpGT32Sx8:
4901 case Iop_CmpEQ32x8:
4902 case Iop_Add32x8:
4903 case Iop_Max32Ux8:
4904 case Iop_Max32Sx8:
4905 case Iop_Min32Ux8:
4906 case Iop_Min32Sx8:
4907 case Iop_Mul32x8:
4908 return binary32Ix8(mce, vatom1, vatom2);
4910 case Iop_Sub64x4:
4911 case Iop_Add64x4:
4912 case Iop_CmpEQ64x4:
4913 case Iop_CmpGT64Sx4:
4914 return binary64Ix4(mce, vatom1, vatom2);
4916 case Iop_I32StoF32x8:
4917 case Iop_F32toI32Sx8:
4918 return unary32Fx8_w_rm(mce, vatom1, vatom2);
4920 /* Perm32x8: rearrange values in left arg using steering values
4921 from right arg. So rearrange the vbits in the same way but
4922 pessimise wrt steering values. */
4923 case Iop_Perm32x8:
4924 return mkUifUV256(
4925 mce,
4926 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4927 mkPCast32x8(mce, vatom2)
4930 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4931 Handle the shifted results in the same way that other
4932 binary Q ops are handled, eg QSub: UifU the two args,
4933 then pessimise -- which is binaryNIxM. But for the upper
4934          V128, we only need to generate 1 bit, which is the
4935          pessimised shift result, with 127 defined zeroes above it.
4937          Note that this is overly pessimistic, in that only the
4938 bottom 8 bits of each lane of the second arg determine the shift
4939 amount. Really we ought to ignore any undefinedness in the
4940 rest of the lanes of the second arg. */
4941 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
4942 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4943 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
4944 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4945 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
4946 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4947 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
4948 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4950 // The function to generate the pessimised shift result
4951 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4952 switch (op) {
4953 case Iop_QandSQsh64x2:
4954 case Iop_QandUQsh64x2:
4955 case Iop_QandSQRsh64x2:
4956 case Iop_QandUQRsh64x2:
4957 binaryNIxM = binary64Ix2;
4958 break;
4959 case Iop_QandSQsh32x4:
4960 case Iop_QandUQsh32x4:
4961 case Iop_QandSQRsh32x4:
4962 case Iop_QandUQRsh32x4:
4963 binaryNIxM = binary32Ix4;
4964 break;
4965 case Iop_QandSQsh16x8:
4966 case Iop_QandUQsh16x8:
4967 case Iop_QandSQRsh16x8:
4968 case Iop_QandUQRsh16x8:
4969 binaryNIxM = binary16Ix8;
4970 break;
4971 case Iop_QandSQsh8x16:
4972 case Iop_QandUQsh8x16:
4973 case Iop_QandSQRsh8x16:
4974 case Iop_QandUQRsh8x16:
4975 binaryNIxM = binary8Ix16;
4976 break;
4977 default:
4978 tl_assert(0);
4980 tl_assert(binaryNIxM);
4981 // Pessimised shift result, shV[127:0]
4982 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4983 // Generates: Def--(127)--Def PCast-to-I1(shV)
4984 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4985 // and assemble the result
4986 return assignNew('V', mce, Ity_V256,
4987 binop(Iop_V128HLtoV256, qV, shV));
4990 case Iop_F32toF16x4: {
4991 // First, PCast the input vector, retaining the 32x4 format.
4992 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
4993 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
4994 // the input, we're not going to lose any information.
4995 IRAtom* pcHI64
4996 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
4997 IRAtom* pcLO64
4998 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
4999 IRAtom* narrowed
5000 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5001 pcHI64, pcLO64)); // 16x4
5002 // Finally, roll in any badness from the rounding mode.
5003 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5004 return mkUifU64(mce, narrowed, rmPCasted);
5007 case Iop_F32toF16x8: {
5008 // Same scheme as for Iop_F32toF16x4.
5009 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5010 IRAtom* pcHI128
5011 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5012 pcasted)); // 32x4
5013 IRAtom* pcLO128
5014 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5015 pcasted)); // 32x4
5016 IRAtom* narrowed
5017 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5018 pcHI128, pcLO128)); // 16x8
5019 // Finally, roll in any badness from the rounding mode.
5020 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5021 return mkUifUV128(mce, narrowed, rmPCasted);
5024 default:
5025 ppIROp(op);
5026 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5031 static
5032 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5034 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5035 selection of shadow operation implicitly duplicates the logic in
5036 do_shadow_LoadG and should be kept in sync (in the very unlikely
5037 event that the interpretation of such widening ops changes in
5038 future). See comment in do_shadow_LoadG. */
5039 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5040 tl_assert(isOriginalAtom(mce,atom));
5041 switch (op) {
5043 case Iop_Abs64Fx2:
5044 case Iop_Neg64Fx2:
5045 case Iop_RSqrtEst64Fx2:
5046 case Iop_RecipEst64Fx2:
5047 case Iop_Log2_64Fx2:
5048 return unary64Fx2(mce, vatom);
5050 case Iop_Sqrt64F0x2:
5051 return unary64F0x2(mce, vatom);
5053 case Iop_Sqrt32Fx8:
5054 case Iop_RSqrtEst32Fx8:
5055 case Iop_RecipEst32Fx8:
5056 return unary32Fx8(mce, vatom);
5058 case Iop_Sqrt64Fx4:
5059 return unary64Fx4(mce, vatom);
5061 case Iop_RecipEst32Fx4:
5062 case Iop_I32UtoF32x4_DEP:
5063 case Iop_I32StoF32x4_DEP:
5064 case Iop_QF32toI32Ux4_RZ:
5065 case Iop_QF32toI32Sx4_RZ:
5066 case Iop_RoundF32x4_RM:
5067 case Iop_RoundF32x4_RP:
5068 case Iop_RoundF32x4_RN:
5069 case Iop_RoundF32x4_RZ:
5070 case Iop_RecipEst32Ux4:
5071 case Iop_Abs32Fx4:
5072 case Iop_Neg32Fx4:
5073 case Iop_RSqrtEst32Fx4:
5074 case Iop_Log2_32Fx4:
5075 case Iop_Exp2_32Fx4:
5076 return unary32Fx4(mce, vatom);
5078 case Iop_I32UtoF32x2_DEP:
5079 case Iop_I32StoF32x2_DEP:
5080 case Iop_RecipEst32Fx2:
5081 case Iop_RecipEst32Ux2:
5082 case Iop_Abs32Fx2:
5083 case Iop_Neg32Fx2:
5084 case Iop_RSqrtEst32Fx2:
5085 return unary32Fx2(mce, vatom);
5087 case Iop_Sqrt32F0x4:
5088 case Iop_RSqrtEst32F0x4:
5089 case Iop_RecipEst32F0x4:
5090 return unary32F0x4(mce, vatom);
5092 case Iop_Abs16Fx8:
5093 case Iop_Neg16Fx8:
5094 return unary16Fx8(mce, vatom);
5096 // These are self-shadowing.
5097 case Iop_32UtoV128:
5098 case Iop_64UtoV128:
5099 case Iop_Dup8x16:
5100 case Iop_Dup16x8:
5101 case Iop_Dup32x4:
5102 case Iop_Reverse1sIn8_x16:
5103 case Iop_Reverse8sIn16_x8:
5104 case Iop_Reverse8sIn32_x4:
5105 case Iop_Reverse16sIn32_x4:
5106 case Iop_Reverse8sIn64_x2:
5107 case Iop_Reverse16sIn64_x2:
5108 case Iop_Reverse32sIn64_x2:
5109 case Iop_V256toV128_1: case Iop_V256toV128_0:
5110 case Iop_ZeroHI64ofV128:
5111 case Iop_ZeroHI96ofV128:
5112 case Iop_ZeroHI112ofV128:
5113 case Iop_ZeroHI120ofV128:
5114 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5115 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5117 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5118 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5119 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5121 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5122 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5123 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5125 case Iop_NegF128:
5126 case Iop_AbsF128:
5127 case Iop_RndF128:
5128 case Iop_TruncF128toI128S: /* F128 -> I128S */
5129 case Iop_TruncF128toI128U: /* F128 -> I128U */
5130 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5131 case Iop_ReinterpI128asF128:
5132 case Iop_ReinterpF128asI128:
5133 return mkPCastTo(mce, Ity_I128, vatom);
5135 case Iop_BCD128toI128S:
5136 case Iop_MulI128by10:
5137 case Iop_MulI128by10Carry:
5138 case Iop_F16toF64x2:
5139 case Iop_F64toF16x2_DEP:
5140 // FIXME JRS 2018-Nov-15. This is surely not correct!
5141 return vatom;
5143 case Iop_ReinterpI32asF32:
5144 case Iop_ReinterpF32asI32:
5145 return assignNew('V', mce, Ity_I32, vatom);
5147 case Iop_ReinterpF64asI64:
5148 case Iop_ReinterpI64asF64:
5149 case Iop_ReinterpI64asD64:
5150 case Iop_ReinterpD64asI64:
5151 return assignNew('V', mce, Ity_I64, vatom);
5153 case Iop_I32StoF128: /* signed I32 -> F128 */
5154 case Iop_I64StoF128: /* signed I64 -> F128 */
5155 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5156 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5157 case Iop_F32toF128: /* F32 -> F128 */
5158 case Iop_F64toF128: /* F64 -> F128 */
5159 case Iop_I32StoD128: /* signed I64 -> D128 */
5160 case Iop_I64StoD128: /* signed I64 -> D128 */
5161 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5162 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5163 return mkPCastTo(mce, Ity_I128, vatom);
5165 case Iop_F16toF64:
5166 case Iop_F32toF64:
5167 case Iop_I32StoF64:
5168 case Iop_I32UtoF64:
5169 case Iop_NegF64:
5170 case Iop_AbsF64:
5171 case Iop_RSqrtEst5GoodF64:
5172 case Iop_RoundF64toF64_NEAREST:
5173 case Iop_RoundF64toF64_NegINF:
5174 case Iop_RoundF64toF64_PosINF:
5175 case Iop_RoundF64toF64_ZERO:
5176 case Iop_D32toD64:
5177 case Iop_I32StoD64:
5178 case Iop_I32UtoD64:
5179 case Iop_ExtractExpD64: /* D64 -> I64 */
5180 case Iop_ExtractExpD128: /* D128 -> I64 */
5181 case Iop_ExtractSigD64: /* D64 -> I64 */
5182 case Iop_ExtractSigD128: /* D128 -> I64 */
5183 case Iop_DPBtoBCD:
5184 case Iop_BCDtoDPB:
5185 return mkPCastTo(mce, Ity_I64, vatom);
5187 case Iop_D64toD128:
5188 return mkPCastTo(mce, Ity_I128, vatom);
5190 case Iop_TruncF64asF32:
5191 case Iop_NegF32:
5192 case Iop_AbsF32:
5193 case Iop_F16toF32:
5194 return mkPCastTo(mce, Ity_I32, vatom);
5196 case Iop_AbsF16:
5197 case Iop_NegF16:
5198 return mkPCastTo(mce, Ity_I16, vatom);
5200 case Iop_Ctz32: case Iop_CtzNat32:
5201 case Iop_Ctz64: case Iop_CtzNat64:
5202 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5204 case Iop_Clz32: case Iop_ClzNat32:
5205 case Iop_Clz64: case Iop_ClzNat64:
5206 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5208 // PopCount32: this is slightly pessimistic. It is true that the
5209 // result depends on all input bits, so that aspect of the PCast is
5210       // correct.  However, regardless of the input, only the lowest 6 bits
5211       // of the output can ever be undefined (the result lies in 0..32).  So
5212       // we could actually "improve" the results here by marking the top 26
5213       // bits of output as defined.  A similar comment applies for PopCount64.
5214 case Iop_PopCount32:
5215 return mkPCastTo(mce, Ity_I32, vatom);
5216 case Iop_PopCount64:
5217 return mkPCastTo(mce, Ity_I64, vatom);
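      /* Editorial sketch of the improvement described above (not applied
         here).  Since PopCount32's result lies in 0..32, a tighter shadow
         would force bits 6..31 to "defined", for example

            return assignNew('V', mce, Ity_I32,
                             binop(Iop_And32,
                                   mkPCastTo(mce, Ity_I32, vatom),
                                   IRExpr_Const(IRConst_U32(0x3F))));

         and analogously with a 0x7F mask (Iop_And64) for PopCount64. */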
5219 // These are self-shadowing.
5220 case Iop_1Uto64:
5221 case Iop_1Sto64:
5222 case Iop_8Uto64:
5223 case Iop_8Sto64:
5224 case Iop_16Uto64:
5225 case Iop_16Sto64:
5226 case Iop_32Sto64:
5227 case Iop_32Uto64:
5228 case Iop_V128to64:
5229 case Iop_V128HIto64:
5230 case Iop_128HIto64:
5231 case Iop_128to64:
5232 case Iop_Dup8x8:
5233 case Iop_Dup16x4:
5234 case Iop_Dup32x2:
5235 case Iop_Reverse8sIn16_x4:
5236 case Iop_Reverse8sIn32_x2:
5237 case Iop_Reverse16sIn32_x2:
5238 case Iop_Reverse8sIn64_x1:
5239 case Iop_Reverse16sIn64_x1:
5240 case Iop_Reverse32sIn64_x1:
5241 case Iop_V256to64_0: case Iop_V256to64_1:
5242 case Iop_V256to64_2: case Iop_V256to64_3:
5243 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5245 // These are self-shadowing.
5246 case Iop_64to32:
5247 case Iop_64HIto32:
5248 case Iop_1Uto32:
5249 case Iop_1Sto32:
5250 case Iop_8Uto32:
5251 case Iop_16Uto32:
5252 case Iop_16Sto32:
5253 case Iop_8Sto32:
5254 case Iop_V128to32:
5255 case Iop_Reverse8sIn32_x1:
5256 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5258 // These are self-shadowing.
5259 case Iop_1Sto16:
5260 case Iop_8Sto16:
5261 case Iop_8Uto16:
5262 case Iop_32to16:
5263 case Iop_32HIto16:
5264 case Iop_64to16:
5265 case Iop_GetMSBs8x16:
5266 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5268 // These are self-shadowing.
5269 case Iop_1Uto8:
5270 case Iop_1Sto8:
5271 case Iop_16to8:
5272 case Iop_16HIto8:
5273 case Iop_32to8:
5274 case Iop_64to8:
5275 case Iop_GetMSBs8x8:
5276 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5278 case Iop_32to1:
5279 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5281 case Iop_64to1:
5282 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5284 case Iop_NotV256:
5285 case Iop_NotV128:
5286 case Iop_Not64:
5287 case Iop_Not32:
5288 case Iop_Not16:
5289 case Iop_Not8:
5290 case Iop_Not1:
5291 // FIXME JRS 2018-Nov-15. This is surely not correct!
5292 return vatom;
5294 case Iop_CmpNEZ8x8:
5295 case Iop_Cnt8x8:
5296 case Iop_Clz8x8:
5297 case Iop_Cls8x8:
5298 case Iop_Abs8x8:
5299 return mkPCast8x8(mce, vatom);
5301 case Iop_CmpNEZ8x16:
5302 case Iop_Cnt8x16:
5303 case Iop_Clz8x16:
5304 case Iop_Cls8x16:
5305 case Iop_Abs8x16:
5306 case Iop_Ctz8x16:
5307 return mkPCast8x16(mce, vatom);
5309 case Iop_CmpNEZ16x4:
5310 case Iop_Clz16x4:
5311 case Iop_Cls16x4:
5312 case Iop_Abs16x4:
5313 return mkPCast16x4(mce, vatom);
5315 case Iop_CmpNEZ16x8:
5316 case Iop_Clz16x8:
5317 case Iop_Cls16x8:
5318 case Iop_Abs16x8:
5319 case Iop_Ctz16x8:
5320 return mkPCast16x8(mce, vatom);
5322 case Iop_CmpNEZ32x2:
5323 case Iop_Clz32x2:
5324 case Iop_Cls32x2:
5325 case Iop_F32toI32Ux2_RZ:
5326 case Iop_F32toI32Sx2_RZ:
5327 case Iop_Abs32x2:
5328 return mkPCast32x2(mce, vatom);
5330 case Iop_CmpNEZ32x4:
5331 case Iop_Clz32x4:
5332 case Iop_Cls32x4:
5333 case Iop_F32toI32Ux4_RZ:
5334 case Iop_F32toI32Sx4_RZ:
5335 case Iop_Abs32x4:
5336 case Iop_RSqrtEst32Ux4:
5337 case Iop_Ctz32x4:
5338 return mkPCast32x4(mce, vatom);
5340 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5341 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5342 case Iop_CmpwNEZ32:
5343 return mkPCastTo(mce, Ity_I32, vatom);
5345 case Iop_TruncF128toI64S: /* F128 -> I64S */
5346 case Iop_TruncF128toI64U: /* F128 -> I64U */
5347 case Iop_CmpwNEZ64:
5348 return mkPCastTo(mce, Ity_I64, vatom);
5350 case Iop_CmpNEZ64x2:
5351 case Iop_CipherSV128:
5352 case Iop_Clz64x2:
5353 case Iop_Abs64x2:
5354 case Iop_Ctz64x2:
5355 return mkPCast64x2(mce, vatom);
5357 // This is self-shadowing.
5358 case Iop_PwBitMtxXpose64x2:
5359 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5361 case Iop_NarrowUn16to8x8:
5362 case Iop_NarrowUn32to16x4:
5363 case Iop_NarrowUn64to32x2:
5364 case Iop_QNarrowUn16Sto8Sx8:
5365 case Iop_QNarrowUn16Sto8Ux8:
5366 case Iop_QNarrowUn16Uto8Ux8:
5367 case Iop_QNarrowUn32Sto16Sx4:
5368 case Iop_QNarrowUn32Sto16Ux4:
5369 case Iop_QNarrowUn32Uto16Ux4:
5370 case Iop_QNarrowUn64Sto32Sx2:
5371 case Iop_QNarrowUn64Sto32Ux2:
5372 case Iop_QNarrowUn64Uto32Ux2:
5373 return vectorNarrowUnV128(mce, op, vatom);
5375 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5376 // right.
5377 case Iop_F32toF16x4_DEP:
5378 return vectorNarrowUnV128(mce, op, vatom);
5380 case Iop_Widen8Sto16x8:
5381 case Iop_Widen8Uto16x8:
5382 case Iop_Widen16Sto32x4:
5383 case Iop_Widen16Uto32x4:
5384 case Iop_Widen32Sto64x2:
5385 case Iop_Widen32Uto64x2:
5386 return vectorWidenI64(mce, op, vatom);
5388 case Iop_F16toF32x4:
5389 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5390 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5391 // will generate an output 32-bit lane with at least one 1 bit
5392 // set if there's one or more 1 bits set in the input 16 bits. More
5393 // correct code for this is just below, but commented out, so as to
5394 // avoid short-term backend failures on targets that can't do
5395 // Iop_Interleave{LO,HI}16x4.
5396 return vectorWidenI64(mce, op, vatom);
5398 case Iop_F16toF32x8: {
5399 // PCast the input at 16x8. This makes each lane hold either all
5400 // zeroes or all ones.
5401 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5402 // Now double the width of each lane to 32 bits. Because the lanes are
5403 // all zeroes or all ones, we can just copy each lane twice into
5404 // the result. Here's the low half:
5405 IRAtom* widenedLO // :: I32x4
5406 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5407 pcasted, pcasted));
5408 // And the high half:
5409 IRAtom* widenedHI // :: I32x4
5410 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5411 pcasted, pcasted));
5412 // Glue them back together:
5413 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5414 widenedHI, widenedLO));
5417 // See comment just above, for Iop_F16toF32x4
5418 //case Iop_F16toF32x4: {
5419 // // Same scheme as F16toF32x4
5420 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5421 // IRAtom* widenedLO // :: I32x2
5422 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5423 // pcasted, pcasted));
5424 // IRAtom* widenedHI // :: I32x2
5425 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5426 // pcasted, pcasted));
5427 // // Glue them back together:
5428 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5429 // widenedHI, widenedLO));
5432 case Iop_PwAddL32Ux2:
5433 case Iop_PwAddL32Sx2:
5434 return mkPCastTo(mce, Ity_I64,
5435 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5437 case Iop_PwAddL16Ux4:
5438 case Iop_PwAddL16Sx4:
5439 return mkPCast32x2(mce,
5440 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5442 case Iop_PwAddL8Ux8:
5443 case Iop_PwAddL8Sx8:
5444 return mkPCast16x4(mce,
5445 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5447 case Iop_PwAddL32Ux4:
5448 case Iop_PwAddL32Sx4:
5449 return mkPCast64x2(mce,
5450 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5452 case Iop_PwAddL64Ux2:
5453 return mkPCast128x1(mce,
5454 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5456 case Iop_PwAddL16Ux8:
5457 case Iop_PwAddL16Sx8:
5458 return mkPCast32x4(mce,
5459 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5461 case Iop_PwAddL8Ux16:
5462 case Iop_PwAddL8Sx16:
5463 return mkPCast16x8(mce,
5464 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
5466 case Iop_I64UtoF32:
5467 default:
5468 ppIROp(op);
5469 VG_(tool_panic)("memcheck:expr2vbits_Unop");
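/* Editorial illustration (not part of the original source): the lane
   doubling used in the Iop_F16toF32x8 case above, modelled on a single
   lane with plain integer types.  After PCast-ing at 16x8, every lane
   is either all zeroes or all ones, so interleaving a lane with itself
   produces the correctly PCast-ed 32-bit lane.  A minimal sketch only;
   the function name is hypothetical. */
static __attribute__((unused))
UInt f16_to_f32_lane_vbits_model ( UShort vbits16 )
{
   UShort pcasted = vbits16 ? 0xFFFF : 0x0000;    /* mkPCast16x8, per lane */
   return ((UInt)pcasted << 16) | (UInt)pcasted;  /* InterleaveLO/HI16x8   */
}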
5474 /* Worker function -- do not call directly. See comments on
5475 expr2vbits_Load for the meaning of |guard|.
5477 Generates IR to (1) perform a definedness test of |addr|, (2)
5478 perform a validity test of |addr|, and (3) return the Vbits for the
5479 location indicated by |addr|. All of this only happens when
5480 |guard| is NULL or |guard| evaluates to True at run time.
5482 If |guard| evaluates to False at run time, the returned value is
5483 the IR-mandated 0x55..55 value, and no checks or shadow loads are
5484 performed.
5486 The definedness of |guard| itself is not checked. That is assumed
5487 to have been done before this point, by the caller. */
5488 static
5489 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5490 IREndness end, IRType ty,
5491 IRAtom* addr, UInt bias, IRAtom* guard )
5493 tl_assert(isOriginalAtom(mce,addr));
5494 tl_assert(end == Iend_LE || end == Iend_BE);
5496 /* First, emit a definedness test for the address. This also sets
5497 the address (shadow) to 'defined' following the test. */
5498 complainIfUndefined( mce, addr, guard );
5500 /* Now cook up a call to the relevant helper function, to read the data V
5501 bits from shadow memory. Note that I128 loads are done by pretending
5502 we're doing a V128 load, and then converting the resulting V128 vbits
5503 word to an I128, right at the end of this function -- see `castedToI128`
5504 below. (It's only a minor hack :-) This pertains to bug 444399. */
5505 ty = shadowTypeV(ty);
5507 void* helper = NULL;
5508 const HChar* hname = NULL;
5509 Bool ret_via_outparam = False;
5511 if (end == Iend_LE) {
5512 switch (ty) {
5513 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5514 hname = "MC_(helperc_LOADV256le)";
5515 ret_via_outparam = True;
5516 break;
5517 case Ity_I128: // fallthrough. See comment above.
5518 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5519 hname = "MC_(helperc_LOADV128le)";
5520 ret_via_outparam = True;
5521 break;
5522 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5523 hname = "MC_(helperc_LOADV64le)";
5524 break;
5525 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5526 hname = "MC_(helperc_LOADV32le)";
5527 break;
5528 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5529 hname = "MC_(helperc_LOADV16le)";
5530 break;
5531 case Ity_I8: helper = &MC_(helperc_LOADV8);
5532 hname = "MC_(helperc_LOADV8)";
5533 break;
5534 default: ppIRType(ty);
5535 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5537 } else {
5538 switch (ty) {
5539 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5540 hname = "MC_(helperc_LOADV256be)";
5541 ret_via_outparam = True;
5542 break;
5543 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5544 hname = "MC_(helperc_LOADV128be)";
5545 ret_via_outparam = True;
5546 break;
5547 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5548 hname = "MC_(helperc_LOADV64be)";
5549 break;
5550 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5551 hname = "MC_(helperc_LOADV32be)";
5552 break;
5553 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5554 hname = "MC_(helperc_LOADV16be)";
5555 break;
5556 case Ity_I8: helper = &MC_(helperc_LOADV8);
5557 hname = "MC_(helperc_LOADV8)";
5558 break;
5559 default: ppIRType(ty);
5560 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5564 tl_assert(helper);
5565 tl_assert(hname);
5567 /* Generate the actual address into addrAct. */
5568 IRAtom* addrAct;
5569 if (bias == 0) {
5570 addrAct = addr;
5571 } else {
5572 IROp mkAdd;
5573 IRAtom* eBias;
5574 IRType tyAddr = mce->hWordTy;
5575 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5576 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5577 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5578 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5581 /* We need to have a place to park the V bits we're just about to
5582 read. */
5583 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5585 /* Here's the call. */
5586 IRDirty* di;
5587 if (ret_via_outparam) {
5588 di = unsafeIRDirty_1_N( datavbits,
5589 2/*regparms*/,
5590 hname, VG_(fnptr_to_fnentry)( helper ),
5591 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5592 } else {
5593 di = unsafeIRDirty_1_N( datavbits,
5594 1/*regparms*/,
5595 hname, VG_(fnptr_to_fnentry)( helper ),
5596 mkIRExprVec_1( addrAct ) );
5599 setHelperAnns( mce, di );
5600 if (guard) {
5601 di->guard = guard;
5602 /* Ideally the didn't-happen return value here would be all-ones
5603 (all-undefined), so it'd be obvious if it got used
5604 inadvertently. We can get by with the IR-mandated default
5605 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5606 undefined if it ever leaks out. */
5608 stmt( 'V', mce, IRStmt_Dirty(di) );
5610 if (ty == Ity_I128) {
5611 IRAtom* castedToI128
5612 = assignNew('V', mce, Ity_I128,
5613 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5614 return castedToI128;
5615 } else {
5616 return mkexpr(datavbits);
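/* Editorial illustration (not part of the original source): a minimal
   value-level model of the guarded-load convention implemented above,
   assuming a hypothetical |load_helper| standing in for the
   MC_(helperc_LOADV*) functions.  When the guard is false the dirty
   call does not run, and the destination keeps the IR-mandated default
   of 0x55..55, which still reads as (mostly) undefined V bits. */
static __attribute__((unused))
ULong guarded_shadow_load64_model ( Bool guard,
                                    ULong (*load_helper)(Addr), /* hypothetical */
                                    Addr a )
{
   if (!guard)
      return 0x5555555555555555ULL;  /* didn't-happen default value */
   return load_helper(a);            /* V bits for the 8 bytes at |a| */
}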
5621 /* Generate IR to do a shadow load. The helper is expected to check
5622 the validity of the address and return the V bits for that address.
5623 This can optionally be controlled by a guard, which is assumed to
5624 be True if NULL. In the case where the guard is False at runtime,
5625 the helper will return the didn't-do-the-call value of 0x55..55.
5626 Since that means "completely undefined result", the caller of
5627 this function will need to fix up the result somehow in that
5628 case.
5630 Caller of this function is also expected to have checked the
5631 definedness of |guard| before this point.
5633 static
5634 IRAtom* expr2vbits_Load ( MCEnv* mce,
5635 IREndness end, IRType ty,
5636 IRAtom* addr, UInt bias,
5637 IRAtom* guard )
5639 tl_assert(end == Iend_LE || end == Iend_BE);
5640 switch (shadowTypeV(ty)) {
5641 case Ity_I8:
5642 case Ity_I16:
5643 case Ity_I32:
5644 case Ity_I64:
5645 case Ity_I128:
5646 case Ity_V128:
5647 case Ity_V256:
5648 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5649 default:
5650 VG_(tool_panic)("expr2vbits_Load");
5655 /* The most general handler for guarded loads. Assumes the
5656 definedness of GUARD has already been checked by the caller. A
5657 GUARD of NULL is assumed to mean "always True". Generates code to
5658 check the definedness and validity of ADDR.
5660 Generate IR to do a shadow load from ADDR and return the V bits.
5661 The loaded type is TY. The loaded data is then (shadow) widened by
5662 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5663 evaluates to False at run time then the returned Vbits are simply
5664 VALT instead. Note therefore that the argument type of VWIDEN must
5665 be TY and the result type of VWIDEN must equal the type of VALT.
5667 static
5668 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5669 IREndness end, IRType ty,
5670 IRAtom* addr, UInt bias,
5671 IRAtom* guard,
5672 IROp vwiden, IRAtom* valt )
5674 /* Sanity check the conversion operation, and also set TYWIDE. */
5675 IRType tyWide = Ity_INVALID;
5676 switch (vwiden) {
5677 case Iop_INVALID:
5678 tyWide = ty;
5679 break;
5680 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5681 tyWide = Ity_I32;
5682 break;
5683 default:
5684 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5687 /* If the guard evaluates to True, this will hold the loaded V bits
5688 at TY. If the guard evaluates to False, this will be the
5689 IR-mandated 0x55..55 value (mostly undefined), in which case we will have to
5690 replace it using an ITE below. */
5691 IRAtom* iftrue1
5692 = assignNew('V', mce, ty,
5693 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5694 /* Now (shadow-) widen the loaded V bits to the desired width. In
5695 the guard-is-False case, the allowable widening operators will
5696 in the worst case (unsigned widening) at least leave the
5697 pre-widened part as being marked all-undefined, and in the best
5698 case (signed widening) mark the whole widened result as
5699 undefined. Anyway, it doesn't matter really, since in this case
5700 we will replace said value with the default value |valt| using an
5701 ITE. */
5702 IRAtom* iftrue2
5703 = vwiden == Iop_INVALID
5704 ? iftrue1
5705 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5706 /* These are the V bits we will return if the load doesn't take
5707 place. */
5708 IRAtom* iffalse
5709 = valt;
5710 /* Prepare the cond for the ITE. Convert a NULL cond into
5711 something that iropt knows how to fold out later. */
5712 IRAtom* cond
5713 = guard == NULL ? mkU1(1) : guard;
5714 /* And assemble the final result. */
5715 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
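/* Editorial illustration (not part of the original source): the
   value-level effect of the general guarded-load handler above, for the
   case vwiden == Iop_8Sto32.  |loaded_vbits8| stands for the V bits the
   shadow load would produce; the names are hypothetical. */
static __attribute__((unused))
UInt guarded_widening_load_model ( Bool guard, UChar loaded_vbits8,
                                   UInt valt /* V bits used if guard is false */ )
{
   /* Signed widening is self-shadowing: sign-extend the loaded V bits. */
   UInt widened = (UInt)(Int)(signed char)loaded_vbits8;
   /* The final ITE selects the widened V bits when the guard holds. */
   return guard ? widened : valt;
}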
5719 /* A simpler handler for guarded loads, in which there is no
5720 conversion operation, and the default V bit return (when the guard
5721 evaluates to False at runtime) is "all defined". If there is no
5722 guard expression or the guard is always TRUE this function behaves
5723 like expr2vbits_Load. It is assumed that definedness of GUARD has
5724 already been checked at the call site. */
5725 static
5726 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5727 IREndness end, IRType ty,
5728 IRAtom* addr, UInt bias,
5729 IRAtom *guard )
5731 return expr2vbits_Load_guarded_General(
5732 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5737 static
5738 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5739 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5741 IRAtom *vbitsC, *vbits0, *vbits1;
5742 IRType ty;
5743 /* Given ITE(cond, iftrue, iffalse), generate
5744 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5745 That is, steer the V bits like the originals, but trash the
5746 result if the steering value is undefined. This gives
5747 lazy propagation. */
5748 tl_assert(isOriginalAtom(mce, cond));
5749 tl_assert(isOriginalAtom(mce, iftrue));
5750 tl_assert(isOriginalAtom(mce, iffalse));
5752 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5753 vbits1 = expr2vbits(mce, iftrue, HuOth);
5754 vbits0 = expr2vbits(mce, iffalse, HuOth);
5755 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5757 return
5758 mkUifU(mce, ty, assignNew('V', mce, ty,
5759 IRExpr_ITE(cond, vbits1, vbits0)),
5760 mkPCastTo(mce, ty, vbitsC) );
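/* Editorial illustration (not part of the original source): the scheme
   above expressed on 64-bit V words, with 1-bits meaning "undefined".
   UifU is bitwise OR, and PCast-ing the condition's V bit smears any
   uncertainty about the condition over the whole result. */
static __attribute__((unused))
ULong ite_vbits_model ( Bool cond, Bool cond_is_undefined,
                        ULong vbits1, ULong vbits0 )
{
   ULong steered = cond ? vbits1 : vbits0;            /* ITE on the shadows  */
   ULong pcastC  = cond_is_undefined ? ~0ULL : 0ULL;  /* PCastTo(ty, vbitsC) */
   return steered | pcastC;                           /* UifU                */
}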
5763 /* --------- This is the main expression-handling function. --------- */
5765 static
5766 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5767 HowUsed hu/*use HuOth if unknown*/ )
5769 switch (e->tag) {
5771 case Iex_Get:
5772 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5774 case Iex_GetI:
5775 return shadow_GETI( mce, e->Iex.GetI.descr,
5776 e->Iex.GetI.ix, e->Iex.GetI.bias );
5778 case Iex_RdTmp:
5779 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5781 case Iex_Const:
5782 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5784 case Iex_Qop:
5785 return expr2vbits_Qop(
5786 mce,
5787 e->Iex.Qop.details->op,
5788 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5789 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5792 case Iex_Triop:
5793 return expr2vbits_Triop(
5794 mce,
5795 e->Iex.Triop.details->op,
5796 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5797 e->Iex.Triop.details->arg3
5800 case Iex_Binop:
5801 return expr2vbits_Binop(
5802 mce,
5803 e->Iex.Binop.op,
5804 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5808 case Iex_Unop:
5809 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5811 case Iex_Load:
5812 return expr2vbits_Load( mce, e->Iex.Load.end,
5813 e->Iex.Load.ty,
5814 e->Iex.Load.addr, 0/*addr bias*/,
5815 NULL/* guard == "always True"*/ );
5817 case Iex_CCall:
5818 return mkLazyN( mce, e->Iex.CCall.args,
5819 e->Iex.CCall.retty,
5820 e->Iex.CCall.cee );
5822 case Iex_ITE:
5823 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5824 e->Iex.ITE.iffalse);
5826 default:
5827 VG_(printf)("\n");
5828 ppIRExpr(e);
5829 VG_(printf)("\n");
5830 VG_(tool_panic)("memcheck: expr2vbits");
5835 /*------------------------------------------------------------*/
5836 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5837 /*------------------------------------------------------------*/
5839 /* Widen a value to the host word size. */
5841 static
5842 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5844 IRType ty, tyH;
5846 /* vatom is a vbits-value and as such can only have a shadow type. */
5847 tl_assert(isShadowAtom(mce,vatom));
5849 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5850 tyH = mce->hWordTy;
5852 if (tyH == Ity_I32) {
5853 switch (ty) {
5854 case Ity_I32:
5855 return vatom;
5856 case Ity_I16:
5857 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5858 case Ity_I8:
5859 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5860 default:
5861 goto unhandled;
5863 } else
5864 if (tyH == Ity_I64) {
5865 switch (ty) {
5866 case Ity_I32:
5867 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5868 case Ity_I16:
5869 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5870 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5871 case Ity_I8:
5872 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5873 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5874 default:
5875 goto unhandled;
5877 } else {
5878 goto unhandled;
5880 unhandled:
5881 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5882 VG_(tool_panic)("zwidenToHostWord");
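/* Editorial illustration (not part of the original source): why zero
   extension is acceptable in zwidenToHostWord.  The padding bits become
   zero ("defined"), and the STOREV8/16/32 helpers only consult the
   low-order part of the widened word, so nothing extra is ever marked
   as undefined.  A standalone sketch with a hypothetical name. */
static __attribute__((unused))
ULong zwiden16to64_vbits_model ( UShort vbits16 )
{
   return (ULong)vbits16;  /* upper 48 bits become 0, ie "defined" */
}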
5886 /* Generate a shadow store. |addr| is always the original address
5887 atom. You can pass in either originals or V-bits for the data
5888 atom, but obviously not both. This function generates a check for
5889 the definedness and (indirectly) the validity of |addr|, but only
5890 when |guard| evaluates to True at run time (or is NULL).
5892 |guard| :: Ity_I1 controls whether the store really happens; NULL
5893 means it unconditionally does. Note that |guard| itself is not
5894 checked for definedness; the caller of this function must do that
5895 if necessary.
5897 static
5898 void do_shadow_Store ( MCEnv* mce,
5899 IREndness end,
5900 IRAtom* addr, UInt bias,
5901 IRAtom* data, IRAtom* vdata,
5902 IRAtom* guard )
5904 IROp mkAdd;
5905 IRType ty, tyAddr;
5906 void* helper = NULL;
5907 const HChar* hname = NULL;
5908 IRConst* c;
5910 tyAddr = mce->hWordTy;
5911 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5912 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5913 tl_assert( end == Iend_LE || end == Iend_BE );
5915 if (data) {
5916 tl_assert(!vdata);
5917 tl_assert(isOriginalAtom(mce, data));
5918 tl_assert(bias == 0);
5919 vdata = expr2vbits( mce, data, HuOth );
5920 } else {
5921 tl_assert(vdata);
5924 tl_assert(isOriginalAtom(mce,addr));
5925 tl_assert(isShadowAtom(mce,vdata));
5927 if (guard) {
5928 tl_assert(isOriginalAtom(mce, guard));
5929 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5932 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5934 // If we're not doing undefined value checking, pretend that this value
5935 // is "all valid". That lets Vex's optimiser remove some of the V bit
5936 // shadow computation ops that precede it.
5937 if (MC_(clo_mc_level) == 1) {
5938 switch (ty) {
5939 case Ity_V256: // V256 weirdness -- used four times
5940 c = IRConst_V256(V_BITS32_DEFINED); break;
5941 case Ity_V128: // V128 weirdness -- used twice
5942 c = IRConst_V128(V_BITS16_DEFINED); break;
5943 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
5944 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
5945 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
5946 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
5947 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
5948 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5950 vdata = IRExpr_Const( c );
5953 /* First, emit a definedness test for the address. This also sets
5954 the address (shadow) to 'defined' following the test. Both of
5955 those actions are gated on |guard|. */
5956 complainIfUndefined( mce, addr, guard );
5958 /* Now decide which helper function to call to write the data V
5959 bits into shadow memory. */
5960 if (end == Iend_LE) {
5961 switch (ty) {
5962 case Ity_V256: /* we'll use the helper four times */
5963 case Ity_V128: /* we'll use the helper twice */
5964 case Ity_I128: /* we'll use the helper twice */
5965 case Ity_I64: helper = &MC_(helperc_STOREV64le);
5966 hname = "MC_(helperc_STOREV64le)";
5967 break;
5968 case Ity_I32: helper = &MC_(helperc_STOREV32le);
5969 hname = "MC_(helperc_STOREV32le)";
5970 break;
5971 case Ity_I16: helper = &MC_(helperc_STOREV16le);
5972 hname = "MC_(helperc_STOREV16le)";
5973 break;
5974 case Ity_I8: helper = &MC_(helperc_STOREV8);
5975 hname = "MC_(helperc_STOREV8)";
5976 break;
5977 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5979 } else {
5980 switch (ty) {
5981 case Ity_V128: /* we'll use the helper twice */
5982 case Ity_I64: helper = &MC_(helperc_STOREV64be);
5983 hname = "MC_(helperc_STOREV64be)";
5984 break;
5985 case Ity_I32: helper = &MC_(helperc_STOREV32be);
5986 hname = "MC_(helperc_STOREV32be)";
5987 break;
5988 case Ity_I16: helper = &MC_(helperc_STOREV16be);
5989 hname = "MC_(helperc_STOREV16be)";
5990 break;
5991 case Ity_I8: helper = &MC_(helperc_STOREV8);
5992 hname = "MC_(helperc_STOREV8)";
5993 break;
5994 /* Note, no V256 case here, because no big-endian target that
5995 we support has 256-bit vectors. */
5996 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6000 if (UNLIKELY(ty == Ity_V256)) {
6002 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6003 Q3 being the most significant lane. */
6004 /* These are the offsets of the Qs in memory. */
6005 Int offQ0, offQ1, offQ2, offQ3;
6007 /* Various bits for constructing the 4 lane helper calls */
6008 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6009 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6010 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6011 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6013 if (end == Iend_LE) {
6014 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6015 } else {
6016 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6019 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6020 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6021 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6022 diQ0 = unsafeIRDirty_0_N(
6023 1/*regparms*/,
6024 hname, VG_(fnptr_to_fnentry)( helper ),
6025 mkIRExprVec_2( addrQ0, vdataQ0 )
6028 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6029 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6030 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6031 diQ1 = unsafeIRDirty_0_N(
6032 1/*regparms*/,
6033 hname, VG_(fnptr_to_fnentry)( helper ),
6034 mkIRExprVec_2( addrQ1, vdataQ1 )
6037 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6038 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6039 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6040 diQ2 = unsafeIRDirty_0_N(
6041 1/*regparms*/,
6042 hname, VG_(fnptr_to_fnentry)( helper ),
6043 mkIRExprVec_2( addrQ2, vdataQ2 )
6046 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6047 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6048 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6049 diQ3 = unsafeIRDirty_0_N(
6050 1/*regparms*/,
6051 hname, VG_(fnptr_to_fnentry)( helper ),
6052 mkIRExprVec_2( addrQ3, vdataQ3 )
6055 if (guard)
6056 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6058 setHelperAnns( mce, diQ0 );
6059 setHelperAnns( mce, diQ1 );
6060 setHelperAnns( mce, diQ2 );
6061 setHelperAnns( mce, diQ3 );
6062 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6063 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6064 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6065 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6068 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6070 /* V128/I128-bit case */
6071 /* See comment in next clause re 64-bit regparms */
6072 /* also, need to be careful about endianness */
6074 Int offLo64, offHi64;
6075 IRDirty *diLo64, *diHi64;
6076 IRAtom *addrLo64, *addrHi64;
6077 IRAtom *vdataLo64, *vdataHi64;
6078 IRAtom *eBiasLo64, *eBiasHi64;
6079 IROp opGetLO64, opGetHI64;
6081 if (end == Iend_LE) {
6082 offLo64 = 0;
6083 offHi64 = 8;
6084 } else {
6085 offLo64 = 8;
6086 offHi64 = 0;
6089 if (ty == Ity_V128) {
6090 opGetLO64 = Iop_V128to64;
6091 opGetHI64 = Iop_V128HIto64;
6092 } else {
6093 opGetLO64 = Iop_128to64;
6094 opGetHI64 = Iop_128HIto64;
6097 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6098 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6099 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6100 diLo64 = unsafeIRDirty_0_N(
6101 1/*regparms*/,
6102 hname, VG_(fnptr_to_fnentry)( helper ),
6103 mkIRExprVec_2( addrLo64, vdataLo64 )
6105 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6106 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6107 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6108 diHi64 = unsafeIRDirty_0_N(
6109 1/*regparms*/,
6110 hname, VG_(fnptr_to_fnentry)( helper ),
6111 mkIRExprVec_2( addrHi64, vdataHi64 )
6113 if (guard) diLo64->guard = guard;
6114 if (guard) diHi64->guard = guard;
6115 setHelperAnns( mce, diLo64 );
6116 setHelperAnns( mce, diHi64 );
6117 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6118 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6120 } else {
6122 IRDirty *di;
6123 IRAtom *addrAct;
6125 /* 8/16/32/64-bit cases */
6126 /* Generate the actual address into addrAct. */
6127 if (bias == 0) {
6128 addrAct = addr;
6129 } else {
6130 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6131 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6134 if (ty == Ity_I64) {
6135 /* We can't do this with regparm 2 on 32-bit platforms, since
6136 the back ends aren't clever enough to handle 64-bit
6137 regparm args. Therefore be different. */
6138 di = unsafeIRDirty_0_N(
6139 1/*regparms*/,
6140 hname, VG_(fnptr_to_fnentry)( helper ),
6141 mkIRExprVec_2( addrAct, vdata )
6143 } else {
6144 di = unsafeIRDirty_0_N(
6145 2/*regparms*/,
6146 hname, VG_(fnptr_to_fnentry)( helper ),
6147 mkIRExprVec_2( addrAct,
6148 zwidenToHostWord( mce, vdata ))
6151 if (guard) di->guard = guard;
6152 setHelperAnns( mce, di );
6153 stmt( 'V', mce, IRStmt_Dirty(di) );
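/* Editorial illustration (not part of the original source): the lane
   offsets used above when a 256-bit store is split into four 64-bit
   helper calls, with Q0 being the least significant lane.  The function
   name is hypothetical. */
static __attribute__((unused))
void v256_store_offsets_model ( Bool bigEndian, Int* offQ /* Int[4] */ )
{
   if (!bigEndian) {
      offQ[0] = 0;  offQ[1] = 8;  offQ[2] = 16;  offQ[3] = 24;
   } else {
      offQ[3] = 0;  offQ[2] = 8;  offQ[1] = 16;  offQ[0] = 24;
   }
}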
6159 /* Do lazy pessimistic propagation through a dirty helper call, by
6160 looking at the annotations on it. This is the most complex part of
6161 Memcheck. */
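/* Editorial illustration (not part of the original source): the overall
   shape of the propagation performed by do_shadow_Dirty below, modelled
   on 32-bit V words.  Each input's V bits are pessimistically squashed
   ("PCast") to a single all-zeroes/all-ones word, the words are merged
   with UifU (bitwise OR), and the merged word is later smeared over
   every output.  The function name is hypothetical. */
static __attribute__((unused))
UInt dirty_propagation_model ( const UInt* input_vbits, Int n_inputs )
{
   UInt curr = 0;                               /* definedOfType(Ity_I32) */
   for (Int i = 0; i < n_inputs; i++) {
      UInt here = input_vbits[i] ? ~0u : 0u;    /* mkPCastTo(Ity_I32, ..) */
      curr = curr | here;                       /* mkUifU32               */
   }
   return curr;  /* each output then receives mkPCastTo(tyDst, curr) */
}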
6163 static IRType szToITy ( Int n )
6165 switch (n) {
6166 case 1: return Ity_I8;
6167 case 2: return Ity_I16;
6168 case 4: return Ity_I32;
6169 case 8: return Ity_I64;
6170 default: VG_(tool_panic)("szToITy(memcheck)");
6174 static
6175 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6177 Int i, k, n, toDo, gSz, gOff;
6178 IRAtom *src, *here, *curr;
6179 IRType tySrc, tyDst;
6180 IRTemp dst;
6181 IREndness end;
6183 /* What's the native endianness? We need to know this. */
6184 # if defined(VG_BIGENDIAN)
6185 end = Iend_BE;
6186 # elif defined(VG_LITTLEENDIAN)
6187 end = Iend_LE;
6188 # else
6189 # error "Unknown endianness"
6190 # endif
6192 /* First check the guard. */
6193 complainIfUndefined(mce, d->guard, NULL);
6195 /* Now round up all inputs and PCast over them. */
6196 curr = definedOfType(Ity_I32);
6198 /* Inputs: unmasked args
6199 Note: arguments are evaluated REGARDLESS of the guard expression */
6200 for (i = 0; d->args[i]; i++) {
6201 IRAtom* arg = d->args[i];
6202 if ( (d->cee->mcx_mask & (1<<i))
6203 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6204 /* ignore this arg */
6205 } else {
6206 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6207 curr = mkUifU32(mce, here, curr);
6211 /* Inputs: guest state that we read. */
6212 for (i = 0; i < d->nFxState; i++) {
6213 tl_assert(d->fxState[i].fx != Ifx_None);
6214 if (d->fxState[i].fx == Ifx_Write)
6215 continue;
6217 /* Enumerate the described state segments */
6218 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6219 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6220 gSz = d->fxState[i].size;
6222 /* Ignore any sections marked as 'always defined'. */
6223 if (isAlwaysDefd(mce, gOff, gSz)) {
6224 if (0)
6225 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6226 gOff, gSz);
6227 continue;
6230 /* This state element is read or modified. So we need to
6231 consider it. If larger than 8 bytes, deal with it in
6232 8-byte chunks. */
6233 while (True) {
6234 tl_assert(gSz >= 0);
6235 if (gSz == 0) break;
6236 n = gSz <= 8 ? gSz : 8;
6237 /* update 'curr' with UifU of the state slice
6238 gOff .. gOff+n-1 */
6239 tySrc = szToITy( n );
6241 /* Observe the guard expression. If it is false use an
6242 all-bits-defined bit pattern */
6243 IRAtom *cond, *iffalse, *iftrue;
6245 cond = assignNew('V', mce, Ity_I1, d->guard);
6246 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6247 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6248 src = assignNew('V', mce, tySrc,
6249 IRExpr_ITE(cond, iftrue, iffalse));
6251 here = mkPCastTo( mce, Ity_I32, src );
6252 curr = mkUifU32(mce, here, curr);
6253 gSz -= n;
6254 gOff += n;
6259 /* Inputs: memory. First set up some info needed regardless of
6260 whether we're doing reads or writes. */
6262 if (d->mFx != Ifx_None) {
6263 /* Because we may do multiple shadow loads/stores from the same
6264 base address, it's best to do a single test of its
6265 definedness right now. Post-instrumentation optimisation
6266 should remove all but this test. */
6267 IRType tyAddr;
6268 tl_assert(d->mAddr);
6269 complainIfUndefined(mce, d->mAddr, d->guard);
6271 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6272 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6273 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6276 /* Deal with memory inputs (reads or modifies) */
6277 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6278 toDo = d->mSize;
6279 /* chew off 32-bit chunks. We don't care about the endianness
6280 since it's all going to be condensed down to a single bit,
6281 but nevertheless choose an endianness which is hopefully
6282 native to the platform. */
6283 while (toDo >= 4) {
6284 here = mkPCastTo(
6285 mce, Ity_I32,
6286 expr2vbits_Load_guarded_Simple(
6287 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6289 curr = mkUifU32(mce, here, curr);
6290 toDo -= 4;
6292 /* chew off 16-bit chunks */
6293 while (toDo >= 2) {
6294 here = mkPCastTo(
6295 mce, Ity_I32,
6296 expr2vbits_Load_guarded_Simple(
6297 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6299 curr = mkUifU32(mce, here, curr);
6300 toDo -= 2;
6302 /* chew off the remaining 8-bit chunk, if any */
6303 if (toDo == 1) {
6304 here = mkPCastTo(
6305 mce, Ity_I32,
6306 expr2vbits_Load_guarded_Simple(
6307 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6309 curr = mkUifU32(mce, here, curr);
6310 toDo -= 1;
6312 tl_assert(toDo == 0);
6315 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6316 all the inputs to the helper. Now we need to re-distribute the
6317 results to all destinations. */
6319 /* Outputs: the destination temporary, if there is one. */
6320 if (d->tmp != IRTemp_INVALID) {
6321 dst = findShadowTmpV(mce, d->tmp);
6322 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6323 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6326 /* Outputs: guest state that we write or modify. */
6327 for (i = 0; i < d->nFxState; i++) {
6328 tl_assert(d->fxState[i].fx != Ifx_None);
6329 if (d->fxState[i].fx == Ifx_Read)
6330 continue;
6332 /* Enumerate the described state segments */
6333 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6334 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6335 gSz = d->fxState[i].size;
6337 /* Ignore any sections marked as 'always defined'. */
6338 if (isAlwaysDefd(mce, gOff, gSz))
6339 continue;
6341 /* This state element is written or modified. So we need to
6342 consider it. If larger than 8 bytes, deal with it in
6343 8-byte chunks. */
6344 while (True) {
6345 tl_assert(gSz >= 0);
6346 if (gSz == 0) break;
6347 n = gSz <= 8 ? gSz : 8;
6348 /* Write suitably-casted 'curr' to the state slice
6349 gOff .. gOff+n-1 */
6350 tyDst = szToITy( n );
6351 do_shadow_PUT( mce, gOff,
6352 NULL, /* original atom */
6353 mkPCastTo( mce, tyDst, curr ), d->guard );
6354 gSz -= n;
6355 gOff += n;
6360 /* Outputs: memory that we write or modify. Same comments about
6361 endianness as above apply. */
6362 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6363 toDo = d->mSize;
6364 /* chew off 32-bit chunks */
6365 while (toDo >= 4) {
6366 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6367 NULL, /* original data */
6368 mkPCastTo( mce, Ity_I32, curr ),
6369 d->guard );
6370 toDo -= 4;
6372 /* chew off 16-bit chunks */
6373 while (toDo >= 2) {
6374 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6375 NULL, /* original data */
6376 mkPCastTo( mce, Ity_I16, curr ),
6377 d->guard );
6378 toDo -= 2;
6380 /* chew off the remaining 8-bit chunk, if any */
6381 if (toDo == 1) {
6382 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6383 NULL, /* original data */
6384 mkPCastTo( mce, Ity_I8, curr ),
6385 d->guard );
6386 toDo -= 1;
6388 tl_assert(toDo == 0);
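/* Editorial illustration (not part of the original source): the 4/2/1
   chunking used above for a dirty call's memory effects, as a
   standalone sketch.  It enumerates the (offset, size) pairs that the
   shadow loads/stores cover for a given mSize; |visit| is a
   hypothetical callback. */
static __attribute__((unused))
void dirty_mem_chunks_model ( Int mSize, void (*visit)(Int offset, Int szB) )
{
   Int toDo = mSize;
   while (toDo >= 4) { visit(mSize - toDo, 4); toDo -= 4; }  /* 32-bit chunks */
   while (toDo >= 2) { visit(mSize - toDo, 2); toDo -= 2; }  /* 16-bit chunks */
   if (toDo == 1)    { visit(mSize - toDo, 1); toDo -= 1; }  /* final byte    */
   /* Every mSize is covered exactly once: whole 32-bit chunks, then at
      most one 16-bit chunk, then at most one byte. */
}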
6394 /* We have an ABI hint telling us that [base .. base+len-1] is to
6395 become undefined ("writable"). Generate code to call a helper to
6396 notify the A/V bit machinery of this fact.
6398 We call
6399 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6400 Addr nia );
6402 static
6403 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6405 IRDirty* di;
6407 if (MC_(clo_mc_level) == 3) {
6408 di = unsafeIRDirty_0_N(
6409 3/*regparms*/,
6410 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6411 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6412 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6414 } else {
6415 /* We ignore the supplied nia, since it is irrelevant. */
6416 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6417 /* Special-case the len==128 case, since that is for amd64-ELF,
6418 which is a very common target. */
6419 if (len == 128) {
6420 di = unsafeIRDirty_0_N(
6421 1/*regparms*/,
6422 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6423 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6424 mkIRExprVec_1( base )
6426 } else {
6427 di = unsafeIRDirty_0_N(
6428 2/*regparms*/,
6429 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6430 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6431 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6436 stmt( 'V', mce, IRStmt_Dirty(di) );
6440 /* ------ Dealing with IRCAS (big and complex) ------ */
6442 /* FWDS */
6443 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6444 IRAtom* baseaddr, Int offset );
6445 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6446 static void gen_store_b ( MCEnv* mce, Int szB,
6447 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6448 IRAtom* guard );
6450 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6451 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6454 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6455 IRExpr.Consts, else this asserts. If they are both Consts, it
6456 doesn't do anything. So that just leaves the RdTmp case.
6458 In which case: this assigns the shadow value SHADOW to the IR
6459 shadow temporary associated with ORIG. That is, ORIG, being an
6460 original temporary, will have a shadow temporary associated with
6461 it. However, in the case envisaged here, there will so far have
6462 been no IR emitted to actually write a shadow value into that
6463 temporary. What this routine does is to (emit IR to) copy the
6464 value in SHADOW into said temporary, so that after this call,
6465 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6466 value in SHADOW.
6468 Point is to allow callers to compute "by hand" a shadow value for
6469 ORIG, and force it to be associated with ORIG.
6471 How do we know that the shadow associated with ORIG has not so far
6472 been assigned to? Well, we don't per se know that, but supposing
6473 it had. Then this routine would create a second assignment to it,
6474 and later the IR sanity checker would barf. But that never
6475 happens. QED.
6477 static void bind_shadow_tmp_to_orig ( UChar how,
6478 MCEnv* mce,
6479 IRAtom* orig, IRAtom* shadow )
6481 tl_assert(isOriginalAtom(mce, orig));
6482 tl_assert(isShadowAtom(mce, shadow));
6483 switch (orig->tag) {
6484 case Iex_Const:
6485 tl_assert(shadow->tag == Iex_Const);
6486 break;
6487 case Iex_RdTmp:
6488 tl_assert(shadow->tag == Iex_RdTmp);
6489 if (how == 'V') {
6490 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6491 shadow);
6492 } else {
6493 tl_assert(how == 'B');
6494 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6495 shadow);
6497 break;
6498 default:
6499 tl_assert(0);
6504 static
6505 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6507 /* Scheme is (both single- and double- cases):
6509 1. fetch data#,dataB (the proposed new value)
6511 2. fetch expd#,expdB (what we expect to see at the address)
6513 3. check definedness of address
6515 4. load old#,oldB from shadow memory; this also checks
6516 addressability of the address
6518 5. the CAS itself
6520 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6522 7. if "expected == old" (as computed by (6))
6523 store data#,dataB to shadow memory
6525 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6526 'data' but 7 stores 'data#'. Hence it is possible for the
6527 shadow data to be incorrectly checked and/or updated:
6529 * 7 is at least gated correctly, since the 'expected == old'
6530 condition is derived from outputs of 5. However, the shadow
6531 write could happen too late: imagine after 5 we are
6532 descheduled, a different thread runs, writes a different
6533 (shadow) value at the address, and then we resume, hence
6534 overwriting the shadow value written by the other thread.
6536 Because the original memory access is atomic, there's no way to
6537 make both the original and shadow accesses into a single atomic
6538 thing, hence this is unavoidable.
6540 At least as Valgrind stands, I don't think it's a problem, since
6541 we're single threaded *and* we guarantee that there are no
6542 context switches during the execution of any specific superblock
6543 -- context switches can only happen at superblock boundaries.
6545 If Valgrind ever becomes MT in the future, then it might be more
6546 of a problem. A possible kludge would be to artificially
6547 associate with the location, a lock, which we must acquire and
6548 release around the transaction as a whole. Hmm, that probably
6549 wouldn't work properly since it only guards us against other
6550 threads doing CASs on the same location, not against other
6551 threads doing normal reads and writes.
6553 ------------------------------------------------------------
6555 COMMENT_ON_CasCmpEQ:
6557 Note two things. Firstly, in the sequence above, we compute
6558 "expected == old", but we don't check definedness of it. Why
6559 not? Also, the x86 and amd64 front ends use
6560 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6561 determination (expected == old ?) for themselves, and we also
6562 don't check definedness for those primops; we just say that the
6563 result is defined. Why? Details follow.
6565 x86/amd64 contains various forms of locked insns:
6566 * lock prefix before all basic arithmetic insns;
6567 eg lock xorl %reg1,(%reg2)
6568 * atomic exchange reg-mem
6569 * compare-and-swaps
6571 Rather than attempt to represent them all, which would be a
6572 royal PITA, I used a result from Maurice Herlihy
6573 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6574 demonstrates that compare-and-swap is a primitive more general
6575 than the other two, and so can be used to represent all of them.
6576 So the translation scheme for (eg) lock incl (%reg) is as
6577 follows:
6579 again:
6580 old = * %reg
6581 new = old + 1
6582 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6584 The "atomically" is the CAS bit. The scheme is always the same:
6585 get old value from memory, compute new value, atomically stuff
6586 new value back in memory iff the old value has not changed (iow,
6587 no other thread modified it in the meantime). If it has changed
6588 then we've been out-raced and we have to start over.
6590 Now that's all very neat, but it has the bad side effect of
6591 introducing an explicit equality test into the translation.
6592 Consider the behaviour of said code on a memory location which
6593 is uninitialised. We will wind up doing a comparison on
6594 uninitialised data, and mc duly complains.
6596 What's difficult about this is, the common case is that the
6597 location is uncontended, and so we're usually comparing the same
6598 value (* %reg) with itself. So we shouldn't complain even if it
6599 is undefined. But mc doesn't know that.
6601 My solution is to mark the == in the IR specially, so as to tell
6602 mc that it almost certainly compares a value with itself, and we
6603 should just regard the result as always defined. Rather than
6604 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6605 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6607 So there's always the question of, can this give a false
6608 negative? eg, imagine that initially, * %reg is defined; and we
6609 read that; but then in the gap between the read and the CAS, a
6610 different thread writes an undefined (and different) value at
6611 the location. Then the CAS in this thread will fail and we will
6612 go back to "again:", but without knowing that the trip back
6613 there was based on an undefined comparison. No matter; at least
6614 the other thread won the race and the location is correctly
6615 marked as undefined. What if it wrote an uninitialised version
6616 of the same value that was there originally, though?
6618 etc etc. Seems like there's a small corner case in which we
6619 might lose the fact that something's defined -- we're out-raced
6620 in between the "old = * reg" and the "atomically {", _and_ the
6621 other thread is writing in an undefined version of what's
6622 already there. Well, that seems pretty unlikely.
6626 If we ever need to reinstate it .. code which generates a
6627 definedness test for "expected == old" was removed at r10432 of
6628 this file.
6630 if (cas->oldHi == IRTemp_INVALID) {
6631 do_shadow_CAS_single( mce, cas );
6632 } else {
6633 do_shadow_CAS_double( mce, cas );
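/* Editorial illustration (not part of the original source): the
   "lock incl as a CAS loop" translation scheme described in
   COMMENT_ON_CasCmpEQ above, written out with the GCC/Clang __sync
   builtin.  This is only a sketch of the concept at the C level;
   Memcheck itself operates on the IR form. */
static __attribute__((unused))
void locked_increment_model ( UInt* p )
{
   while (1) {
      UInt old     = *p;
      UInt new_val = old + 1;
      /* atomically { if (*p == old) { *p = new_val } else retry } */
      if (__sync_val_compare_and_swap(p, old, new_val) == old)
         break;
   }
}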
6638 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6640 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6641 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6642 IRAtom *voldLo = NULL, *boldLo = NULL;
6643 IRAtom *expd_eq_old = NULL;
6644 IROp opCasCmpEQ;
6645 Int elemSzB;
6646 IRType elemTy;
6647 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6649 /* single CAS */
6650 tl_assert(cas->oldHi == IRTemp_INVALID);
6651 tl_assert(cas->expdHi == NULL);
6652 tl_assert(cas->dataHi == NULL);
6654 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6655 switch (elemTy) {
6656 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6657 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6658 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6659 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6660 default: tl_assert(0); /* IR defn disallows any other types */
6663 /* 1. fetch data# (the proposed new value) */
6664 tl_assert(isOriginalAtom(mce, cas->dataLo));
6665 vdataLo
6666 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6667 tl_assert(isShadowAtom(mce, vdataLo));
6668 if (otrak) {
6669 bdataLo
6670 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6671 tl_assert(isShadowAtom(mce, bdataLo));
6674 /* 2. fetch expected# (what we expect to see at the address) */
6675 tl_assert(isOriginalAtom(mce, cas->expdLo));
6676 vexpdLo
6677 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6678 tl_assert(isShadowAtom(mce, vexpdLo));
6679 if (otrak) {
6680 bexpdLo
6681 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6682 tl_assert(isShadowAtom(mce, bexpdLo));
6685 /* 3. check definedness of address */
6686 /* 4. fetch old# from shadow memory; this also checks
6687 addressability of the address */
6688 voldLo
6689 = assignNew(
6690 'V', mce, elemTy,
6691 expr2vbits_Load(
6692 mce,
6693 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6694 NULL/*always happens*/
6696 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6697 if (otrak) {
6698 boldLo
6699 = assignNew('B', mce, Ity_I32,
6700 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6701 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6704 /* 5. the CAS itself */
6705 stmt( 'C', mce, IRStmt_CAS(cas) );
6707 /* 6. compute "expected == old" */
6708 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6709 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6710 tree, but it's not copied from the input block. */
6711 expd_eq_old
6712 = assignNew('C', mce, Ity_I1,
6713 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6715 /* 7. if "expected == old"
6716 store data# to shadow memory */
6717 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6718 NULL/*data*/, vdataLo/*vdata*/,
6719 expd_eq_old/*guard for store*/ );
6720 if (otrak) {
6721 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6722 bdataLo/*bdata*/,
6723 expd_eq_old/*guard for store*/ );
6728 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6730 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6731 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6732 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6733 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6734 IRAtom *voldHi = NULL, *boldHi = NULL;
6735 IRAtom *voldLo = NULL, *boldLo = NULL;
6736 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6737 IRAtom *expd_eq_old = NULL, *zero = NULL;
6738 IROp opCasCmpEQ, opOr, opXor;
6739 Int elemSzB, memOffsLo, memOffsHi;
6740 IRType elemTy;
6741 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6743 /* double CAS */
6744 tl_assert(cas->oldHi != IRTemp_INVALID);
6745 tl_assert(cas->expdHi != NULL);
6746 tl_assert(cas->dataHi != NULL);
6748 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6749 switch (elemTy) {
6750 case Ity_I8:
6751 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6752 elemSzB = 1; zero = mkU8(0);
6753 break;
6754 case Ity_I16:
6755 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6756 elemSzB = 2; zero = mkU16(0);
6757 break;
6758 case Ity_I32:
6759 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6760 elemSzB = 4; zero = mkU32(0);
6761 break;
6762 case Ity_I64:
6763 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6764 elemSzB = 8; zero = mkU64(0);
6765 break;
6766 default:
6767 tl_assert(0); /* IR defn disallows any other types */
6770 /* 1. fetch data# (the proposed new value) */
6771 tl_assert(isOriginalAtom(mce, cas->dataHi));
6772 tl_assert(isOriginalAtom(mce, cas->dataLo));
6773 vdataHi
6774 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6775 vdataLo
6776 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6777 tl_assert(isShadowAtom(mce, vdataHi));
6778 tl_assert(isShadowAtom(mce, vdataLo));
6779 if (otrak) {
6780 bdataHi
6781 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6782 bdataLo
6783 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6784 tl_assert(isShadowAtom(mce, bdataHi));
6785 tl_assert(isShadowAtom(mce, bdataLo));
6788 /* 2. fetch expected# (what we expect to see at the address) */
6789 tl_assert(isOriginalAtom(mce, cas->expdHi));
6790 tl_assert(isOriginalAtom(mce, cas->expdLo));
6791 vexpdHi
6792 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6793 vexpdLo
6794 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6795 tl_assert(isShadowAtom(mce, vexpdHi));
6796 tl_assert(isShadowAtom(mce, vexpdLo));
6797 if (otrak) {
6798 bexpdHi
6799 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6800 bexpdLo
6801 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6802 tl_assert(isShadowAtom(mce, bexpdHi));
6803 tl_assert(isShadowAtom(mce, bexpdLo));
6806 /* 3. check definedness of address */
6807 /* 4. fetch old# from shadow memory; this also checks
6808 addressability of the address */
6809 if (cas->end == Iend_LE) {
6810 memOffsLo = 0;
6811 memOffsHi = elemSzB;
6812 } else {
6813 tl_assert(cas->end == Iend_BE);
6814 memOffsLo = elemSzB;
6815 memOffsHi = 0;
6817 voldHi
6818 = assignNew(
6819 'V', mce, elemTy,
6820 expr2vbits_Load(
6821 mce,
6822 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6823 NULL/*always happens*/
6825 voldLo
6826 = assignNew(
6827 'V', mce, elemTy,
6828 expr2vbits_Load(
6829 mce,
6830 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6831 NULL/*always happens*/
6833 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6834 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6835 if (otrak) {
6836 boldHi
6837 = assignNew('B', mce, Ity_I32,
6838 gen_load_b(mce, elemSzB, cas->addr,
6839 memOffsHi/*addr bias*/));
6840 boldLo
6841 = assignNew('B', mce, Ity_I32,
6842 gen_load_b(mce, elemSzB, cas->addr,
6843 memOffsLo/*addr bias*/));
6844 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6845 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6848 /* 5. the CAS itself */
6849 stmt( 'C', mce, IRStmt_CAS(cas) );
6851 /* 6. compute "expected == old" */
6852 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6853 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6854 tree, but it's not copied from the input block. */
6856 xHi = oldHi ^ expdHi;
6857 xLo = oldLo ^ expdLo;
6858 xHL = xHi | xLo;
6859 expd_eq_old = xHL == 0;
6861 xHi = assignNew('C', mce, elemTy,
6862 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6863 xLo = assignNew('C', mce, elemTy,
6864 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6865 xHL = assignNew('C', mce, elemTy,
6866 binop(opOr, xHi, xLo));
6867 expd_eq_old
6868 = assignNew('C', mce, Ity_I1,
6869 binop(opCasCmpEQ, xHL, zero));
6871 /* 7. if "expected == old"
6872 store data# to shadow memory */
6873 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6874 NULL/*data*/, vdataHi/*vdata*/,
6875 expd_eq_old/*guard for store*/ );
6876 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6877 NULL/*data*/, vdataLo/*vdata*/,
6878 expd_eq_old/*guard for store*/ );
6879 if (otrak) {
6880 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6881 bdataHi/*bdata*/,
6882 expd_eq_old/*guard for store*/ );
6883 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6884 bdataLo/*bdata*/,
6885 expd_eq_old/*guard for store*/ );
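/* Editorial illustration (not part of the original source): the
   "expected == old" computation used above for the double-width CAS,
   reduced to plain integers.  Both halves are checked with a single
   comparison by OR-ing the two XOR differences.  The function name is
   hypothetical. */
static __attribute__((unused))
Bool dcas_expected_eq_old_model ( ULong expdHi, ULong expdLo,
                                  ULong oldHi, ULong oldLo )
{
   ULong xHi = expdHi ^ oldHi;
   ULong xLo = expdLo ^ oldLo;
   ULong xHL = xHi | xLo;
   return xHL == 0;  /* corresponds to binop(opCasCmpEQ, xHL, zero) */
}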
6890 /* ------ Dealing with LL/SC (not difficult) ------ */
6892 static void do_shadow_LLSC ( MCEnv* mce,
6893 IREndness stEnd,
6894 IRTemp stResult,
6895 IRExpr* stAddr,
6896 IRExpr* stStoredata )
6898 /* In short: treat a load-linked like a normal load followed by an
6899 assignment of the loaded (shadow) data to the result temporary.
6900 Treat a store-conditional like a normal store, and mark the
6901 result temporary as defined. */
6902 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
6903 IRTemp resTmp = findShadowTmpV(mce, stResult);
6905 tl_assert(isIRAtom(stAddr));
6906 if (stStoredata)
6907 tl_assert(isIRAtom(stStoredata));
6909 if (stStoredata == NULL) {
6910 /* Load Linked */
6911 /* Just treat this as a normal load, followed by an assignment of
6912 the value to .result. */
6913 /* Stay sane */
6914 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
6915 || resTy == Ity_I16 || resTy == Ity_I8);
6916 assign( 'V', mce, resTmp,
6917 expr2vbits_Load(
6918 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6919 NULL/*always happens*/) );
6920 } else {
6921 /* Store Conditional */
6922 /* Stay sane */
6923 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6924 stStoredata);
6925 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
6926 || dataTy == Ity_I16 || dataTy == Ity_I8);
6927 do_shadow_Store( mce, stEnd,
6928 stAddr, 0/* addr bias */,
6929 stStoredata,
6930 NULL /* shadow data */,
6931 NULL/*guard*/ );
6932 /* This is a store conditional, so it writes to .result a value
6933 indicating whether or not the store succeeded. Just claim
6934 this value is always defined. In the PowerPC interpretation
6935 of store-conditional, definedness of the success indication
6936 depends on whether the address of the store matches the
6937 reservation address. But we can't tell that here (and
6938 anyway, we're not being PowerPC-specific). At least we are
6939 guaranteed that the definedness of the store address, and its
6940 addressability, will be checked as per normal. So it seems
6941 pretty safe to just say that the success indication is always
6942 defined.
6944 In schemeS, for origin tracking, we must correspondingly set
6945 a no-origin value for the origin shadow of .result.
6947 tl_assert(resTy == Ity_I1);
6948 assign( 'V', mce, resTmp, definedOfType(resTy) );
6953 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6955 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6957 complainIfUndefined(mce, sg->guard, NULL);
6958 /* do_shadow_Store will generate code to check the definedness and
6959 validity of sg->addr, in the case where sg->guard evaluates to
6960 True at run-time. */
6961 do_shadow_Store( mce, sg->end,
6962 sg->addr, 0/* addr bias */,
6963 sg->data,
6964 NULL /* shadow data */,
6965 sg->guard );
6968 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6970 complainIfUndefined(mce, lg->guard, NULL);
6971 /* expr2vbits_Load_guarded_General will generate code to check the
6972 definedness and validity of lg->addr, in the case where
6973 lg->guard evaluates to True at run-time. */
6975 /* Look at the LoadG's built-in conversion operation, to determine
6976 the source (actual loaded data) type, and the equivalent IROp.
6977 NOTE that implicitly we are taking a widening operation to be
6978 applied to original atoms and producing one that applies to V
6979 bits. Since signed and unsigned widening are self-shadowing,
6980 this is a straight copy of the op (modulo swapping from the
6981 IRLoadGOp form to the IROp form). Note also therefore that this
6982 implicitly duplicates the logic to do with said widening ops in
6983 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
6984 IROp vwiden = Iop_INVALID;
6985 IRType loadedTy = Ity_INVALID;
6986 switch (lg->cvt) {
6987 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
6988 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
6989 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
6990 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
6991 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
6992 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
6993 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
6994 default: VG_(tool_panic)("do_shadow_LoadG");
6997 IRAtom* vbits_alt
6998 = expr2vbits( mce, lg->alt, HuOth );
6999 IRAtom* vbits_final
7000 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7001 lg->addr, 0/*addr bias*/,
7002 lg->guard, vwiden, vbits_alt );
7003 /* And finally, bind the V bits to the destination temporary. */
7004 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
7008 /*------------------------------------------------------------*/
7009 /*--- Origin tracking stuff ---*/
7010 /*------------------------------------------------------------*/
7012 /* Almost identical to findShadowTmpV. */
7013 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7015 TempMapEnt* ent;
7016 /* VG_(indexXA) range-checks 'orig', hence no need to check
7017 here. */
7018 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7019 tl_assert(ent->kind == Orig);
7020 if (ent->shadowB == IRTemp_INVALID) {
7021 IRTemp tmpB
7022 = newTemp( mce, Ity_I32, BSh );
7023 /* newTemp may cause mce->tmpMap to resize, hence previous results
7024 from VG_(indexXA) are invalid. */
7025 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7026 tl_assert(ent->kind == Orig);
7027 tl_assert(ent->shadowB == IRTemp_INVALID);
7028 ent->shadowB = tmpB;
7030 return ent->shadowB;
7033 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7035 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
7039 /* Make a guarded origin load, with no special handling in the
7040 didn't-happen case. A GUARD of NULL is assumed to mean "always
7041 True".
7043 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7044 return the otag. The loaded size is SZB. If GUARD evaluates to
7045 False at run time then the returned otag is zero.
7047 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7048 IRAtom* baseaddr,
7049 Int offset, IRExpr* guard )
7051 void* hFun;
7052 const HChar* hName;
7053 IRTemp bTmp;
7054 IRDirty* di;
7055 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7056 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7057 IRAtom* ea = baseaddr;
7058 if (offset != 0) {
7059 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7060 : mkU64( (Long)(Int)offset );
7061 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7063 bTmp = newTemp(mce, mce->hWordTy, BSh);
7065 switch (szB) {
7066 case 1: hFun = (void*)&MC_(helperc_b_load1);
7067 hName = "MC_(helperc_b_load1)";
7068 break;
7069 case 2: hFun = (void*)&MC_(helperc_b_load2);
7070 hName = "MC_(helperc_b_load2)";
7071 break;
7072 case 4: hFun = (void*)&MC_(helperc_b_load4);
7073 hName = "MC_(helperc_b_load4)";
7074 break;
7075 case 8: hFun = (void*)&MC_(helperc_b_load8);
7076 hName = "MC_(helperc_b_load8)";
7077 break;
7078 case 16: hFun = (void*)&MC_(helperc_b_load16);
7079 hName = "MC_(helperc_b_load16)";
7080 break;
7081 case 32: hFun = (void*)&MC_(helperc_b_load32);
7082 hName = "MC_(helperc_b_load32)";
7083 break;
7084 default:
7085 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7086 tl_assert(0);
7088 di = unsafeIRDirty_1_N(
7089 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7090 mkIRExprVec_1( ea )
7092 if (guard) {
7093 di->guard = guard;
7094 /* Ideally the didn't-happen return value here would be
7095 all-zeroes (unknown-origin), so it'd be harmless if it got
7096 used inadvertently. We slum it out with the IR-mandated
7097 default value (0b01 repeating, 0x55 etc) as that'll probably
7098 trump all legitimate otags via Max32, and it's pretty
7099 obviously bogus. */
7101 /* no need to mess with any annotations. This call accesses
7102 neither guest state nor guest memory. */
7103 stmt( 'B', mce, IRStmt_Dirty(di) );
7104 if (mce->hWordTy == Ity_I64) {
7105 /* 64-bit host */
7106 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7107 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7108 return mkexpr(bTmp32);
7109 } else {
7110 /* 32-bit host */
7111 return mkexpr(bTmp);
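/* Editor's note: an illustrative, standalone sketch (not part of Memcheck)
   modelling the didn't-happen path described above on a 64-bit host: the
   guarded dirty call leaves the IR-mandated default 0x5555555555555555 in
   its destination, the Iop_64to32 narrowing turns that into 0x55555555, and
   -- assuming, as the comment above itself hedges, that legitimate otags are
   numerically smaller -- a Max32U merge keeps the obviously-bogus value.
   The otag value used here is made up purely for illustration. */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t max32u ( uint32_t a, uint32_t b ) { return a > b ? a : b; }

int main ( void )
{
   uint64_t dflt64 = 0x5555555555555555ULL;   /* IR default for guarded dirty */
   uint32_t dflt32 = (uint32_t)dflt64;        /* models Iop_64to32 */
   assert( dflt32 == 0x55555555u );

   uint32_t some_otag = 100000u;              /* hypothetical legitimate otag */
   assert( max32u(dflt32, some_otag) == dflt32 );
   return 0;
}
#endif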
7116 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7117 loaded size is SZB. The load is regarded as unconditional (always
7118 happens).
7120 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7121 Int offset )
7123 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7127 /* The most general handler for guarded origin loads. A GUARD of NULL
7128 is assumed to mean "always True".
7130 Generate IR to do a shadow origin load from ADDR+BIAS and return
7131 the B bits. The loaded type is TY. If GUARD evaluates to False at
7132 run time then the returned B bits are simply BALT instead.
7134 static
7135 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7136 IRType ty,
7137 IRAtom* addr, UInt bias,
7138 IRAtom* guard, IRAtom* balt )
7140 /* If the guard evaluates to True, this will hold the loaded
7141 origin. If the guard evaluates to False, this will be zero,
7142 meaning "unknown origin", in which case we will have to replace
7143 it using an ITE below. */
7144 IRAtom* iftrue
7145 = assignNew('B', mce, Ity_I32,
7146 gen_guarded_load_b(mce, sizeofIRType(ty),
7147 addr, bias, guard));
7148 /* These are the bits we will return if the load doesn't take
7149 place. */
7150 IRAtom* iffalse
7151 = balt;
7152 /* Prepare the cond for the ITE. Convert a NULL cond into
7153 something that iropt knows how to fold out later. */
7154 IRAtom* cond
7155 = guard == NULL ? mkU1(1) : guard;
7156 /* And assemble the final result. */
7157 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
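/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the value computed just above: if the guard holds, the otag loaded from
   shadow memory is used, otherwise the caller-supplied alternative BALT is
   used, and a NULL guard means "always True".  Names and otag values are
   made up. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t toy_guarded_load_otag ( const bool* guard,
                                        uint32_t loaded, uint32_t balt )
{
   bool cond = guard ? *guard : true;   /* NULL guard == always True */
   return cond ? loaded : balt;
}

int main ( void )
{
   bool yes = true, no = false;
   assert( toy_guarded_load_otag(&yes, 7, 99) == 7  );
   assert( toy_guarded_load_otag(&no,  7, 99) == 99 );
   assert( toy_guarded_load_otag(NULL, 7, 99) == 7  );
   return 0;
}
#endif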
7161 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7162 the store really happens; NULL means it unconditionally does. */
7163 static void gen_store_b ( MCEnv* mce, Int szB,
7164 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7165 IRAtom* guard )
7167 void* hFun;
7168 const HChar* hName;
7169 IRDirty* di;
7170 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7171 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7172 IRAtom* ea = baseaddr;
7173 if (guard) {
7174 tl_assert(isOriginalAtom(mce, guard));
7175 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7177 if (offset != 0) {
7178 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7179 : mkU64( (Long)(Int)offset );
7180 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7182 if (mce->hWordTy == Ity_I64)
7183 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7185 switch (szB) {
7186 case 1: hFun = (void*)&MC_(helperc_b_store1);
7187 hName = "MC_(helperc_b_store1)";
7188 break;
7189 case 2: hFun = (void*)&MC_(helperc_b_store2);
7190 hName = "MC_(helperc_b_store2)";
7191 break;
7192 case 4: hFun = (void*)&MC_(helperc_b_store4);
7193 hName = "MC_(helperc_b_store4)";
7194 break;
7195 case 8: hFun = (void*)&MC_(helperc_b_store8);
7196 hName = "MC_(helperc_b_store8)";
7197 break;
7198 case 16: hFun = (void*)&MC_(helperc_b_store16);
7199 hName = "MC_(helperc_b_store16)";
7200 break;
7201 case 32: hFun = (void*)&MC_(helperc_b_store32);
7202 hName = "MC_(helperc_b_store32)";
7203 break;
7204 default:
7205 tl_assert(0);
7207 di = unsafeIRDirty_0_N( 2/*regparms*/,
7208 hName, VG_(fnptr_to_fnentry)( hFun ),
7209 mkIRExprVec_2( ea, dataB )
7211 /* no need to mess with any annotations. This call accesses
7212 neither guest state nor guest memory. */
7213 if (guard) di->guard = guard;
7214 stmt( 'B', mce, IRStmt_Dirty(di) );
7217 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7218 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7219 if (eTy == Ity_I64)
7220 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7221 if (eTy == Ity_I32)
7222 return e;
7223 tl_assert(0);
7226 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7227 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7228 tl_assert(eTy == Ity_I32);
7229 if (dstTy == Ity_I64)
7230 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7231 tl_assert(0);
7235 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7237 tl_assert(MC_(clo_mc_level) == 3);
7239 switch (e->tag) {
7241 case Iex_GetI: {
7242 IRRegArray* descr_b;
7243 IRAtom *t1, *t2, *t3, *t4;
7244 IRRegArray* descr = e->Iex.GetI.descr;
7245 IRType equivIntTy
7246 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7247 /* If this array is unshadowable for whatever reason, use the
7248 usual approximation. */
7249 if (equivIntTy == Ity_INVALID)
7250 return mkU32(0);
7251 tl_assert(sizeofIRType(equivIntTy) >= 4);
7252 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7253 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7254 equivIntTy, descr->nElems );
7255 /* Do a shadow indexed get of the same size, giving t1. Take
7256 the bottom 32 bits of it, giving t2. Compute into t3 the
7257 origin for the index (almost certainly zero, but there's
7258 no harm in being completely general here, since iropt will
7259 remove any useless code), and fold it in, giving a final
7260 value t4. */
7261 t1 = assignNew( 'B', mce, equivIntTy,
7262 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7263 e->Iex.GetI.bias ));
7264 t2 = narrowTo32( mce, t1 );
7265 t3 = schemeE( mce, e->Iex.GetI.ix );
7266 t4 = gen_maxU32( mce, t2, t3 );
7267 return t4;
7269 case Iex_CCall: {
7270 Int i;
7271 IRAtom* here;
7272 IRExpr** args = e->Iex.CCall.args;
7273 IRAtom* curr = mkU32(0);
7274 for (i = 0; args[i]; i++) {
7275 tl_assert(i < 32);
7276 tl_assert(isOriginalAtom(mce, args[i]));
7277 /* Only take notice of this arg if the callee's
7278 mc-exclusion mask does not say it is to be excluded. */
7279 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7280 /* the arg is to be excluded from definedness checking.
7281 Do nothing. */
7282 if (0) VG_(printf)("excluding %s(%d)\n",
7283 e->Iex.CCall.cee->name, i);
7284 } else {
7285 /* calculate the arg's origin, and pessimistically
7286 merge it in. */
7287 here = schemeE( mce, args[i] );
7288 curr = gen_maxU32( mce, curr, here );
7291 return curr;
7293 case Iex_Load: {
7294 Int dszB;
7295 dszB = sizeofIRType(e->Iex.Load.ty);
7296 /* assert that the B value for the address is already
7297 available (somewhere) */
7298 tl_assert(isIRAtom(e->Iex.Load.addr));
7299 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7300 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7302 case Iex_ITE: {
7303 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7304 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7305 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7306 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7308 case Iex_Qop: {
7309 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7310 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7311 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7312 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7313 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7314 gen_maxU32( mce, b3, b4 ) );
7316 case Iex_Triop: {
7317 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7318 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7319 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7320 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7322 case Iex_Binop: {
7323 switch (e->Iex.Binop.op) {
7324 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7325 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7326 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7327 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7328 /* Just say these all produce a defined result,
7329 regardless of their arguments. See
7330 COMMENT_ON_CasCmpEQ in this file. */
7331 return mkU32(0);
7332 default: {
7333 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7334 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7335 return gen_maxU32( mce, b1, b2 );
7338 tl_assert(0);
7339 /*NOTREACHED*/
7341 case Iex_Unop: {
7342 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7343 return b1;
7345 case Iex_Const:
7346 return mkU32(0);
7347 case Iex_RdTmp:
7348 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7349 case Iex_Get: {
7350 Int b_offset = MC_(get_otrack_shadow_offset)(
7351 e->Iex.Get.offset,
7352 sizeofIRType(e->Iex.Get.ty)
7354 tl_assert(b_offset >= -1
7355 && b_offset <= mce->layout->total_sizeB -4);
7356 if (b_offset >= 0) {
7357 /* FIXME: this isn't an atom! */
7358 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7359 Ity_I32 );
7361 return mkU32(0);
7363 default:
7364 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7365 ppIRExpr(e);
7366 VG_(tool_panic)("memcheck:schemeE");
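/* Editor's note: an illustrative, standalone toy (not part of Memcheck)
   mirroring the shape of schemeE above: constants carry no origin (otag 0),
   and for compound expressions the operands' otags are pessimistically
   merged with an unsigned max.  Only a few expression forms are modelled;
   the real code also handles GetI, CCall argument masking, loads, etc.
   All names and otag values are made up. */
#if 0
#include <assert.h>
#include <stdint.h>

typedef enum { TOY_CONST, TOY_TMP, TOY_BINOP, TOY_ITE } ToyTag;

typedef struct ToyExpr {
   ToyTag tag;
   uint32_t otag;               /* for TOY_TMP: its B (origin) shadow value */
   struct ToyExpr *a, *b, *c;   /* operands, where applicable */
} ToyExpr;

static uint32_t max32u ( uint32_t x, uint32_t y ) { return x > y ? x : y; }

static uint32_t toy_schemeE ( const ToyExpr* e )
{
   switch (e->tag) {
      case TOY_CONST: return 0;
      case TOY_TMP:   return e->otag;
      case TOY_BINOP: return max32u(toy_schemeE(e->a), toy_schemeE(e->b));
      case TOY_ITE:   return max32u(toy_schemeE(e->a),
                                    max32u(toy_schemeE(e->b),
                                           toy_schemeE(e->c)));
   }
   return 0;
}

int main ( void )
{
   ToyExpr k   = { TOY_CONST, 0,  0, 0, 0 };
   ToyExpr t1  = { TOY_TMP,   7,  0, 0, 0 };
   ToyExpr t2  = { TOY_TMP,   42, 0, 0, 0 };
   ToyExpr add = { TOY_BINOP, 0,  &t1, &k,   0 };
   ToyExpr ite = { TOY_ITE,   0,  &t2, &add, &k };
   assert( toy_schemeE(&add) == 7 );
   assert( toy_schemeE(&ite) == 42 );
   return 0;
}
#endif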
7371 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7373 // This is a hacked version of do_shadow_Dirty
7374 Int i, k, n, toDo, gSz, gOff;
7375 IRAtom *here, *curr;
7376 IRTemp dst;
7378 /* First check the guard. */
7379 curr = schemeE( mce, d->guard );
7381 /* Now round up all inputs and maxU32 over them. */
7383 /* Inputs: unmasked args
7384 Note: arguments are evaluated REGARDLESS of the guard expression */
7385 for (i = 0; d->args[i]; i++) {
7386 IRAtom* arg = d->args[i];
7387 if ( (d->cee->mcx_mask & (1<<i))
7388 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7389 /* ignore this arg */
7390 } else {
7391 here = schemeE( mce, arg );
7392 curr = gen_maxU32( mce, curr, here );
7396 /* Inputs: guest state that we read. */
7397 for (i = 0; i < d->nFxState; i++) {
7398 tl_assert(d->fxState[i].fx != Ifx_None);
7399 if (d->fxState[i].fx == Ifx_Write)
7400 continue;
7402 /* Enumerate the described state segments */
7403 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7404 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7405 gSz = d->fxState[i].size;
7407 /* Ignore any sections marked as 'always defined'. */
7408 if (isAlwaysDefd(mce, gOff, gSz)) {
7409 if (0)
7410 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7411 gOff, gSz);
7412 continue;
7415 /* This state element is read or modified. So we need to
7416 consider it. If larger than 4 bytes, deal with it in
7417 4-byte chunks. */
7418 while (True) {
7419 Int b_offset;
7420 tl_assert(gSz >= 0);
7421 if (gSz == 0) break;
7422 n = gSz <= 4 ? gSz : 4;
7423 /* update 'curr' with maxU32 of the state slice
7424 gOff .. gOff+n-1 */
7425 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7426 if (b_offset != -1) {
7427 /* Observe the guard expression. If it is false use 0, i.e.
7428 nothing is known about the origin */
7429 IRAtom *cond, *iffalse, *iftrue;
7431 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7432 iffalse = mkU32(0);
7433 iftrue = assignNew( 'B', mce, Ity_I32,
7434 IRExpr_Get(b_offset
7435 + 2*mce->layout->total_sizeB,
7436 Ity_I32));
7437 here = assignNew( 'B', mce, Ity_I32,
7438 IRExpr_ITE(cond, iftrue, iffalse));
7439 curr = gen_maxU32( mce, curr, here );
7441 gSz -= n;
7442 gOff += n;
7447 /* Inputs: memory */
7449 if (d->mFx != Ifx_None) {
7450 /* Because we may do multiple shadow loads/stores from the same
7451 base address, it's best to compute its origin tag just once,
7452 right now. Post-instrumentation optimisation should remove
7453 all but this computation. */
7454 tl_assert(d->mAddr);
7455 here = schemeE( mce, d->mAddr );
7456 curr = gen_maxU32( mce, curr, here );
7459 /* Deal with memory inputs (reads or modifies) */
7460 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7461 toDo = d->mSize;
7462 /* chew off 32-bit chunks. We don't care about the endianness
7463 since it's all going to be max-merged into a single origin tag,
7464 but nevertheless choose an endianness which is hopefully
7465 native to the platform. */
7466 while (toDo >= 4) {
7467 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7468 d->guard );
7469 curr = gen_maxU32( mce, curr, here );
7470 toDo -= 4;
7472 /* handle possible 16-bit excess */
7473 while (toDo >= 2) {
7474 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7475 d->guard );
7476 curr = gen_maxU32( mce, curr, here );
7477 toDo -= 2;
7479 /* chew off the remaining 8-bit chunk, if any */
7480 if (toDo == 1) {
7481 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7482 d->guard );
7483 curr = gen_maxU32( mce, curr, here );
7484 toDo -= 1;
7486 tl_assert(toDo == 0);
7489 /* Whew! So curr is a 32-bit B-value which should give an origin
7490 of some use if any of the inputs to the helper are undefined.
7491 Now we need to re-distribute the results to all destinations. */
7493 /* Outputs: the destination temporary, if there is one. */
7494 if (d->tmp != IRTemp_INVALID) {
7495 dst = findShadowTmpB(mce, d->tmp);
7496 assign( 'B', mce, dst, curr ); /* 'B': this is an origin (B) shadow temp */
7499 /* Outputs: guest state that we write or modify. */
7500 for (i = 0; i < d->nFxState; i++) {
7501 tl_assert(d->fxState[i].fx != Ifx_None);
7502 if (d->fxState[i].fx == Ifx_Read)
7503 continue;
7505 /* Enumerate the described state segments */
7506 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7507 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7508 gSz = d->fxState[i].size;
7510 /* Ignore any sections marked as 'always defined'. */
7511 if (isAlwaysDefd(mce, gOff, gSz))
7512 continue;
7514 /* This state element is written or modified. So we need to
7515 consider it. If larger than 4 bytes, deal with it in
7516 4-byte chunks. */
7517 while (True) {
7518 Int b_offset;
7519 tl_assert(gSz >= 0);
7520 if (gSz == 0) break;
7521 n = gSz <= 4 ? gSz : 4;
7522 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7523 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7524 if (b_offset != -1) {
7526 /* If the guard expression evaluates to false we simply Put
7527 the value that is already stored in the guest state slot */
7528 IRAtom *cond, *iffalse;
7530 cond = assignNew('B', mce, Ity_I1,
7531 d->guard);
7532 iffalse = assignNew('B', mce, Ity_I32,
7533 IRExpr_Get(b_offset +
7534 2*mce->layout->total_sizeB,
7535 Ity_I32));
7536 curr = assignNew('B', mce, Ity_I32,
7537 IRExpr_ITE(cond, curr, iffalse));
7539 stmt( 'B', mce, IRStmt_Put(b_offset
7540 + 2*mce->layout->total_sizeB,
7541 curr ));
7543 gSz -= n;
7544 gOff += n;
7549 /* Outputs: memory that we write or modify. Same comments about
7550 endianness as above apply. */
7551 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7552 toDo = d->mSize;
7553 /* chew off 32-bit chunks */
7554 while (toDo >= 4) {
7555 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7556 d->guard );
7557 toDo -= 4;
7559 /* handle possible 16-bit excess */
7560 while (toDo >= 2) {
7561 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7562 d->guard );
7563 toDo -= 2;
7565 /* chew off the remaining 8-bit chunk, if any */
7566 if (toDo == 1) {
7567 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7568 d->guard );
7569 toDo -= 1;
7571 tl_assert(toDo == 0);
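/* Editor's note: an illustrative, standalone check (not part of Memcheck) of
   the chunking pattern used above for memory inputs and outputs: a transfer
   of mSize bytes is processed as 4-byte chunks, then 2-byte, then a final
   1-byte chunk, with each chunk's offset being mSize - toDo.  The sketch
   verifies that these (offset, size) pairs exactly tile [0, mSize). */
#if 0
#include <assert.h>

static void check_chunking ( int mSize )
{
   int covered = 0;
   int toDo    = mSize;
   while (toDo >= 4) { assert(mSize - toDo == covered); covered += 4; toDo -= 4; }
   while (toDo >= 2) { assert(mSize - toDo == covered); covered += 2; toDo -= 2; }
   if    (toDo == 1) { assert(mSize - toDo == covered); covered += 1; toDo -= 1; }
   assert(toDo == 0 && covered == mSize);
}

int main ( void )
{
   for (int sz = 0; sz <= 64; sz++)
      check_chunking(sz);
   return 0;
}
#endif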
7576 /* Generate IR for origin shadowing for a general guarded store. */
7577 static void do_origins_Store_guarded ( MCEnv* mce,
7578 IREndness stEnd,
7579 IRExpr* stAddr,
7580 IRExpr* stData,
7581 IRExpr* guard )
7583 Int dszB;
7584 IRAtom* dataB;
7585 /* assert that the B value for the address is already available
7586 (somewhere), since the call to schemeE will want to see it.
7587 XXXX how does this actually ensure that?? */
7588 tl_assert(isIRAtom(stAddr));
7589 tl_assert(isIRAtom(stData));
7590 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7591 dataB = schemeE( mce, stData );
7592 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7596 /* Generate IR for origin shadowing for a plain store. */
7597 static void do_origins_Store_plain ( MCEnv* mce,
7598 IREndness stEnd,
7599 IRExpr* stAddr,
7600 IRExpr* stData )
7602 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7603 NULL/*guard*/ );
7607 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7609 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7611 do_origins_Store_guarded( mce, sg->end, sg->addr,
7612 sg->data, sg->guard );
7615 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7617 IRType loadedTy = Ity_INVALID;
7618 switch (lg->cvt) {
7619 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7620 case ILGop_Ident64: loadedTy = Ity_I64; break;
7621 case ILGop_Ident32: loadedTy = Ity_I32; break;
7622 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7623 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7624 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7625 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7626 default: VG_(tool_panic)("schemeS.IRLoadG");
7628 IRAtom* ori_alt
7629 = schemeE( mce,lg->alt );
7630 IRAtom* ori_final
7631 = expr2ori_Load_guarded_General(mce, loadedTy,
7632 lg->addr, 0/*addr bias*/,
7633 lg->guard, ori_alt );
7634 /* And finally, bind the origin to the destination temporary. */
7635 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7639 static void schemeS ( MCEnv* mce, IRStmt* st )
7641 tl_assert(MC_(clo_mc_level) == 3);
7643 switch (st->tag) {
7645 case Ist_AbiHint:
7646 /* The value-check instrumenter handles this - by arranging
7647 to pass the address of the next instruction to
7648 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7649 happen for origin tracking w.r.t. AbiHints. So there is
7650 nothing to do here. */
7651 break;
7653 case Ist_PutI: {
7654 IRPutI *puti = st->Ist.PutI.details;
7655 IRRegArray* descr_b;
7656 IRAtom *t1, *t2, *t3, *t4;
7657 IRRegArray* descr = puti->descr;
7658 IRType equivIntTy
7659 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7660 /* If this array is unshadowable for whatever reason,
7661 generate no code. */
7662 if (equivIntTy == Ity_INVALID)
7663 break;
7664 tl_assert(sizeofIRType(equivIntTy) >= 4);
7665 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7666 descr_b
7667 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7668 equivIntTy, descr->nElems );
7669 /* Compute a value to Put - the conjoinment of the origin for
7670 the data to be Put-ted (obviously) and of the index value
7671 (not so obviously). */
7672 t1 = schemeE( mce, puti->data );
7673 t2 = schemeE( mce, puti->ix );
7674 t3 = gen_maxU32( mce, t1, t2 );
7675 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7676 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7677 puti->bias, t4) ));
7678 break;
7681 case Ist_Dirty:
7682 do_origins_Dirty( mce, st->Ist.Dirty.details );
7683 break;
7685 case Ist_Store:
7686 do_origins_Store_plain( mce, st->Ist.Store.end,
7687 st->Ist.Store.addr,
7688 st->Ist.Store.data );
7689 break;
7691 case Ist_StoreG:
7692 do_origins_StoreG( mce, st->Ist.StoreG.details );
7693 break;
7695 case Ist_LoadG:
7696 do_origins_LoadG( mce, st->Ist.LoadG.details );
7697 break;
7699 case Ist_LLSC: {
7700 /* In short: treat a load-linked like a normal load followed
7701 by an assignment of the loaded (shadow) data to the result
7702 temporary. Treat a store-conditional like a normal store,
7703 and mark the result temporary as defined. */
7704 if (st->Ist.LLSC.storedata == NULL) {
7705 /* Load Linked */
7706 IRType resTy
7707 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7708 IRExpr* vanillaLoad
7709 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7710 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7711 || resTy == Ity_I16 || resTy == Ity_I8);
7712 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7713 schemeE(mce, vanillaLoad));
7714 } else {
7715 /* Store conditional */
7716 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7717 st->Ist.LLSC.addr,
7718 st->Ist.LLSC.storedata );
7719 /* For the rationale behind this, see comments at the
7720 place where the V-shadow for .result is constructed, in
7721 do_shadow_LLSC. In short, we regard .result as
7722 always-defined. */
7723 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7724 mkU32(0) );
7726 break;
7729 case Ist_Put: {
7730 Int b_offset
7731 = MC_(get_otrack_shadow_offset)(
7732 st->Ist.Put.offset,
7733 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7735 if (b_offset >= 0) {
7736 /* FIXME: this isn't an atom! */
7737 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7738 schemeE( mce, st->Ist.Put.data )) );
7740 break;
7743 case Ist_WrTmp:
7744 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7745 schemeE(mce, st->Ist.WrTmp.data) );
7746 break;
7748 case Ist_MBE:
7749 case Ist_NoOp:
7750 case Ist_Exit:
7751 case Ist_IMark:
7752 break;
7754 default:
7755 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7756 ppIRStmt(st);
7757 VG_(tool_panic)("memcheck:schemeS");
7762 /*------------------------------------------------------------*/
7763 /*--- Post-tree-build final tidying ---*/
7764 /*------------------------------------------------------------*/
7766 /* This exploits the observation that Memcheck often produces
7767 repeated conditional calls of the form
7769 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7771 with the same guard expression G guarding the same helper call.
7772 The second and subsequent calls are redundant. This usually
7773 results from instrumentation of guest code containing multiple
7774 memory references at different constant offsets from the same base
7775 register. After optimisation of the instrumentation, you get a
7776 test for the definedness of the base register for each memory
7777 reference, which is kinda pointless. MC_(final_tidy) therefore
7778 looks for such repeated calls and removes all but the first. */
7781 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7782 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7783 get almost all the benefits of this transformation whilst causing
7784 the slide-back case to happen just often enough to be verifiably
7785 correct. For posterity, the numbers are:
7787 bz2-32
7789 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7790 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7791 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7792 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7793 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7794 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7795 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7796 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7797 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7798 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7799 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7800 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7801 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7803 bz2-64
7805 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7806 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7807 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7808 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7809 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7810 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7811 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7812 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7813 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7814 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7815 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7816 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7817 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7820 /* Structs for recording which (helper, guard) pairs we have already
7821 seen. */
7823 #define N_TIDYING_PAIRS 16
7825 typedef
7826 struct { void* entry; IRExpr* guard; }
7827 Pair;
7829 typedef
7830 struct {
7831 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7832 UInt pairsUsed;
7834 Pairs;
7837 /* Return True if e1 and e2 definitely denote the same value (used to
7838 compare guards). Return False if unknown; False is the safe
7839 answer. Since guest registers and guest memory do not have the
7840 SSA property we must return False if any Gets or Loads appear in
7841 the expression. This implicitly assumes that e1 and e2 have the
7842 same IR type, which is always true here -- the type is Ity_I1. */
7844 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7846 if (e1->tag != e2->tag)
7847 return False;
7848 switch (e1->tag) {
7849 case Iex_Const:
7850 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7851 case Iex_Binop:
7852 return e1->Iex.Binop.op == e2->Iex.Binop.op
7853 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7854 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7855 case Iex_Unop:
7856 return e1->Iex.Unop.op == e2->Iex.Unop.op
7857 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7858 case Iex_RdTmp:
7859 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7860 case Iex_ITE:
7861 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7862 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7863 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7864 case Iex_Qop:
7865 case Iex_Triop:
7866 case Iex_CCall:
7867 /* be lazy. Could define equality for these, but they never
7868 appear to be used. */
7869 return False;
7870 case Iex_Get:
7871 case Iex_GetI:
7872 case Iex_Load:
7873 /* be conservative - these may not give the same value each
7874 time */
7875 return False;
7876 case Iex_Binder:
7877 /* should never see this */
7878 /* fallthrough */
7879 default:
7880 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7881 ppIRExpr(e1);
7882 VG_(tool_panic)("memcheck:sameIRValue");
7883 return False;
7887 /* See if 'pairs' already has an entry for (entry, guard). Return
7888 True if so. If not, add an entry. */
7890 static
7891 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
7893 UInt i, n = tidyingEnv->pairsUsed;
7894 tl_assert(n <= N_TIDYING_PAIRS);
7895 for (i = 0; i < n; i++) {
7896 if (tidyingEnv->pairs[i].entry == entry
7897 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
7898 return True;
7900 /* (guard, entry) wasn't found in the array. Add it at the end.
7901 If the array is already full, slide the entries one slot
7902 backwards. This means we will lose the ability to detect
7903 duplicates from the pair in slot zero, but that happens so
7904 rarely that it's unlikely to have much effect on overall code
7905 quality. Also, the entry this strategy discards is the check for
7906 the oldest tracked exit (memory reference, basically), which is
7907 (I'd guess) the one least likely to be re-used after this point. */
7908 tl_assert(i == n);
7909 if (n == N_TIDYING_PAIRS) {
7910 for (i = 1; i < N_TIDYING_PAIRS; i++) {
7911 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
7913 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
7914 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
7915 } else {
7916 tl_assert(n < N_TIDYING_PAIRS);
7917 tidyingEnv->pairs[n].entry = entry;
7918 tidyingEnv->pairs[n].guard = guard;
7919 n++;
7920 tidyingEnv->pairsUsed = n;
7922 return False;
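/* Editor's note: an illustrative, shrunken, standalone model (not part of
   Memcheck) of check_or_add above: a 4-entry table instead of 16, and plain
   ints instead of (helper address, guard expression) pairs.  It shows the
   slide-back behaviour once the table is full: the oldest entry (slot zero)
   is the one whose duplicate-detection is sacrificed. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#define N_PAIRS 4

typedef struct { int entry; int guard; } ToyPair;
typedef struct { ToyPair pairs[N_PAIRS]; unsigned used; } ToyPairs;

static bool toy_check_or_add ( ToyPairs* t, int guard, int entry )
{
   unsigned i, n = t->used;
   for (i = 0; i < n; i++)
      if (t->pairs[i].entry == entry && t->pairs[i].guard == guard)
         return true;
   if (n == N_PAIRS) {
      /* full: slide everything back one slot, dropping the oldest pair */
      memmove(&t->pairs[0], &t->pairs[1], (N_PAIRS-1) * sizeof(ToyPair));
      t->pairs[N_PAIRS-1].entry = entry;
      t->pairs[N_PAIRS-1].guard = guard;
   } else {
      t->pairs[n].entry = entry;
      t->pairs[n].guard = guard;
      t->used = n + 1;
   }
   return false;
}

int main ( void )
{
   ToyPairs t = { { {0,0} }, 0 };
   for (int k = 0; k < 5; k++)                 /* fill one past capacity */
      assert( !toy_check_or_add(&t, 0, k) );
   assert(  toy_check_or_add(&t, 0, 4) );      /* recent entry still detected */
   assert( !toy_check_or_add(&t, 0, 0) );      /* oldest entry was forgotten  */
   return 0;
}
#endif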
7925 static Bool is_helperc_value_checkN_fail ( const HChar* name )
7927 /* This is expensive because it happens a lot. We are checking to
7928 see whether |name| is one of the following 8 strings:
7930 MC_(helperc_value_check8_fail_no_o)
7931 MC_(helperc_value_check4_fail_no_o)
7932 MC_(helperc_value_check0_fail_no_o)
7933 MC_(helperc_value_check1_fail_no_o)
7934 MC_(helperc_value_check8_fail_w_o)
7935 MC_(helperc_value_check0_fail_w_o)
7936 MC_(helperc_value_check1_fail_w_o)
7937 MC_(helperc_value_check4_fail_w_o)
7939 To speed it up, check the common prefix just once, rather than
7940 all 8 times.
7942 const HChar* prefix = "MC_(helperc_value_check";
7944 HChar n, p;
7945 while (True) {
7946 n = *name;
7947 p = *prefix;
7948 if (p == 0) break; /* ran off the end of the prefix */
7949 /* We still have some prefix to use */
7950 if (n == 0) return False; /* have prefix, but name ran out */
7951 if (n != p) return False; /* have both pfx and name, but no match */
7952 name++;
7953 prefix++;
7956 /* Check the part after the prefix. */
7957 tl_assert(*prefix == 0 && *name != 0);
7958 return 0==VG_(strcmp)(name, "8_fail_no_o)")
7959 || 0==VG_(strcmp)(name, "4_fail_no_o)")
7960 || 0==VG_(strcmp)(name, "0_fail_no_o)")
7961 || 0==VG_(strcmp)(name, "1_fail_no_o)")
7962 || 0==VG_(strcmp)(name, "8_fail_w_o)")
7963 || 0==VG_(strcmp)(name, "4_fail_w_o)")
7964 || 0==VG_(strcmp)(name, "0_fail_w_o)")
7965 || 0==VG_(strcmp)(name, "1_fail_w_o)");
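/* Editor's note: an illustrative, standalone behavioural model (not part of
   Memcheck) of the matcher above: check the shared prefix once, then accept
   only the eight listed suffixes.  Unlike the original it uses the C library
   string functions rather than a hand-rolled loop, and it simply returns
   false (rather than asserting) if a name consists of the prefix alone. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <string.h>

static bool toy_is_checkN_fail ( const char* name )
{
   const char* prefix = "MC_(helperc_value_check";
   size_t plen = strlen(prefix);
   if (strncmp(name, prefix, plen) != 0)
      return false;
   name += plen;
   return 0==strcmp(name, "8_fail_no_o)") || 0==strcmp(name, "4_fail_no_o)")
       || 0==strcmp(name, "0_fail_no_o)") || 0==strcmp(name, "1_fail_no_o)")
       || 0==strcmp(name, "8_fail_w_o)")  || 0==strcmp(name, "4_fail_w_o)")
       || 0==strcmp(name, "0_fail_w_o)")  || 0==strcmp(name, "1_fail_w_o)");
}

int main ( void )
{
   assert(  toy_is_checkN_fail("MC_(helperc_value_check4_fail_no_o)") );
   assert(  toy_is_checkN_fail("MC_(helperc_value_check1_fail_w_o)") );
   assert( !toy_is_checkN_fail("MC_(helperc_b_load4)") );
   assert( !toy_is_checkN_fail("MC_(helperc_value_check2_fail_no_o)") );
   return 0;
}
#endif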
7968 IRSB* MC_(final_tidy) ( IRSB* sb_in )
7970 Int i;
7971 IRStmt* st;
7972 IRDirty* di;
7973 IRExpr* guard;
7974 IRCallee* cee;
7975 Bool alreadyPresent;
7976 Pairs pairs;
7978 pairs.pairsUsed = 0;
7980 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
7981 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
7983 /* Scan forwards through the statements. Each time a call to one
7984 of the relevant helpers is seen, check if we have made a
7985 previous call to the same helper using the same guard
7986 expression, and if so, delete the call. */
7987 for (i = 0; i < sb_in->stmts_used; i++) {
7988 st = sb_in->stmts[i];
7989 tl_assert(st);
7990 if (st->tag != Ist_Dirty)
7991 continue;
7992 di = st->Ist.Dirty.details;
7993 guard = di->guard;
7994 tl_assert(guard);
7995 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
7996 cee = di->cee;
7997 if (!is_helperc_value_checkN_fail( cee->name ))
7998 continue;
7999 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8000 guard 'guard'. Check if we have already seen a call to this
8001 function with the same guard. If so, delete it. If not,
8002 add it to the set of calls we do know about. */
8003 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8004 if (alreadyPresent) {
8005 sb_in->stmts[i] = IRStmt_NoOp();
8006 if (0) VG_(printf)("XX\n");
8010 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8011 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8013 return sb_in;
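/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the overall tidy-up pass: walk a sequence of guarded helper calls and blank
   out every call whose (helper, guard) pair has already been seen, so only
   the first occurrence survives.  Helpers and guards are plain ints here and
   the example sequence is made up. */
#if 0
#include <assert.h>
#include <stdbool.h>

#define N_CALLS 6

typedef struct { int helper; int guard; bool kept; } ToyCall;

int main ( void )
{
   ToyCall calls[N_CALLS] = {
      {1, 10, true}, {1, 10, true}, {2, 10, true},
      {1, 11, true}, {1, 10, true}, {2, 10, true}
   };
   int seen_h[N_CALLS], seen_g[N_CALLS], nSeen = 0;

   for (int i = 0; i < N_CALLS; i++) {
      bool present = false;
      for (int k = 0; k < nSeen; k++)
         if (seen_h[k] == calls[i].helper && seen_g[k] == calls[i].guard)
            { present = true; break; }
      if (present) {
         calls[i].kept = false;        /* stands in for IRStmt_NoOp() */
      } else {
         seen_h[nSeen] = calls[i].helper;
         seen_g[nSeen] = calls[i].guard;
         nSeen++;
      }
   }
   assert(  calls[0].kept && !calls[1].kept &&  calls[2].kept );
   assert(  calls[3].kept && !calls[4].kept && !calls[5].kept );
   return 0;
}
#endif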
8016 #undef N_TIDYING_PAIRS
8019 /*------------------------------------------------------------*/
8020 /*--- Startup assertion checking ---*/
8021 /*------------------------------------------------------------*/
8023 void MC_(do_instrumentation_startup_checks)( void )
8025 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8026 is working as we expect. */
8028 # define CHECK(_expected, _string) \
8029 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8031 /* It should identify these 8, and no others, as targets. */
8032 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8033 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8034 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8035 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8036 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8037 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8038 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8039 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8041 /* Ad-hoc selection of other strings gathered via a quick test. */
8042 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8043 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8044 CHECK(False, "MC_(helperc_b_load1)");
8045 CHECK(False, "MC_(helperc_b_load2)");
8046 CHECK(False, "MC_(helperc_b_load4)");
8047 CHECK(False, "MC_(helperc_b_load8)");
8048 CHECK(False, "MC_(helperc_b_load16)");
8049 CHECK(False, "MC_(helperc_b_load32)");
8050 CHECK(False, "MC_(helperc_b_store1)");
8051 CHECK(False, "MC_(helperc_b_store2)");
8052 CHECK(False, "MC_(helperc_b_store4)");
8053 CHECK(False, "MC_(helperc_b_store8)");
8054 CHECK(False, "MC_(helperc_b_store16)");
8055 CHECK(False, "MC_(helperc_b_store32)");
8056 CHECK(False, "MC_(helperc_LOADV8)");
8057 CHECK(False, "MC_(helperc_LOADV16le)");
8058 CHECK(False, "MC_(helperc_LOADV32le)");
8059 CHECK(False, "MC_(helperc_LOADV64le)");
8060 CHECK(False, "MC_(helperc_LOADV128le)");
8061 CHECK(False, "MC_(helperc_LOADV256le)");
8062 CHECK(False, "MC_(helperc_STOREV16le)");
8063 CHECK(False, "MC_(helperc_STOREV32le)");
8064 CHECK(False, "MC_(helperc_STOREV64le)");
8065 CHECK(False, "MC_(helperc_STOREV8)");
8066 CHECK(False, "track_die_mem_stack_8");
8067 CHECK(False, "track_new_mem_stack_8_w_ECU");
8068 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8069 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8071 # undef CHECK
8075 /*------------------------------------------------------------*/
8076 /*--- Memcheck main ---*/
8077 /*------------------------------------------------------------*/
8079 static Bool isBogusAtom ( IRAtom* at )
8081 if (at->tag == Iex_RdTmp)
8082 return False;
8083 tl_assert(at->tag == Iex_Const);
8085 ULong n = 0;
8086 IRConst* con = at->Iex.Const.con;
8087 switch (con->tag) {
8088 case Ico_U1: return False;
8089 case Ico_U8: n = (ULong)con->Ico.U8; break;
8090 case Ico_U16: n = (ULong)con->Ico.U16; break;
8091 case Ico_U32: n = (ULong)con->Ico.U32; break;
8092 case Ico_U64: n = (ULong)con->Ico.U64; break;
8093 case Ico_F32: return False;
8094 case Ico_F64: return False;
8095 case Ico_F32i: return False;
8096 case Ico_F64i: return False;
8097 case Ico_V128: return False;
8098 case Ico_V256: return False;
8099 default: ppIRExpr(at); tl_assert(0);
8101 /* VG_(printf)("%llx\n", n); */
8102 /* Shortcuts */
8103 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8104 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8105 /* The list of bogus atoms is: */
8106 return (/*32*/ n == 0xFEFEFEFFULL
8107 /*32*/ || n == 0x80808080ULL
8108 /*32*/ || n == 0x7F7F7F7FULL
8109 /*32*/ || n == 0x7EFEFEFFULL
8110 /*32*/ || n == 0x81010100ULL
8111 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8112 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8113 /*64*/ || n == 0x0000000000008080ULL
8114 /*64*/ || n == 0x8080808080808080ULL
8115 /*64*/ || n == 0x0101010101010101ULL
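/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the constant classifier above.  The two range shortcuts cheaply reject
   ordinary small and small-negative constants, and only the handful of
   listed literals is reported as bogus.  The test values in main() are
   chosen for illustration only. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool toy_is_bogus ( uint64_t n )
{
   if (n <= 0x0000000000001000ULL) return false;   /* small */
   if (n >= 0xFFFFFFFFFFFFF000ULL) return false;   /* small negative */
   return n == 0xFEFEFEFFULL         || n == 0x80808080ULL
       || n == 0x7F7F7F7FULL         || n == 0x7EFEFEFFULL
       || n == 0x81010100ULL
       || n == 0xFFFFFFFFFEFEFEFFULL || n == 0xFEFEFEFEFEFEFEFFULL
       || n == 0x0000000000008080ULL || n == 0x8080808080808080ULL
       || n == 0x0101010101010101ULL;
}

int main ( void )
{
   assert( !toy_is_bogus(42) );                    /* shortcut: small        */
   assert( !toy_is_bogus(0xFFFFFFFFFFFFFFFFULL) ); /* shortcut: -1           */
   assert(  toy_is_bogus(0x80808080ULL) );         /* listed magic value     */
   assert( !toy_is_bogus(0x123456789ULL) );        /* in range but unlisted  */
   return 0;
}
#endif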
8120 /* Does 'st' mention any of the literals identified/listed in
8121 isBogusAtom()? */
8122 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8124 Int i;
8125 IRExpr* e;
8126 IRDirty* d;
8127 IRCAS* cas;
8128 switch (st->tag) {
8129 case Ist_WrTmp:
8130 e = st->Ist.WrTmp.data;
8131 switch (e->tag) {
8132 case Iex_Get:
8133 case Iex_RdTmp:
8134 return False;
8135 case Iex_Const:
8136 return isBogusAtom(e);
8137 case Iex_Unop:
8138 return isBogusAtom(e->Iex.Unop.arg)
8139 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8140 case Iex_GetI:
8141 return isBogusAtom(e->Iex.GetI.ix);
8142 case Iex_Binop:
8143 return isBogusAtom(e->Iex.Binop.arg1)
8144 || isBogusAtom(e->Iex.Binop.arg2);
8145 case Iex_Triop:
8146 return isBogusAtom(e->Iex.Triop.details->arg1)
8147 || isBogusAtom(e->Iex.Triop.details->arg2)
8148 || isBogusAtom(e->Iex.Triop.details->arg3);
8149 case Iex_Qop:
8150 return isBogusAtom(e->Iex.Qop.details->arg1)
8151 || isBogusAtom(e->Iex.Qop.details->arg2)
8152 || isBogusAtom(e->Iex.Qop.details->arg3)
8153 || isBogusAtom(e->Iex.Qop.details->arg4);
8154 case Iex_ITE:
8155 return isBogusAtom(e->Iex.ITE.cond)
8156 || isBogusAtom(e->Iex.ITE.iftrue)
8157 || isBogusAtom(e->Iex.ITE.iffalse);
8158 case Iex_Load:
8159 return isBogusAtom(e->Iex.Load.addr);
8160 case Iex_CCall:
8161 for (i = 0; e->Iex.CCall.args[i]; i++)
8162 if (isBogusAtom(e->Iex.CCall.args[i]))
8163 return True;
8164 return False;
8165 default:
8166 goto unhandled;
8168 case Ist_Dirty:
8169 d = st->Ist.Dirty.details;
8170 for (i = 0; d->args[i]; i++) {
8171 IRAtom* atom = d->args[i];
8172 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8173 if (isBogusAtom(atom))
8174 return True;
8177 if (isBogusAtom(d->guard))
8178 return True;
8179 if (d->mAddr && isBogusAtom(d->mAddr))
8180 return True;
8181 return False;
8182 case Ist_Put:
8183 return isBogusAtom(st->Ist.Put.data);
8184 case Ist_PutI:
8185 return isBogusAtom(st->Ist.PutI.details->ix)
8186 || isBogusAtom(st->Ist.PutI.details->data);
8187 case Ist_Store:
8188 return isBogusAtom(st->Ist.Store.addr)
8189 || isBogusAtom(st->Ist.Store.data);
8190 case Ist_StoreG: {
8191 IRStoreG* sg = st->Ist.StoreG.details;
8192 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8193 || isBogusAtom(sg->guard);
8195 case Ist_LoadG: {
8196 IRLoadG* lg = st->Ist.LoadG.details;
8197 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8198 || isBogusAtom(lg->guard);
8200 case Ist_Exit:
8201 return isBogusAtom(st->Ist.Exit.guard);
8202 case Ist_AbiHint:
8203 return isBogusAtom(st->Ist.AbiHint.base)
8204 || isBogusAtom(st->Ist.AbiHint.nia);
8205 case Ist_NoOp:
8206 case Ist_IMark:
8207 case Ist_MBE:
8208 return False;
8209 case Ist_CAS:
8210 cas = st->Ist.CAS.details;
8211 return isBogusAtom(cas->addr)
8212 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8213 || isBogusAtom(cas->expdLo)
8214 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8215 || isBogusAtom(cas->dataLo);
8216 case Ist_LLSC:
8217 return isBogusAtom(st->Ist.LLSC.addr)
8218 || (st->Ist.LLSC.storedata
8219 ? isBogusAtom(st->Ist.LLSC.storedata)
8220 : False);
8221 default:
8222 unhandled:
8223 ppIRStmt(st);
8224 VG_(tool_panic)("containsBogusLiterals");
8229 /* This is the pre-instrumentation analysis. It does a backwards pass over
8230 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8231 the block.
8233 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8234 as a positive result from that is a strong indication that we need to
8235 expensively instrument add/sub in the block. We do both analyses in one
8236 pass, even though they are independent, so as to avoid the overhead of
8237 having to traverse the whole block twice.
8239 The usage pass proceeds as follows. Let max= be the max operation in the
8240 HowUsed lattice, hence
8242 X max= Y means X = max(X, Y)
8244 then
8246 for t in original tmps . useEnv[t] = HuUnU
8248 for t used in the block's .next field
8249 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8251 for st iterating *backwards* in the block
8253 match st
8255 case "t1 = load(t2)" // case 1
8256 useEnv[t2] max= HuPCa
8258 case "t1 = add(t2, t3)" // case 2
8259 useEnv[t2] max= useEnv[t1]
8260 useEnv[t3] max= useEnv[t1]
8262 other
8263 for t in st.usedTmps // case 3
8264 useEnv[t] max= HuOth
8265 // same as useEnv[t] = HuOth
8267 The general idea is that we accumulate, in useEnv[], information about
8268 how each tmp is used. That can be updated as we work further back
8269 through the block and find more uses of it, but its HowUsed value can
8270 only ascend the lattice, not descend.
8272 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8273 be used as a memory address, then its use is at least HuPCa. The point
8274 is that for a memory address we will add instrumentation to check if any
8275 bit of the address is undefined, which means that we won't need expensive
8276 V-bit propagation through an add expression that computed the address --
8277 cheap add instrumentation will be equivalent.
8279 Note in case (1) that if we have previously seen a non-memory-address use
8280 of the tmp, then its use will already be HuOth and will be unchanged by
8281 the max= operation. And if it turns out that the source of the tmp was
8282 an add, then we'll have to expensively instrument the add, because we
8283 can't prove that, for the previous non-memory-address use of the tmp,
8284 cheap and expensive instrumentation will be equivalent.
8286 In case 2, we propagate the usage-mode of the result of an add back
8287 through to its operands. Again, we use max= so as to take account of the
8288 fact that t2 or t3 might later in the block (viz, earlier in the
8289 iteration) have been used in a way that requires expensive add
8290 instrumentation.
8292 In case 3, we deal with all other tmp uses. We assume that we'll need a
8293 result that is as accurate as possible, so we max= HuOth into its use
8294 mode. Since HuOth is the top of the lattice, that's equivalent to just
8295 setting its use to HuOth.
8297 The net result of all this is that:
8299 tmps that are used either
8300 - only as a memory address, or
8301 - only as part of a tree of adds that computes a memory address,
8302 and has no other use
8303 are marked as HuPCa, and so we can instrument their generating Add
8304 nodes cheaply, which is the whole point of this analysis
8306 tmps that are used any other way at all are marked as HuOth
8308 tmps that are unused are marked as HuUnU. We don't expect to see any
8309 since we expect that the incoming IR has had all dead assignments
8310 removed by previous optimisation passes. Nevertheless the analysis is
8311 correct even in the presence of dead tmps.
8313 A final comment on dead tmps. In case 1 and case 2, we could actually
8314 conditionalise the updates thusly:
8316 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8318 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8319 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8321 In other words, if the assigned-to tmp |t1| is never used, then there's
8322 no point in propagating any use through to its operands. That won't
8323 change the final HuPCa-vs-HuOth results, which is what we care about.
8324 Given that we expect to get dead-code-free inputs, there's no point in
8325 adding this extra refinement.
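/* Editor's note: an illustrative, standalone toy (not part of Memcheck)
   walking the backwards use-analysis sketched above over a made-up
   three-statement block:
        t2 = Add64(t0, t1)   ; t2 is only ever used as a load address
        t3 = LDle:I64(t2)
        PUT(offs) = t3
   The lattice is HuUnU < HuPCa < HuOth and all updates are "max=".  The
   enum names carry a trailing underscore to make clear they are stand-ins
   for the real HowUsed values. */
#if 0
#include <assert.h>

typedef enum { HuUnU_ = 0, HuPCa_ = 1, HuOth_ = 2 } ToyHowUsed;

static void max_into ( ToyHowUsed* env, int t, ToyHowUsed u )
{
   if (u > env[t]) env[t] = u;
}

int main ( void )
{
   enum { t0, t1, t2, t3, NTMPS };
   ToyHowUsed env[NTMPS] = { HuUnU_, HuUnU_, HuUnU_, HuUnU_ };

   /* Walk the block backwards, as preInstrumentationAnalysis does. */
   max_into(env, t3, HuOth_);       /* PUT: data used in an unknown way   */
   max_into(env, t2, HuPCa_);       /* load: the address is only PCasted  */
   max_into(env, t0, env[t2]);      /* add: propagate t2's demand ...     */
   max_into(env, t1, env[t2]);      /* ... to both operands               */

   assert(env[t0] == HuPCa_ && env[t1] == HuPCa_);  /* => cheap Add64 is fine */
   assert(env[t2] == HuPCa_ && env[t3] == HuOth_);
   return 0;
}
#endif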
8328 /* Helper for |preInstrumentationAnalysis|. */
8329 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8330 UInt tyenvUsed,
8331 HowUsed newUse, IRAtom* at )
8333 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8334 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8335 use info. */
8336 switch (at->tag) {
8337 case Iex_GSPTR:
8338 case Iex_VECRET:
8339 case Iex_Const:
8340 return;
8341 case Iex_RdTmp: {
8342 IRTemp t = at->Iex.RdTmp.tmp;
8343 tl_assert(t < tyenvUsed); // "is an original tmp"
8344 // The "max" operation in the lattice
8345 if (newUse > useEnv[t]) useEnv[t] = newUse;
8346 return;
8348 default:
8349 // We should never get here -- it implies non-flat IR
8350 ppIRExpr(at);
8351 VG_(tool_panic)("noteTmpUsesIn");
8353 /*NOTREACHED*/
8354 tl_assert(0);
8358 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8359 /*OUT*/Bool* hasBogusLiteralsP,
8360 const IRSB* sb_in )
8362 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8364 // We've seen no bogus literals so far.
8365 Bool bogus = False;
8367 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8368 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8369 nOrigTmps, sizeof(HowUsed));
8371 // Firstly, roll in contributions from the final dst address.
8372 bogus = isBogusAtom(sb_in->next);
8373 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8375 // Now work backwards through the stmts.
8376 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8377 IRStmt* st = sb_in->stmts[i];
8379 // Deal with literals.
8380 if (LIKELY(!bogus)) {
8381 bogus = containsBogusLiterals(st);
8384 // Deal with tmp uses.
8385 switch (st->tag) {
8386 case Ist_WrTmp: {
8387 IRTemp dst = st->Ist.WrTmp.tmp;
8388 IRExpr* rhs = st->Ist.WrTmp.data;
8389 // This is the one place where we have to consider all possible
8390 // tags for |rhs|, and can't just assume it is a tmp or a const.
8391 switch (rhs->tag) {
8392 case Iex_RdTmp:
8393 // just propagate demand for |dst| into this tmp use.
8394 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8395 break;
8396 case Iex_Unop:
8397 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8398 break;
8399 case Iex_Binop:
8400 if (rhs->Iex.Binop.op == Iop_Add64
8401 || rhs->Iex.Binop.op == Iop_Add32) {
8402 // propagate demand for |dst| through to the operands.
8403 noteTmpUsesIn(useEnv, nOrigTmps,
8404 useEnv[dst], rhs->Iex.Binop.arg1);
8405 noteTmpUsesIn(useEnv, nOrigTmps,
8406 useEnv[dst], rhs->Iex.Binop.arg2);
8407 } else {
8408 // just say that the operands are used in some unknown way.
8409 noteTmpUsesIn(useEnv, nOrigTmps,
8410 HuOth, rhs->Iex.Binop.arg1);
8411 noteTmpUsesIn(useEnv, nOrigTmps,
8412 HuOth, rhs->Iex.Binop.arg2);
8414 break;
8415 case Iex_Triop: {
8416 // All operands are used in some unknown way.
8417 IRTriop* tri = rhs->Iex.Triop.details;
8418 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8419 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8420 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8421 break;
8423 case Iex_Qop: {
8424 // All operands are used in some unknown way.
8425 IRQop* qop = rhs->Iex.Qop.details;
8426 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8427 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8428 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8429 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8430 break;
8432 case Iex_Load:
8433 // The address will be checked (== PCasted).
8434 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8435 break;
8436 case Iex_ITE:
8437 // The condition is PCasted, the then- and else-values
8438 // aren't.
8439 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8440 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8441 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8442 break;
8443 case Iex_CCall:
8444 // The args are used in unknown ways.
8445 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8446 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8448 break;
8449 case Iex_GetI: {
8450 // The index will be checked/PCasted (see do_shadow_GETI)
8451 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8452 break;
8454 case Iex_Const:
8455 case Iex_Get:
8456 break;
8457 default:
8458 ppIRExpr(rhs);
8459 VG_(tool_panic)("preInstrumentationAnalysis:"
8460 " unhandled IRExpr");
8462 break;
8464 case Ist_Store:
8465 // The address will be checked (== PCasted). The data will be
8466 // used in some unknown way.
8467 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8468 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8469 break;
8470 case Ist_Exit:
8471 // The guard will be checked (== PCasted)
8472 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8473 break;
8474 case Ist_Put:
8475 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8476 break;
8477 case Ist_PutI: {
8478 IRPutI* putI = st->Ist.PutI.details;
8479 // The index will be checked/PCasted (see do_shadow_PUTI). The
8480 // data will be used in an unknown way.
8481 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8482 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8483 break;
8485 case Ist_Dirty: {
8486 IRDirty* d = st->Ist.Dirty.details;
8487 // The guard will be checked (== PCasted)
8488 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8489 // The args will be used in unknown ways.
8490 for (IRExpr** args = d->args; *args; args++) {
8491 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8493 break;
8495 case Ist_CAS: {
8496 IRCAS* cas = st->Ist.CAS.details;
8497 // Address will be pcasted, everything else used as unknown
8498 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8499 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8500 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8501 if (cas->expdHi)
8502 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8503 if (cas->dataHi)
8504 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8505 break;
8507 case Ist_AbiHint:
8508 // Both exprs are used in unknown ways. TODO: can we safely
8509 // just ignore AbiHints?
8510 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8511 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8512 break;
8513 case Ist_StoreG: {
8514 // We might be able to do better, and use HuPCa for the addr.
8515 // It's not immediately obvious that we can, because the address
8516 // is regarded as "used" only when the guard is true.
8517 IRStoreG* sg = st->Ist.StoreG.details;
8518 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8519 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8520 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8521 break;
8523 case Ist_LoadG: {
8524 // Per similar comments to Ist_StoreG .. not sure whether this
8525 // is really optimal.
8526 IRLoadG* lg = st->Ist.LoadG.details;
8527 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8528 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8529 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8530 break;
8532 case Ist_LLSC: {
8533 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8534 if (st->Ist.LLSC.storedata)
8535 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8536 break;
8538 case Ist_MBE:
8539 case Ist_IMark:
8540 case Ist_NoOp:
8541 break;
8542 default: {
8543 ppIRStmt(st);
8544 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8547 } // Now work backwards through the stmts.
8549 // Return the computed use env and the bogus-atom flag.
8550 tl_assert(*useEnvP == NULL);
8551 *useEnvP = useEnv;
8553 tl_assert(*hasBogusLiteralsP == False);
8554 *hasBogusLiteralsP = bogus;
8558 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8559 IRSB* sb_in,
8560 const VexGuestLayout* layout,
8561 const VexGuestExtents* vge,
8562 const VexArchInfo* archinfo_host,
8563 IRType gWordTy, IRType hWordTy )
8565 Bool verboze = 0||False;
8566 Int i, j, first_stmt;
8567 IRStmt* st;
8568 MCEnv mce;
8569 IRSB* sb_out;
8571 if (gWordTy != hWordTy) {
8572 /* We don't currently support this case. */
8573 VG_(tool_panic)("host/guest word size mismatch");
8576 /* Check we're not completely nuts */
8577 tl_assert(sizeof(UWord) == sizeof(void*));
8578 tl_assert(sizeof(Word) == sizeof(void*));
8579 tl_assert(sizeof(Addr) == sizeof(void*));
8580 tl_assert(sizeof(ULong) == 8);
8581 tl_assert(sizeof(Long) == 8);
8582 tl_assert(sizeof(UInt) == 4);
8583 tl_assert(sizeof(Int) == 4);
8585 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8587 /* Set up SB */
8588 sb_out = deepCopyIRSBExceptStmts(sb_in);
8590 /* Set up the running environment. Both .sb and .tmpMap are
8591 modified as we go along. Note that tmps are added to both
8592 .sb->tyenv and .tmpMap together, so the valid index-set for
8593 those two arrays should always be identical. */
8594 VG_(memset)(&mce, 0, sizeof(mce));
8595 mce.sb = sb_out;
8596 mce.trace = verboze;
8597 mce.layout = layout;
8598 mce.hWordTy = hWordTy;
8599 mce.tmpHowUsed = NULL;
8601 /* BEGIN decide on expense levels for instrumentation. */
8603 /* Initially, select the cheap version of everything for which we have an
8604 option. */
8605 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8607 /* Take account of the --expensive-definedness-checks= flag. */
8608 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8609 /* We just selected 'cheap for everything', so we don't need to do
8610 anything here. mce.tmpHowUsed remains NULL. */
8612 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8613 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8614 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8616 else {
8617 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
8618 /* We'll make our own selection, based on known per-target constraints
8619 and also on analysis of the block to be instrumented. First, set
8620 up default values for detail levels.
8622 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8623 5 and above. Enable accurate interpretation of the following.
8624 LLVM uses adds for some bitfield inserts, and we get a lot of false
8625 errors if the cheap interpretation is used, alas. Could solve this
8626 much better if we knew which of such adds came from x86/amd64 LEA
8627 instructions, since these are the only ones really needing the
8628 expensive interpretation, but that would require some way to tag
8629 them in the _toIR.c front ends, which is a lot of faffing around.
8630 So for now we use preInstrumentationAnalysis() to detect adds which
8631 are used only to construct memory addresses, which is an
8632 approximation to the above, and is self-contained. */
8633 # if defined(VGA_x86)
8634 mce.dlbo.dl_Add32 = DLauto;
8635 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8636 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8637 # elif defined(VGA_amd64)
8638 mce.dlbo.dl_Add32 = DLexpensive;
8639 mce.dlbo.dl_Add64 = DLauto;
8640 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8641 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8642 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8643 # elif defined(VGA_ppc64le)
8644 // Needed by (at least) set_AV_CR6() in the front end.
8645 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8646 # elif defined(VGA_arm64)
8647 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8648 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8649 # elif defined(VGA_arm)
8650 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8651 # endif
8653 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8654 fill it in. */
8655 Bool hasBogusLiterals = False;
8656 preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
8658 if (hasBogusLiterals) {
8659 /* This happens very rarely. In this case just select expensive
8660 for everything, and throw away the tmp-use analysis results. */
8661 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8662 VG_(free)( mce.tmpHowUsed );
8663 mce.tmpHowUsed = NULL;
8664 } else {
8665 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8666 which will be used for some subset of Iop_{Add,Sub}{32,64},
8667 based on which ones are set to DLauto for this target. */
8671 DetailLevelByOp__check_sanity( &mce.dlbo );
8673 if (0) {
8674 // Debug printing: which tmps have been identified as PCast-only use
8675 if (mce.tmpHowUsed) {
8676 VG_(printf)("Cheapies: ");
8677 for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
8678 if (mce.tmpHowUsed[q] == HuPCa) {
8679 VG_(printf)("t%u ", q);
8682 VG_(printf)("\n");
8685 // Debug printing: number of ops by detail level
8686 UChar nCheap = DetailLevelByOp__count( &mce.dlbo, DLcheap );
8687 UChar nAuto = DetailLevelByOp__count( &mce.dlbo, DLauto );
8688 UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
8689 tl_assert(nCheap + nAuto + nExpensive == 8);
8691 VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
8693 /* END decide on expense levels for instrumentation. */
8695 /* Initialise the running tmp environment. */
8697 mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
8698 sizeof(TempMapEnt));
8699 VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
8700 for (i = 0; i < sb_in->tyenv->types_used; i++) {
8701 TempMapEnt ent;
8702 ent.kind = Orig;
8703 ent.shadowV = IRTemp_INVALID;
8704 ent.shadowB = IRTemp_INVALID;
8705 VG_(addToXA)( mce.tmpMap, &ent );
8707 tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
8709 /* Finally, begin instrumentation. */
8710 /* Copy verbatim any IR preamble preceding the first IMark */
8712 tl_assert(mce.sb == sb_out);
8713 tl_assert(mce.sb != sb_in);
8715 i = 0;
8716 while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
8718 st = sb_in->stmts[i];
8719 tl_assert(st);
8720 tl_assert(isFlatIRStmt(st));
8722 stmt( 'C', &mce, sb_in->stmts[i] );
8723 i++;
8726 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8727 cause the IR following the preamble to contain references to IR
8728 temporaries defined in the preamble. Because the preamble isn't
8729 instrumented, these temporaries don't have any shadows.
8730 Nevertheless uses of them following the preamble will cause
8731 memcheck to generate references to their shadows. End effect is
8732 to cause IR sanity check failures, due to references to
8733 non-existent shadows. This is only evident for the complex
8734 preambles used for function wrapping on TOC-afflicted platforms
8735 (ppc64-linux).
8737 The following loop therefore scans the preamble looking for
8738 assignments to temporaries. For each one found it creates an
8739 assignment to the corresponding (V) shadow temp, marking it as
8740 'defined'. This is the same resulting IR as if the main
8741 instrumentation loop before had been applied to the statement
8742 'tmp = CONSTANT'.
8744 Similarly, if origin tracking is enabled, we must generate an
8745 assignment for the corresponding origin (B) shadow, claiming
8746 no-origin, as appropriate for a defined value.
8747 */
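/* For instance (illustrative temp number and guest offset only): if
   the preamble contains

      t11 = GET:I64(OFFSET_OF_SOME_GUEST_REG)

   then this loop adds, after the copied preamble,

      t11_v = 0x0:I64      (V shadow: all bits defined)
      t11_b = 0x0:I32      (B/origin shadow: no origin; mc-level 3 only)

   where t11_v and t11_b stand for whatever shadow temps
   findShadowTmpV and findShadowTmpB hand back for t11. */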
8748 for (j = 0; j < i; j++) {
8749 if (sb_in->stmts[j]->tag == Ist_WrTmp) {
8750 /* findShadowTmpV checks its arg is an original tmp;
8751 no need to assert that here. */
8752 IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
8753 IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
8754 IRType ty_v = typeOfIRTemp(sb_out->tyenv, tmp_v);
8755 assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
8756 if (MC_(clo_mc_level) == 3) {
8757 IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
8758 tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
8759 assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
8760 }
8761 if (0) {
8762 VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
8763 ppIRType( ty_v );
8764 VG_(printf)("\n");
8765 }
8766 }
8767 }
8769 /* Iterate over the remaining stmts to generate instrumentation. */
8771 tl_assert(sb_in->stmts_used > 0);
8772 tl_assert(i >= 0);
8773 tl_assert(i < sb_in->stmts_used);
8774 tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
8776 for (/* use current i*/; i < sb_in->stmts_used; i++) {
8778 st = sb_in->stmts[i];
8779 first_stmt = sb_out->stmts_used;
8781 if (verboze) {
8782 VG_(printf)("\n");
8783 ppIRStmt(st);
8784 VG_(printf)("\n");
8785 }
8787 if (MC_(clo_mc_level) == 3) {
8788 /* See comments on case Ist_CAS below. */
8789 if (st->tag != Ist_CAS)
8790 schemeS( &mce, st );
8791 }
8793 /* Generate instrumentation code for each stmt ... */
8795 switch (st->tag) {
8797 case Ist_WrTmp: {
8798 IRTemp dst = st->Ist.WrTmp.tmp;
8799 tl_assert(dst < (UInt)sb_in->tyenv->types_used);
8800 HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
8801 : HuOth/*we don't know, so play safe*/;
8802 assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
8803 expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
8804 break;
8805 }
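/* Illustrative example (temp numbers hypothetical): for an input
   statement

      t7 = Add32(t5,t6)

   this case emits an assignment of the definedness approximation
   computed by expr2vbits to t7's V-shadow temp, roughly

      t7_v = <vbits expression derived from Add32(t5,t6)>

   while the original statement itself is copied to sb_out further
   down, after the switch. */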
8807 case Ist_Put:
8808 do_shadow_PUT( &mce,
8809 st->Ist.Put.offset,
8810 st->Ist.Put.data,
8811 NULL /* shadow atom */, NULL /* guard */ );
8812 break;
8814 case Ist_PutI:
8815 do_shadow_PUTI( &mce, st->Ist.PutI.details);
8816 break;
8818 case Ist_Store:
8819 do_shadow_Store( &mce, st->Ist.Store.end,
8820 st->Ist.Store.addr, 0/* addr bias */,
8821 st->Ist.Store.data,
8822 NULL /* shadow data */,
8823 NULL/*guard*/ );
8824 break;
8826 case Ist_StoreG:
8827 do_shadow_StoreG( &mce, st->Ist.StoreG.details );
8828 break;
8830 case Ist_LoadG:
8831 do_shadow_LoadG( &mce, st->Ist.LoadG.details );
8832 break;
8834 case Ist_Exit:
8835 complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
8836 break;
8838 case Ist_IMark:
8839 break;
8841 case Ist_NoOp:
8842 case Ist_MBE:
8843 break;
8845 case Ist_Dirty:
8846 do_shadow_Dirty( &mce, st->Ist.Dirty.details );
8847 break;
8849 case Ist_AbiHint:
8850 do_AbiHint( &mce, st->Ist.AbiHint.base,
8851 st->Ist.AbiHint.len,
8852 st->Ist.AbiHint.nia );
8853 break;
8855 case Ist_CAS:
8856 do_shadow_CAS( &mce, st->Ist.CAS.details );
8857 /* Note, do_shadow_CAS copies the CAS itself to the output
8858 block, because it needs to add instrumentation both
8859 before and after it. Hence skip the copy below. Also
8860 skip the origin-tracking stuff (call to schemeS) above,
8861 since that's all tangled up with it too; do_shadow_CAS
8862 does it all. */
8863 break;
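/* A sketch of the shape do_shadow_CAS produces (heavily simplified,
   single-element case, names illustrative):

      check-definedness( addr )                    before the CAS
      oldV   = shadow-load( addr )                 V bits of the old value
      old    = CASle( addr :: expd -> new )        the CAS itself
      expdEq = CmpEQ( old, expd )                  did the store succeed?
      if (expdEq) shadow-store( addr, newV )       update V bits on success

   which is why the statement cannot simply be copied after the switch
   like the other cases. */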
8865 case Ist_LLSC:
8866 do_shadow_LLSC( &mce,
8867 st->Ist.LLSC.end,
8868 st->Ist.LLSC.result,
8869 st->Ist.LLSC.addr,
8870 st->Ist.LLSC.storedata );
8871 break;
8873 default:
8874 VG_(printf)("\n");
8875 ppIRStmt(st);
8876 VG_(printf)("\n");
8877 VG_(tool_panic)("memcheck: unhandled IRStmt");
8879 } /* switch (st->tag) */
8881 if (0 && verboze) {
8882 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8883 VG_(printf)(" ");
8884 ppIRStmt(sb_out->stmts[j]);
8885 VG_(printf)("\n");
8886 }
8887 VG_(printf)("\n");
8888 }
8890 /* ... and finally copy the stmt itself to the output. Except,
8891 skip the copy of IRCASs; see comments on case Ist_CAS
8892 above. */
8893 if (st->tag != Ist_CAS)
8894 stmt('C', &mce, st);
8896 }
8897 /* Now we need to complain if the jump target is undefined. */
8898 first_stmt = sb_out->stmts_used;
8900 if (verboze) {
8901 VG_(printf)("sb_in->next = ");
8902 ppIRExpr(sb_in->next);
8903 VG_(printf)("\n\n");
8904 }
8906 complainIfUndefined( &mce, sb_in->next, NULL );
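/* Roughly what complainIfUndefined expands to here (sketch only; the
   exact helper and arguments depend on operand width and on whether
   origin tracking is active):

      guard = <1-bit pessimising cast of the V bits of sb_in->next>
      DIRTY guard ::: MC_(helperc_value_checkN_fail_*)( ... )

   i.e. a guarded helper call that reports an error at run time only
   when the branch target actually contains undefined bits. */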
8908 if (0 && verboze) {
8909 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8910 VG_(printf)(" ");
8911 ppIRStmt(sb_out->stmts[j]);
8912 VG_(printf)("\n");
8913 }
8914 VG_(printf)("\n");
8915 }
8917 /* If this fails, there's been some serious snafu with tmp management,
8918 which should be investigated. */
8919 tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
8920 VG_(deleteXA)( mce.tmpMap );
8922 if (mce.tmpHowUsed) {
8923 VG_(free)( mce.tmpHowUsed );
8924 }
8926 tl_assert(mce.sb == sb_out);
8927 return sb_out;
8928 }
8931 /*--------------------------------------------------------------------*/
8932 /*--- end mc_translate.c ---*/
8933 /*--------------------------------------------------------------------*/