2 /*--------------------------------------------------------------------*/
3 /*--- LibHB: a library for implementing and checking ---*/
4 /*--- the happens-before relationship in concurrent programs. ---*/
5 /*--- libhb_core.c ---*/
6 /*--------------------------------------------------------------------*/
8 /*
9 This file is part of LibHB, a library for implementing and checking
10 the happens-before relationship in concurrent programs.
12 Copyright (C) 2008-2017 OpenWorks Ltd
13 info@open-works.co.uk
15 This program is free software; you can redistribute it and/or
16 modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation; either version 2 of the
18 License, or (at your option) any later version.
20 This program is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, see <http://www.gnu.org/licenses/>.
28 The GNU General Public License is contained in the file COPYING.
31 #include "pub_tool_basics.h"
32 #include "pub_tool_poolalloc.h"
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcbase.h"
35 #include "pub_tool_libcprint.h"
36 #include "pub_tool_machine.h"
37 #include "pub_tool_mallocfree.h"
38 #include "pub_tool_wordfm.h"
39 #include "pub_tool_hashtable.h"
40 #include "pub_tool_xarray.h"
41 #include "pub_tool_oset.h"
42 #include "pub_tool_threadstate.h"
43 #include "pub_tool_aspacemgr.h"
44 #include "pub_tool_stacktrace.h"
45 #include "pub_tool_execontext.h"
46 #include "pub_tool_errormgr.h"
47 #include "pub_tool_debuginfo.h"
48 #include "pub_tool_gdbserver.h"
49 #include "pub_tool_options.h" // VG_(clo_stats)
50 #include "hg_basics.h"
51 #include "hg_wordset.h"
52 #include "hg_lock_n_thread.h"
53 #include "hg_errors.h"
55 #include "libhb.h"
58 /////////////////////////////////////////////////////////////////
59 /////////////////////////////////////////////////////////////////
60 // //
61 // Debugging #defines //
62 // //
63 /////////////////////////////////////////////////////////////////
64 /////////////////////////////////////////////////////////////////
66 /* Check the sanity of shadow values in the core memory state
67 machine. Change #if 0 to #if 1 to enable this. */
68 #if 0
69 # define CHECK_MSM 1
70 #else
71 # define CHECK_MSM 0
72 #endif
75 /* Check sanity (reference counts, etc) in the conflicting access
76 machinery. Change #if 0 to #if 1 to enable this. */
77 #if 0
78 # define CHECK_CEM 1
79 #else
80 # define CHECK_CEM 0
81 #endif
84 /* Check sanity in the compressed shadow memory machinery,
85 particularly in its caching innards. Unfortunately there's no
86 almost-zero-cost way to make them selectable at run time. Hence
87 set the #if 0 to #if 1 and rebuild if you want them. */
88 #if 0
89 # define CHECK_ZSM 1 /* do sanity-check CacheLine stuff */
90 # define inline __attribute__((noinline))
91 /* probably want to ditch -fomit-frame-pointer too */
92 #else
93 # define CHECK_ZSM 0 /* don't sanity-check CacheLine stuff */
94 #endif
96 /* Define to 1 to activate tracing cached rcec. */
97 #define DEBUG_CACHED_RCEC 0
99 /////////////////////////////////////////////////////////////////
100 /////////////////////////////////////////////////////////////////
101 // //
102 // data decls: VtsID //
103 // //
104 /////////////////////////////////////////////////////////////////
105 /////////////////////////////////////////////////////////////////
107 /* VtsIDs: Unique small-integer IDs for VTSs. VtsIDs can't exceed 30
108 bits, since they have to be packed into the lowest 30 bits of an
109 SVal. */
110 typedef UInt VtsID;
111 #define VtsID_INVALID 0xFFFFFFFF
115 /////////////////////////////////////////////////////////////////
116 /////////////////////////////////////////////////////////////////
117 // //
118 // data decls: SVal //
119 // //
120 /////////////////////////////////////////////////////////////////
121 /////////////////////////////////////////////////////////////////
123 typedef ULong SVal;
125 /* This value has special significance to the implementation, and callers
126 may not store it in the shadow memory. */
127 #define SVal_INVALID (3ULL << 62)
129 /* This is the default value for shadow memory. Initially the shadow
130 memory contains no accessible areas and so all reads produce this
131 value. TODO: make this caller-defineable. */
132 #define SVal_NOACCESS (2ULL << 62)
136 /////////////////////////////////////////////////////////////////
137 /////////////////////////////////////////////////////////////////
138 // //
139 // data decls: ScalarTS //
140 // //
141 /////////////////////////////////////////////////////////////////
142 /////////////////////////////////////////////////////////////////
144 /* Scalar Timestamp. We have to store a lot of these, so there is
145 some effort to make them as small as possible. Logically they are
146 a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
147 We pack it into 64 bits by representing the Thr* using a ThrID, a
148 small integer (18 bits), and a 46 bit integer for the timestamp
149 number. The 46/18 split is arbitrary, but has the effect that
150 Helgrind can only handle programs that create 2^18 or fewer threads
151 over their entire lifetime, and have no more than 2^46 timestamp
152 ticks (synchronisation operations on the same thread).
154 This doesn't seem like much of a limitation. 2^46 ticks is
155 7.06e+13, and if each tick (optimistically) takes the machine 1000
156 cycles to process, then the minimum time to process that many ticks
157 at a clock rate of 5 GHz is 162.9 days. And that's doing nothing
158 but VTS ticks, which isn't realistic.
160 NB1: SCALARTS_N_THRBITS must be 27 or lower. The obvious limit is
161 32 since a ThrID is a UInt. 27 comes from the fact that
162 'Thr_n_RCEC', which records information about old accesses, packs
163 in tsw not only a ThrID but also minimum 4+1 other bits (access size
164 and writeness) in a UInt, hence limiting size to 32-(4+1) == 27.
166 NB2: thrid values are issued upwards from 1024, and values less
167 than that aren't valid. This isn't per se necessary (any order
168 will do, so long as they are unique), but it does help ensure they
169 are less likely to get confused with the various other kinds of
170 small-integer thread ids drifting around (eg, TId).
171 So, SCALARTS_N_THRBITS must be 11 or more.
172 See also NB5.
174 NB3: this probably also relies on the fact that Thr's are never
175 deallocated -- they exist forever. Hence the 1-1 mapping from
176 Thr's to thrid values (set up in Thr__new) persists forever.
178 NB4: temp_max_sized_VTS is allocated at startup and never freed.
179 It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
180 ScalarTSs. So we can't make SCALARTS_N_THRBITS too large without
181 making the memory use for this go sky-high. With
182 SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
183 like an OK tradeoff. If more than 256k threads need to be
184 supported, we could change SCALARTS_N_THRBITS to 20, which would
185 facilitate supporting 1 million threads at the cost of 8MB storage
186 for temp_max_sized_VTS.
188 NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
189 ThrID == 0 to denote an empty Thr_n_RCEC record. So ThrID == 0
190 must never be a valid ThrID. Given NB2 that's OK.
192 #define SCALARTS_N_THRBITS 18 /* valid range: 11 to 27 inclusive,
193 See NB1 and NB2 above. */
195 #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
196 typedef
197 struct {
198 ThrID thrid : SCALARTS_N_THRBITS;
199 ULong tym : SCALARTS_N_TYMBITS;
201 ScalarTS;
203 #define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
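/* Illustrative sketch (not part of the library): packing a (thrid, tym)
   pair into a single 64-bit ScalarTS, as described above.  With
   SCALARTS_N_THRBITS == 18, ThrID_MAX_VALID is 262143 and valid thrids
   are issued from 1024 upwards (see NB2), leaving 46 bits for the
   timestamp. */
__attribute__((unused))
static ScalarTS example_mk_ScalarTS ( ThrID thrid, ULong tym )
{
   ScalarTS ts;
   tl_assert(thrid >= 1024 && thrid <= ThrID_MAX_VALID);
   ts.thrid = thrid;
   ts.tym   = tym;   /* silently truncated to SCALARTS_N_TYMBITS bits */
   return ts;
}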
207 /////////////////////////////////////////////////////////////////
208 /////////////////////////////////////////////////////////////////
209 // //
210 // data decls: Filter //
211 // //
212 /////////////////////////////////////////////////////////////////
213 /////////////////////////////////////////////////////////////////
215 // baseline: 5, 9
216 #define FI_LINE_SZB_LOG2 5
217 #define FI_NUM_LINES_LOG2 10
219 #define FI_LINE_SZB (1 << FI_LINE_SZB_LOG2)
220 #define FI_NUM_LINES (1 << FI_NUM_LINES_LOG2)
222 #define FI_TAG_MASK (~(Addr)(FI_LINE_SZB - 1))
223 #define FI_GET_TAG(_a) ((_a) & FI_TAG_MASK)
225 #define FI_GET_LINENO(_a) ( ((_a) >> FI_LINE_SZB_LOG2) \
226 & (Addr)(FI_NUM_LINES-1) )
229 /* In the lines, each 8 bytes are treated individually, and are mapped
230 to a UShort. Regardless of endianness of the underlying machine,
231 bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
232 the highest address.
234 Of each bit pair, the higher numbered bit is set if an R has been
235 seen, so the actual layout is:
237 15 14 ... 01 00
239 R W for addr+7 ... R W for addr+0
241 So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
244 /* tags are separated from lines. tags are Addrs and are
245 the base address of the line. */
246 typedef
247 struct {
248 UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
250 FiLine;
252 typedef
253 struct {
254 Addr tags[FI_NUM_LINES];
255 FiLine lines[FI_NUM_LINES];
257 Filter;
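/* Illustrative sketch (not part of the library): how an address maps to
   its filter line and to the R/W bit pair covering that byte.  'fi' is
   a hypothetical filter.  Returns (R<<1)|W for the byte at 'a', or 0 if
   the selected line does not currently cover 'a'. */
__attribute__((unused))
static UWord example_Filter_bits_for_byte ( Filter* fi, Addr a )
{
   Addr    atag   = FI_GET_TAG(a);      /* base address of the line */
   UWord   lineno = FI_GET_LINENO(a);   /* index into tags[]/lines[] */
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - atag) / 8;     /* which UShort covers 'a' */
   UWord   bitpos = 2 * (a & 7);        /* W bit; the R bit is bitpos+1 */
   if (fi->tags[lineno] != atag)
      return 0; /* line holds some other tag; no information for 'a' */
   return (line->u16s[loff] >> bitpos) & 3;
}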
261 /////////////////////////////////////////////////////////////////
262 /////////////////////////////////////////////////////////////////
263 // //
264 // data decls: Thr, ULong_n_EC //
265 // //
266 /////////////////////////////////////////////////////////////////
267 /////////////////////////////////////////////////////////////////
269 // Records stacks for H1 history mechanism (DRD-style)
270 typedef
271 struct { ULong ull; ExeContext* ec; }
272 ULong_n_EC;
275 /* How many of the above records to collect for each thread? Older
276 ones are dumped when we run out of space. 62.5k requires 1MB per
277 thread, since each ULong_n_EC record is 16 bytes long. When more
278 than N_KWs_N_STACKs_PER_THREAD are present, the older half are
279 deleted to make space. Hence in the worst case we will be able to
280 produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
281 Kw transitions (segments in this thread). For the current setting
282 that gives a guaranteed stack for at least the last 31.25k
283 segments. */
284 #define N_KWs_N_STACKs_PER_THREAD 62500
287 UInt HG_(clo_history_backtrace_size) = 8;
289 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
290 #define RCEC_MAGIC 0xab88abb2UL
292 /* RCEC usage is commented in more detail in the section 'Change-event map2'
293 later in this file */
294 typedef
295 struct _RCEC {
296 UWord magic; /* sanity check only */
297 struct _RCEC* next;
298 UWord rc;
299 UWord rcX; /* used for crosschecking */
300 UWord frames_hash; /* hash of all the frames */
301 UWord frames[0];
302 /* Variable-length array.
303 The size depends on HG_(clo_history_backtrace_size). */
305 RCEC;
307 struct _Thr {
308 /* Current VTSs for this thread. They change as we go along. viR
309 is the VTS to be used for reads, viW for writes. Usually they
310 are the same, but can differ when we deal with reader-writer
311 locks. It is always the case that
312 VtsID__cmpLEQ(viW,viR) == True
313 that is, viW must be the same, or lagging behind, viR. */
314 VtsID viR;
315 VtsID viW;
317 /* Is initially False, and is set to True after the thread really
318 has done a low-level exit. When True, we expect to never see
319 any more memory references done by this thread. */
320 Bool llexit_done;
322 /* Is initially False, and is set to True after the thread has been
323 joined with (reaped by some other thread). After this point, we
324 do not expect to see any uses of .viR or .viW, so it is safe to
325 set them to VtsID_INVALID. */
326 Bool joinedwith_done;
328 /* A small integer giving a unique identity to this Thr. See
329 comments on the definition of ScalarTS for details. */
330 ThrID thrid : SCALARTS_N_THRBITS;
332 /* A filter that removes references for which we believe that
333 msmcread/msmcwrite will not change the state, nor report a
334 race. */
335 Filter* filter;
337 /* A pointer back to the top level Thread structure. There is a
338 1-1 mapping between Thread and Thr structures -- each Thr points
339 at its corresponding Thread, and vice versa. Really, Thr and
340 Thread should be merged into a single structure. */
341 Thread* hgthread;
343 /* The ULongs (scalar Kws) in this accumulate in strictly
344 increasing order, without duplicates. This is important because
345 we need to be able to find a given scalar Kw in this array
346 later, by binary search. */
347 XArray* /* ULong_n_EC */ local_Kws_n_stacks;
349 /* cached_rcec maintains the last RCEC that was retrieved for this thread. */
350 RCEC cached_rcec;
351 // cached_rcec value, not ref-counted.
352 // As the last member of an RCEC is a variable length array, this must be
353 // the last element of the _Thr struct.
355 /* The shadow register vex_shadow1 SP register (SP_s1) is used to maintain
356 the validity of the cached rcec.
357 If SP_s1 is 0, then the cached rcec is invalid (cannot be used).
358 If SP_s1 is != 0, then the cached rcec is valid. The valid cached rcec
359 can be used to generate a new RCEC by changing just the last frame. */
365 /////////////////////////////////////////////////////////////////
366 /////////////////////////////////////////////////////////////////
367 // //
368 // data decls: SO //
369 // //
370 /////////////////////////////////////////////////////////////////
371 /////////////////////////////////////////////////////////////////
373 // (UInt) `echo "Synchronisation object" | md5sum`
374 #define SO_MAGIC 0x56b3c5b0U
376 struct _SO {
377 struct _SO* admin_prev;
378 struct _SO* admin_next;
379 VtsID viR; /* r-clock of sender */
380 VtsID viW; /* w-clock of sender */
381 UInt magic;
386 /////////////////////////////////////////////////////////////////
387 /////////////////////////////////////////////////////////////////
388 // //
389 // Forward declarations //
390 // //
391 /////////////////////////////////////////////////////////////////
392 /////////////////////////////////////////////////////////////////
394 /* fwds for
395 Globals needed by other parts of the library. These are set
396 once at startup and then never changed. */
397 static void (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
398 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
400 /* misc fn and data fwdses */
401 static void VtsID__rcinc ( VtsID ii );
402 static void VtsID__rcdec ( VtsID ii );
404 static inline Bool SVal__isC ( SVal s );
405 static inline VtsID SVal__unC_Rmin ( SVal s );
406 static inline VtsID SVal__unC_Wmin ( SVal s );
407 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
408 static inline void SVal__rcinc ( SVal s );
409 static inline void SVal__rcdec ( SVal s );
410 /* SVal in LineZ are used to store various pointers. */
411 static inline void *SVal2Ptr (SVal s);
412 static inline SVal Ptr2SVal (void* ptr);
414 /* A double linked list of all the SO's. */
415 SO* admin_SO;
419 /////////////////////////////////////////////////////////////////
420 /////////////////////////////////////////////////////////////////
421 // //
422 // SECTION BEGIN compressed shadow memory //
423 // //
424 /////////////////////////////////////////////////////////////////
425 /////////////////////////////////////////////////////////////////
427 #ifndef __HB_ZSM_H
428 #define __HB_ZSM_H
430 /* Initialise the library. Once initialised, it will (or may) call
431 SVal__rcinc and SVal__rcdec in response to all the calls below, in order to
432 allow the user to do reference counting on the SVals stored herein.
433 It is important to understand, however, that due to internal
434 caching, the reference counts are in general inaccurate, and can be
435 both above or below the true reference count for an item. In
436 particular, the library may indicate that the reference count for
437 an item is zero, when in fact it is not.
439 To make the reference counting exact and therefore non-pointless,
440 call zsm_flush_cache. Immediately after it returns, the reference
441 counts for all items, as deduced by the caller by observing calls
442 to SVal__rcinc and SVal__rcdec, will be correct, and so any items with a
443 zero reference count may be freed (or at least considered to be
444 unreferenced by this library).
446 static void zsm_init ( void );
448 static void zsm_sset_range ( Addr, SizeT, SVal );
449 static void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew );
450 static void zsm_scopy_range ( Addr, Addr, SizeT );
451 static void zsm_flush_cache ( void );
453 #endif /* ! __HB_ZSM_H */
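/* Illustrative usage note (not part of the library): a caller that
   wants to reclaim SVals based on the reference counts it observes via
   SVal__rcinc/SVal__rcdec must call zsm_flush_cache() first; only
   immediately after that call are the observed counts exact, because of
   the internal CacheLine caching described above. */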
456 /* Round a up to the next multiple of N. N must be a power of 2 */
457 #define ROUNDUP(a, N) ((a + N - 1) & ~(N-1))
458 /* Round a down to the next multiple of N. N must be a power of 2 */
459 #define ROUNDDN(a, N) ((a) & ~(N-1))
461 /* True if a belongs in range [start, start + szB[
462 (i.e. start + szB is excluded). */
463 static inline Bool address_in_range (Addr a, Addr start, SizeT szB)
465 /* Checking start <= a && a < start + szB.
466 As start and a are unsigned addresses, the condition can
467 be simplified. */
468 if (CHECK_ZSM)
469 tl_assert ((a - start < szB)
470 == (start <= a
471 && a < start + szB));
472 return a - start < szB;
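/* Worked example of the single-comparison trick above (illustrative
   numbers): with start == 0x1000 and szB == 0x100, a == 0x0FFF gives
   a - start == (Addr)-1, a huge unsigned value >= szB, so the check
   correctly fails; a == 0x10FF gives a - start == 0xFF < 0x100, so it
   correctly succeeds. */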
475 /* ------ CacheLine ------ */
477 #define N_LINE_BITS 6 /* must be >= 3 */
478 #define N_LINE_ARANGE (1 << N_LINE_BITS)
479 #define N_LINE_TREES (N_LINE_ARANGE >> 3)
481 typedef
482 struct {
483 UShort descrs[N_LINE_TREES];
484 SVal svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
486 CacheLine;
488 #define TREE_DESCR_16_0 (1<<0)
489 #define TREE_DESCR_32_0 (1<<1)
490 #define TREE_DESCR_16_1 (1<<2)
491 #define TREE_DESCR_64 (1<<3)
492 #define TREE_DESCR_16_2 (1<<4)
493 #define TREE_DESCR_32_1 (1<<5)
494 #define TREE_DESCR_16_3 (1<<6)
495 #define TREE_DESCR_8_0 (1<<7)
496 #define TREE_DESCR_8_1 (1<<8)
497 #define TREE_DESCR_8_2 (1<<9)
498 #define TREE_DESCR_8_3 (1<<10)
499 #define TREE_DESCR_8_4 (1<<11)
500 #define TREE_DESCR_8_5 (1<<12)
501 #define TREE_DESCR_8_6 (1<<13)
502 #define TREE_DESCR_8_7 (1<<14)
503 #define TREE_DESCR_DTY (1<<15)
505 typedef
506 struct {
507 SVal dict[4]; /* can represent up to 4 diff values in the line */
508 UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
509 dict indexes */
510 /* if dict[0] == SVal_INVALID then dict[1] is a pointer to the
511 LineF to use, and dict[2..] are also SVal_INVALID. */
513 LineZ; /* compressed rep for a cache line */
515 /* LineZ.dict[1] is used to store various pointers:
516 * In the first lineZ of a free SecMap, it points to the next free SecMap.
517 * In a lineZ for which we need to use a lineF, it points to the lineF. */
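/* Illustrative sketch (not part of the library): decoding the SVal for
   byte 'i' of a line held in compressed (Z) form.  Each of the
   N_LINE_ARANGE bytes is represented by a 2-bit index into dict[]; four
   such indexes are packed per ix2s[] byte, lowest-addressed byte in the
   least significant bits.  Assumes the line really is in Z form, i.e.
   dict[0] != SVal_INVALID. */
__attribute__((unused))
static SVal example_LineZ_get_sval ( LineZ* lineZ, UWord i )
{
   UWord ix = (lineZ->ix2s[i >> 2] >> (2 * (i & 3))) & 3;
   return lineZ->dict[ix];
}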
520 typedef
521 struct {
522 SVal w64s[N_LINE_ARANGE];
524 LineF; /* full rep for a cache line */
526 /* We use a pool allocator for LineF, as LineF is relatively small,
527 and we will often alloc/release such lines. */
528 static PoolAlloc* LineF_pool_allocator;
530 /* SVal in a lineZ are used to store various pointers.
531 Below are conversion functions to support that. */
532 static inline LineF *LineF_Ptr (LineZ *lineZ)
534 tl_assert(lineZ->dict[0] == SVal_INVALID);
535 return SVal2Ptr (lineZ->dict[1]);
538 /* Shadow memory.
539 Primary map is a WordFM Addr SecMap*.
540 SecMaps cover some page-size-ish section of address space and hold
541 a compressed representation.
542 CacheLine-sized chunks of SecMaps are copied into a Cache, being
543 decompressed when moved into the cache and recompressed on the
544 way out. Because of this, the cache must operate as a writeback
545 cache, not a writethrough one.
547 Each SecMap must hold a power-of-2 number of CacheLines. Hence
548 N_SECMAP_BITS must >= N_LINE_BITS.
550 #define N_SECMAP_BITS 13
551 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
553 // # CacheLines held by a SecMap
554 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
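/* Worked numbers (illustrative): with N_SECMAP_BITS == 13 and
   N_LINE_BITS == 6, each SecMap covers 8192 bytes of address space and
   holds 8192/64 == 128 LineZs. */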
556 /* The data in the SecMap is held in the array of LineZs. Each LineZ
557 either carries the required data directly, in a compressed
558 representation, or it holds (in .dict[1]) a pointer to a LineF
559 that holds the full representation.
561 As each in-use LineF is referred to by exactly one LineZ,
562 the number of .linesZ[] that refer to a lineF should equal
563 the number of used lineF.
565 RC obligations: the RCs presented to the user include exactly
566 the values in:
567 * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
568 * F reps that are in use
570 Hence the following actions at the following transitions are required:
572 F rep: alloc'd -> freed -- rcdec_LineF
573 F rep: -> alloc'd -- rcinc_LineF
574 Z rep: .dict[0] from other to SVal_INVALID -- rcdec_LineZ
575 Z rep: .dict[0] from SVal_INVALID to other -- rcinc_LineZ
578 typedef
579 struct {
580 UInt magic;
581 LineZ linesZ[N_SECMAP_ZLINES];
583 SecMap;
585 #define SecMap_MAGIC 0x571e58cbU
587 // (UInt) `echo "Free SecMap" | md5sum`
588 #define SecMap_free_MAGIC 0x5a977f30U
590 __attribute__((unused))
591 static inline Bool is_sane_SecMap ( SecMap* sm ) {
592 return sm != NULL && sm->magic == SecMap_MAGIC;
595 /* ------ Cache ------ */
597 #define N_WAY_BITS 16
598 #define N_WAY_NENT (1 << N_WAY_BITS)
600 /* Each tag is the address of the associated CacheLine, rounded down
601 to a CacheLine address boundary. A CacheLine size must be a power
602 of 2 and must be 8 or more. Hence an easy way to initialise the
603 cache so it is empty is to set all the tag values to any value % 8
604 != 0, eg 1. This means all queries in the cache initially miss.
605 It does however require us to detect and not writeback, any line
606 with a bogus tag. */
607 typedef
608 struct {
609 CacheLine lyns0[N_WAY_NENT];
610 Addr tags0[N_WAY_NENT];
612 Cache;
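/* Worked numbers (illustrative): with N_WAY_BITS == 16 the cache is
   direct-mapped with 65536 entries.  Each CacheLine holds 8 tree
   descriptors plus 64 shadow values (8*2 + 64*8 == 528 bytes), so
   cache_shmem occupies roughly 65536 * 528 bytes (~33MB) of lines plus
   65536 tags. */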
614 static inline Bool is_valid_scache_tag ( Addr tag ) {
615 /* a valid tag should be naturally aligned to the start of
616 a CacheLine. */
617 return 0 == (tag & (N_LINE_ARANGE - 1));
621 /* --------- Primary data structures --------- */
623 /* Shadow memory primary map */
624 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
625 static Cache cache_shmem;
628 static UWord stats__secmaps_search = 0; // # SM finds
629 static UWord stats__secmaps_search_slow = 0; // # SM lookupFMs
630 static UWord stats__secmaps_allocd = 0; // # SecMaps issued
631 static UWord stats__secmaps_in_map_shmem = 0; // # SecMaps 'live'
632 static UWord stats__secmaps_scanGC = 0; // # nr of scan GC done.
633 static UWord stats__secmaps_scanGCed = 0; // # SecMaps GC-ed via scan
634 static UWord stats__secmaps_ssetGCed = 0; // # SecMaps GC-ed via setnoaccess
635 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
636 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
637 static UWord stats__secmap_linesZ_bytes = 0; // .. using this much storage
638 static UWord stats__cache_Z_fetches = 0; // # Z lines fetched
639 static UWord stats__cache_Z_wbacks = 0; // # Z lines written back
640 static UWord stats__cache_F_fetches = 0; // # F lines fetched
641 static UWord stats__cache_F_wbacks = 0; // # F lines written back
642 static UWord stats__cache_flushes_invals = 0; // # cache flushes and invals
643 static UWord stats__cache_totrefs = 0; // # total accesses
644 static UWord stats__cache_totmisses = 0; // # misses
645 static ULong stats__cache_make_New_arange = 0; // total arange made New
646 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
647 static UWord stats__cline_normalises = 0; // # calls to cacheline_normalise
648 static UWord stats__cline_cread64s = 0; // # calls to s_m_read64
649 static UWord stats__cline_cread32s = 0; // # calls to s_m_read32
650 static UWord stats__cline_cread16s = 0; // # calls to s_m_read16
651 static UWord stats__cline_cread08s = 0; // # calls to s_m_read8
652 static UWord stats__cline_cwrite64s = 0; // # calls to s_m_write64
653 static UWord stats__cline_cwrite32s = 0; // # calls to s_m_write32
654 static UWord stats__cline_cwrite16s = 0; // # calls to s_m_write16
655 static UWord stats__cline_cwrite08s = 0; // # calls to s_m_write8
656 static UWord stats__cline_sread08s = 0; // # calls to s_m_get8
657 static UWord stats__cline_swrite08s = 0; // # calls to s_m_set8
658 static UWord stats__cline_swrite16s = 0; // # calls to s_m_set16
659 static UWord stats__cline_swrite32s = 0; // # calls to s_m_set32
660 static UWord stats__cline_swrite64s = 0; // # calls to s_m_set64
661 static UWord stats__cline_scopy08s = 0; // # calls to s_m_copy8
662 static UWord stats__cline_64to32splits = 0; // # 64-bit accesses split
663 static UWord stats__cline_32to16splits = 0; // # 32-bit accesses split
664 static UWord stats__cline_16to8splits = 0; // # 16-bit accesses split
665 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
666 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
667 static UWord stats__cline_16to8pulldown = 0; // # calls to pulldown_to_8
668 static UWord stats__vts__tick = 0; // # calls to VTS__tick
669 static UWord stats__vts__join = 0; // # calls to VTS__join
670 static UWord stats__vts__cmpLEQ = 0; // # calls to VTS__cmpLEQ
671 static UWord stats__vts__cmp_structural = 0; // # calls to VTS__cmp_structural
672 static UWord stats__vts_tab_GC = 0; // # nr of vts_tab GC
673 static UWord stats__vts_pruning = 0; // # nr of vts pruning
675 // # calls to VTS__cmp_structural w/ slow case
676 static UWord stats__vts__cmp_structural_slow = 0;
678 // # calls to VTS__indexAt_SLOW
679 static UWord stats__vts__indexat_slow = 0;
681 // # calls to vts_set__find__or__clone_and_add
682 static UWord stats__vts_set__focaa = 0;
684 // # calls to vts_set__find__or__clone_and_add that lead to an
685 // allocation
686 static UWord stats__vts_set__focaa_a = 0;
689 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
690 return a & ~(N_SECMAP_ARANGE - 1);
692 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
693 return a & (N_SECMAP_ARANGE - 1);
697 /*----------------------------------------------------------------*/
698 /*--- map_shmem :: WordFM Addr SecMap ---*/
699 /*--- shadow memory (low level handlers) (shmem__* fns) ---*/
700 /*----------------------------------------------------------------*/
702 /*--------------- SecMap allocation --------------- */
704 static HChar* shmem__bigchunk_next = NULL;
705 static HChar* shmem__bigchunk_end1 = NULL;
707 static void* shmem__bigchunk_alloc ( SizeT n )
709 const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
710 tl_assert(n > 0);
711 n = VG_ROUNDUP(n, 16);
712 tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
713 tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
714 <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
715 if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
716 if (0)
717 VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
718 (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
719 SysRes sres = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
720 if (sr_isError(sres)) {
721 VG_(out_of_memory_NORETURN)(
722 "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE,
723 sr_Err(sres));
725 shmem__bigchunk_next = (void*)(Addr)sr_Res(sres);
726 shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
728 tl_assert(shmem__bigchunk_next);
729 tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
730 tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
731 shmem__bigchunk_next += n;
732 return shmem__bigchunk_next - n;
735 /* SecMaps changed to be fully SVal_NOACCESS are inserted in a list of
736 recycled SecMaps. When a new SecMap is needed, a recycled SecMap
737 will be used in preference to allocating a new SecMap. */
738 /* We make a linked list of SecMap. The first LineZ is re-used to
739 implement the linked list. */
740 /* Returns the SecMap following sm in the free list.
741 NULL if sm is the last SecMap. sm must be on the free list. */
742 static inline SecMap *SecMap_freelist_next ( SecMap* sm )
744 tl_assert (sm);
745 tl_assert (sm->magic == SecMap_free_MAGIC);
746 return SVal2Ptr (sm->linesZ[0].dict[1]);
748 static inline void set_SecMap_freelist_next ( SecMap* sm, SecMap* next )
750 tl_assert (sm);
751 tl_assert (sm->magic == SecMap_free_MAGIC);
752 tl_assert (next == NULL || next->magic == SecMap_free_MAGIC);
753 sm->linesZ[0].dict[1] = Ptr2SVal (next);
756 static SecMap *SecMap_freelist = NULL;
757 static UWord SecMap_freelist_length(void)
759 SecMap *sm;
760 UWord n = 0;
762 sm = SecMap_freelist;
763 while (sm) {
764 n++;
765 sm = SecMap_freelist_next (sm);
767 return n;
770 static void push_SecMap_on_freelist(SecMap* sm)
772 if (0) VG_(message)(Vg_DebugMsg, "%p push\n", sm);
773 sm->magic = SecMap_free_MAGIC;
774 set_SecMap_freelist_next(sm, SecMap_freelist);
775 SecMap_freelist = sm;
777 /* Returns a free SecMap if there is one.
778 Otherwise, returns NULL. */
779 static SecMap *pop_SecMap_from_freelist(void)
781 SecMap *sm;
783 sm = SecMap_freelist;
784 if (sm) {
785 tl_assert (sm->magic == SecMap_free_MAGIC);
786 SecMap_freelist = SecMap_freelist_next (sm);
787 if (0) VG_(message)(Vg_DebugMsg, "%p pop\n", sm);
789 return sm;
792 static SecMap* shmem__alloc_or_recycle_SecMap ( void )
794 Word i, j;
795 SecMap* sm = pop_SecMap_from_freelist();
797 if (!sm) {
798 sm = shmem__bigchunk_alloc( sizeof(SecMap) );
799 stats__secmaps_allocd++;
800 stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
801 stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
802 stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
804 if (0) VG_(printf)("alloc_SecMap %p\n",sm);
805 tl_assert(sm);
806 sm->magic = SecMap_MAGIC;
807 for (i = 0; i < N_SECMAP_ZLINES; i++) {
808 sm->linesZ[i].dict[0] = SVal_NOACCESS;
809 sm->linesZ[i].dict[1] = SVal_INVALID;
810 sm->linesZ[i].dict[2] = SVal_INVALID;
811 sm->linesZ[i].dict[3] = SVal_INVALID;
812 for (j = 0; j < N_LINE_ARANGE/4; j++)
813 sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
815 return sm;
818 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
819 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
821 static SecMap* shmem__find_SecMap ( Addr ga )
823 SecMap* sm = NULL;
824 Addr gaKey = shmem__round_to_SecMap_base(ga);
825 // Cache
826 stats__secmaps_search++;
827 if (LIKELY(gaKey == smCache[0].gaKey))
828 return smCache[0].sm;
829 if (LIKELY(gaKey == smCache[1].gaKey)) {
830 SMCacheEnt tmp = smCache[0];
831 smCache[0] = smCache[1];
832 smCache[1] = tmp;
833 return smCache[0].sm;
835 if (gaKey == smCache[2].gaKey) {
836 SMCacheEnt tmp = smCache[1];
837 smCache[1] = smCache[2];
838 smCache[2] = tmp;
839 return smCache[1].sm;
841 // end Cache
842 stats__secmaps_search_slow++;
843 if (VG_(lookupFM)( map_shmem,
844 NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
845 tl_assert(sm != NULL);
846 smCache[2] = smCache[1];
847 smCache[1] = smCache[0];
848 smCache[0].gaKey = gaKey;
849 smCache[0].sm = sm;
850 } else {
851 tl_assert(sm == NULL);
853 return sm;
856 /* Scan the SecMaps and count how many of them can be GC-ed.
857 If 'really' is True, actually does the GC of those SecMaps. */
858 /* NOT TO BE CALLED FROM WITHIN libzsm. */
859 static UWord next_SecMap_GC_at = 1000;
860 __attribute__((noinline))
861 static UWord shmem__SecMap_do_GC(Bool really)
863 UWord secmapW = 0;
864 Addr gaKey;
865 UWord examined = 0;
866 UWord ok_GCed = 0;
868 /* First invalidate the smCache */
869 smCache[0].gaKey = 1;
870 smCache[1].gaKey = 1;
871 smCache[2].gaKey = 1;
872 STATIC_ASSERT (3 == sizeof(smCache)/sizeof(smCache[0]));
874 VG_(initIterFM)( map_shmem );
875 while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
876 UWord i;
877 UWord j;
878 UWord n_linesF = 0;
879 SecMap* sm = (SecMap*)secmapW;
880 tl_assert(sm->magic == SecMap_MAGIC);
881 Bool ok_to_GC = True;
883 examined++;
885 /* Deal with the LineZs and the possible LineF of a LineZ. */
886 for (i = 0; i < N_SECMAP_ZLINES && ok_to_GC; i++) {
887 LineZ* lineZ = &sm->linesZ[i];
888 if (lineZ->dict[0] != SVal_INVALID) {
889 ok_to_GC = lineZ->dict[0] == SVal_NOACCESS
890 && !SVal__isC (lineZ->dict[1])
891 && !SVal__isC (lineZ->dict[2])
892 && !SVal__isC (lineZ->dict[3]);
893 } else {
894 LineF *lineF = LineF_Ptr(lineZ);
895 n_linesF++;
896 for (j = 0; j < N_LINE_ARANGE && ok_to_GC; j++)
897 ok_to_GC = lineF->w64s[j] == SVal_NOACCESS;
900 if (ok_to_GC)
901 ok_GCed++;
902 if (ok_to_GC && really) {
903 SecMap *fm_sm;
904 Addr fm_gaKey;
905 /* We cannot remove a SecMap from map_shmem while iterating.
906 So, stop iteration, remove from map_shmem, recreate the iteration
907 on the next SecMap. */
908 VG_(doneIterFM) ( map_shmem );
909 /* No need to rcdec linesZ or linesF, these are all SVal_NOACCESS.
910 We just need to free the lineF referenced by the linesZ. */
911 if (n_linesF > 0) {
912 for (i = 0; i < N_SECMAP_ZLINES && n_linesF > 0; i++) {
913 LineZ* lineZ = &sm->linesZ[i];
914 if (lineZ->dict[0] == SVal_INVALID) {
915 VG_(freeEltPA)( LineF_pool_allocator, LineF_Ptr(lineZ) );
916 n_linesF--;
920 if (!VG_(delFromFM)(map_shmem, &fm_gaKey, (UWord*)&fm_sm, gaKey))
921 tl_assert (0);
922 stats__secmaps_in_map_shmem--;
923 tl_assert (gaKey == fm_gaKey);
924 tl_assert (sm == fm_sm);
925 stats__secmaps_scanGCed++;
926 push_SecMap_on_freelist (sm);
927 VG_(initIterAtFM) (map_shmem, gaKey + N_SECMAP_ARANGE);
930 VG_(doneIterFM)( map_shmem );
932 if (really) {
933 stats__secmaps_scanGC++;
934 /* Next GC when we approach the max allocated */
935 next_SecMap_GC_at = stats__secmaps_allocd - 1000;
936 /* Unless we GCed less than 10%: in that case, allow 10% more
937 allocation before the next GC. This avoids doing a lot of costly
938 GCs in the worst case: the 'growing phase' of an application
939 that allocates a lot of memory.
940 The worst case can be reproduced e.g. by
941 perf/memrw -t 30000000 -b 1000 -r 1 -l 1
942 which allocates around 30GB of memory. */
943 if (ok_GCed < stats__secmaps_allocd/10)
944 next_SecMap_GC_at = stats__secmaps_allocd + stats__secmaps_allocd/10;
948 if (VG_(clo_stats) && really) {
949 VG_(message)(Vg_DebugMsg,
950 "libhb: SecMap GC: #%lu scanned %lu, GCed %lu,"
951 " next GC at %lu\n",
952 stats__secmaps_scanGC, examined, ok_GCed,
953 next_SecMap_GC_at);
956 return ok_GCed;
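/* Worked example of the threshold update above (illustrative numbers):
   with stats__secmaps_allocd == 10000, a scan that GCed 2000 SecMaps
   (>= 10%) sets next_SecMap_GC_at to 9000; a scan that GCed only 500
   (< 10%) pushes it out to 11000 instead. */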
959 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
961 SecMap* sm = shmem__find_SecMap ( ga );
962 if (LIKELY(sm)) {
963 if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
964 return sm;
965 } else {
966 /* create a new one */
967 Addr gaKey = shmem__round_to_SecMap_base(ga);
968 sm = shmem__alloc_or_recycle_SecMap();
969 tl_assert(sm);
970 VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
971 stats__secmaps_in_map_shmem++;
972 if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
973 return sm;
977 /* Returns the nr of linesF which are in use. Note: this is scanning
978 the secmap wordFM. So, this is to be used for statistics only. */
979 __attribute__((noinline))
980 static UWord shmem__SecMap_used_linesF(void)
982 UWord secmapW = 0;
983 Addr gaKey;
984 UWord inUse = 0;
986 VG_(initIterFM)( map_shmem );
987 while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
988 UWord i;
989 SecMap* sm = (SecMap*)secmapW;
990 tl_assert(sm->magic == SecMap_MAGIC);
992 for (i = 0; i < N_SECMAP_ZLINES; i++) {
993 LineZ* lineZ = &sm->linesZ[i];
994 if (lineZ->dict[0] == SVal_INVALID)
995 inUse++;
998 VG_(doneIterFM)( map_shmem );
1000 return inUse;
1003 /* ------------ LineF and LineZ related ------------ */
1005 static void rcinc_LineF ( LineF* lineF ) {
1006 UWord i;
1007 for (i = 0; i < N_LINE_ARANGE; i++)
1008 SVal__rcinc(lineF->w64s[i]);
1011 static void rcdec_LineF ( LineF* lineF ) {
1012 UWord i;
1013 for (i = 0; i < N_LINE_ARANGE; i++)
1014 SVal__rcdec(lineF->w64s[i]);
1017 static void rcinc_LineZ ( LineZ* lineZ ) {
1018 tl_assert(lineZ->dict[0] != SVal_INVALID);
1019 SVal__rcinc(lineZ->dict[0]);
1020 if (lineZ->dict[1] != SVal_INVALID) SVal__rcinc(lineZ->dict[1]);
1021 if (lineZ->dict[2] != SVal_INVALID) SVal__rcinc(lineZ->dict[2]);
1022 if (lineZ->dict[3] != SVal_INVALID) SVal__rcinc(lineZ->dict[3]);
1025 static void rcdec_LineZ ( LineZ* lineZ ) {
1026 tl_assert(lineZ->dict[0] != SVal_INVALID);
1027 SVal__rcdec(lineZ->dict[0]);
1028 if (lineZ->dict[1] != SVal_INVALID) SVal__rcdec(lineZ->dict[1]);
1029 if (lineZ->dict[2] != SVal_INVALID) SVal__rcdec(lineZ->dict[2]);
1030 if (lineZ->dict[3] != SVal_INVALID) SVal__rcdec(lineZ->dict[3]);
1033 inline
1034 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
1035 Word bix, shft, mask, prep;
1036 tl_assert(ix >= 0);
1037 bix = ix >> 2;
1038 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1039 mask = 3 << shft;
1040 prep = b2 << shft;
1041 arr[bix] = (arr[bix] & ~mask) | prep;
1044 inline
1045 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
1046 Word bix, shft;
1047 tl_assert(ix >= 0);
1048 bix = ix >> 2;
1049 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1050 return (arr[bix] >> shft) & 3;
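/* Worked example (illustrative): for ix == 5, bix == 1 and shft == 2,
   so write_twobit_array stores b2 in bits 3:2 of arr[1] and
   read_twobit_array returns those two bits.  Four 2-bit values are
   packed per byte, lowest ix in the least significant bits. */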
1053 /* We cache one free lineF, to avoid pool allocator calls.
1054 Measurement on firefox has shown that this avoids more than 90%
1055 of the PA calls. */
1056 static LineF *free_lineF = NULL;
1058 /* Allocates a lineF for LineZ. Sets lineZ in a state indicating
1059 lineF has to be used. */
1060 static inline LineF *alloc_LineF_for_Z (LineZ *lineZ)
1062 LineF *lineF;
1064 tl_assert(lineZ->dict[0] == SVal_INVALID);
1066 if (LIKELY(free_lineF)) {
1067 lineF = free_lineF;
1068 free_lineF = NULL;
1069 } else {
1070 lineF = VG_(allocEltPA) ( LineF_pool_allocator );
1072 lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1073 lineZ->dict[1] = Ptr2SVal (lineF);
1075 return lineF;
1078 /* rcdec the LineF of lineZ, frees the lineF, and sets lineZ
1079 back to its initial state SVal_NOACCESS (i.e. ready to be
1080 read or written just after SecMap allocation). */
1081 static inline void clear_LineF_of_Z (LineZ *lineZ)
1083 LineF *lineF = LineF_Ptr(lineZ);
1085 rcdec_LineF(lineF);
1086 if (UNLIKELY(free_lineF)) {
1087 VG_(freeEltPA)( LineF_pool_allocator, lineF );
1088 } else {
1089 free_lineF = lineF;
1091 lineZ->dict[0] = SVal_NOACCESS;
1092 lineZ->dict[1] = SVal_INVALID;
1095 /* Given address 'tag', find either the Z or F line containing relevant
1096 data, so it can be read into the cache.
1098 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
1099 /*OUT*/LineF** fp, Addr tag ) {
1100 LineZ* lineZ;
1101 LineF* lineF;
1102 UWord zix;
1103 SecMap* sm = shmem__find_or_alloc_SecMap(tag);
1104 UWord smoff = shmem__get_SecMap_offset(tag);
1105 /* since smoff is derived from a valid tag, it should be
1106 cacheline-aligned. */
1107 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1108 zix = smoff >> N_LINE_BITS;
1109 tl_assert(zix < N_SECMAP_ZLINES);
1110 lineZ = &sm->linesZ[zix];
1111 lineF = NULL;
1112 if (lineZ->dict[0] == SVal_INVALID) {
1113 lineF = LineF_Ptr (lineZ);
1114 lineZ = NULL;
1116 *zp = lineZ;
1117 *fp = lineF;
1120 /* Given address 'tag', return the relevant SecMap and the index of
1121 the LineZ within it, in the expectation that the line is to be
1122 overwritten. Regardless of whether 'tag' is currently associated
1123 with a Z or F representation, the current representation is
1124 rcdec'd, in recognition of the fact that the contents are
1125 just about to be overwritten. */
1126 static __attribute__((noinline))
1127 void find_Z_for_writing ( /*OUT*/SecMap** smp,
1128 /*OUT*/Word* zixp,
1129 Addr tag ) {
1130 LineZ* lineZ;
1131 UWord zix;
1132 SecMap* sm = shmem__find_or_alloc_SecMap(tag);
1133 UWord smoff = shmem__get_SecMap_offset(tag);
1134 /* since smoff is derived from a valid tag, it should be
1135 cacheline-aligned. */
1136 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1137 zix = smoff >> N_LINE_BITS;
1138 tl_assert(zix < N_SECMAP_ZLINES);
1139 lineZ = &sm->linesZ[zix];
1140 /* re RCs, we are rcdec_LineZ/clear_LineF_of_Z this LineZ so that new data
1141 can be parked in it. Hence have to rcdec it accordingly. */
1142 /* If lineZ has an associated lineF, free it up. */
1143 if (lineZ->dict[0] == SVal_INVALID)
1144 clear_LineF_of_Z(lineZ);
1145 else
1146 rcdec_LineZ(lineZ);
1147 *smp = sm;
1148 *zixp = zix;
1151 /* ------------ CacheLine and implicit-tree related ------------ */
1153 __attribute__((unused))
1154 static void pp_CacheLine ( CacheLine* cl ) {
1155 Word i;
1156 if (!cl) {
1157 VG_(printf)("%s","pp_CacheLine(NULL)\n");
1158 return;
1160 for (i = 0; i < N_LINE_TREES; i++)
1161 VG_(printf)(" descr: %04lx\n", (UWord)cl->descrs[i]);
1162 for (i = 0; i < N_LINE_ARANGE; i++)
1163 VG_(printf)(" sval: %08lx\n", (UWord)cl->svals[i]);
1166 static UChar descr_to_validbits ( UShort descr )
1168 /* a.k.a Party Time for gcc's constant folder */
1169 # define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
1170 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0) \
1171 ( (UShort) ( ( (b8_7) << 14) | ( (b8_6) << 13) | \
1172 ( (b8_5) << 12) | ( (b8_4) << 11) | \
1173 ( (b8_3) << 10) | ( (b8_2) << 9) | \
1174 ( (b8_1) << 8) | ( (b8_0) << 7) | \
1175 ( (b16_3) << 6) | ( (b32_1) << 5) | \
1176 ( (b16_2) << 4) | ( (b64) << 3) | \
1177 ( (b16_1) << 2) | ( (b32_0) << 1) | \
1178 ( (b16_0) << 0) ) )
1180 # define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
1181 ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
1182 ( (bit5) << 5) | ( (bit4) << 4) | \
1183 ( (bit3) << 3) | ( (bit2) << 2) | \
1184 ( (bit1) << 1) | ( (bit0) << 0) ) )
1186 /* these should all get folded out at compile time */
1187 tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
1188 tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
1189 tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
1190 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
1191 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
1192 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
1193 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
1194 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
1195 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
1197 switch (descr) {
1199 +--------------------------------- TREE_DESCR_8_7
1200 | +------------------- TREE_DESCR_8_0
1201 | | +---------------- TREE_DESCR_16_3
1202 | | | +-------------- TREE_DESCR_32_1
1203 | | | | +------------ TREE_DESCR_16_2
1204 | | | | | +--------- TREE_DESCR_64
1205 | | | | | | +------ TREE_DESCR_16_1
1206 | | | | | | | +---- TREE_DESCR_32_0
1207 | | | | | | | | +-- TREE_DESCR_16_0
1208 | | | | | | | | |
1209 | | | | | | | | | GRANULARITY, 7 -> 0 */
1210 case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8 8 8 8 8 */
1211 return BYTE(1,1,1,1,1,1,1,1);
1212 case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16 8 8 8 8 */
1213 return BYTE(1,1,0,1,1,1,1,1);
1214 case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16 8 8 8 8 8 8 */
1215 return BYTE(0,1,1,1,1,1,1,1);
1216 case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16 16 8 8 8 8 */
1217 return BYTE(0,1,0,1,1,1,1,1);
1219 case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8 8 8 16 */
1220 return BYTE(1,1,1,1,1,1,0,1);
1221 case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16 8 8 16 */
1222 return BYTE(1,1,0,1,1,1,0,1);
1223 case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16 8 8 8 8 16 */
1224 return BYTE(0,1,1,1,1,1,0,1);
1225 case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16 16 8 8 16 */
1226 return BYTE(0,1,0,1,1,1,0,1);
1228 case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8 16 8 8 */
1229 return BYTE(1,1,1,1,0,1,1,1);
1230 case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16 16 8 8 */
1231 return BYTE(1,1,0,1,0,1,1,1);
1232 case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16 8 8 16 8 8 */
1233 return BYTE(0,1,1,1,0,1,1,1);
1234 case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16 16 16 8 8 */
1235 return BYTE(0,1,0,1,0,1,1,1);
1237 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8 16 16 */
1238 return BYTE(1,1,1,1,0,1,0,1);
1239 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16 16 16 */
1240 return BYTE(1,1,0,1,0,1,0,1);
1241 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16 8 8 16 16 */
1242 return BYTE(0,1,1,1,0,1,0,1);
1243 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16 16 16 16 */
1244 return BYTE(0,1,0,1,0,1,0,1);
1246 case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32 8 8 8 8 */
1247 return BYTE(0,0,0,1,1,1,1,1);
1248 case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32 8 8 16 */
1249 return BYTE(0,0,0,1,1,1,0,1);
1250 case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32 16 8 8 */
1251 return BYTE(0,0,0,1,0,1,1,1);
1252 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32 16 16 */
1253 return BYTE(0,0,0,1,0,1,0,1);
1255 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8 32 */
1256 return BYTE(1,1,1,1,0,0,0,1);
1257 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16 32 */
1258 return BYTE(1,1,0,1,0,0,0,1);
1259 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16 8 8 32 */
1260 return BYTE(0,1,1,1,0,0,0,1);
1261 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16 16 32 */
1262 return BYTE(0,1,0,1,0,0,0,1);
1264 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
1265 return BYTE(0,0,0,1,0,0,0,1);
1267 case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
1268 return BYTE(0,0,0,0,0,0,0,1);
1270 default: return BYTE(0,0,0,0,0,0,0,0);
1271 /* INVALID - any valid descr produces at least one
1272 valid bit in tree[0..7]*/
1274 /* NOTREACHED*/
1275 tl_assert(0);
1277 # undef DESCR
1278 # undef BYTE
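/* Worked example (illustrative): descr == TREE_DESCR_64 means the whole
   8-byte range is described by the single leaf tree[0], so
   descr_to_validbits returns 0x01, i.e. only tree[0] is valid. */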
1281 __attribute__((unused))
1282 static Bool is_sane_Descr ( UShort descr ) {
1283 return descr_to_validbits(descr) != 0;
1286 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1287 VG_(sprintf)(dst,
1288 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1289 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1290 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1291 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1292 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1293 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1294 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1295 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1296 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1297 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1298 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1299 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1300 (Int)((descr & TREE_DESCR_64) ? 1 : 0),
1301 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1302 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1303 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1306 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1307 VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1308 (Int)((byte & 128) ? 1 : 0),
1309 (Int)((byte & 64) ? 1 : 0),
1310 (Int)((byte & 32) ? 1 : 0),
1311 (Int)((byte & 16) ? 1 : 0),
1312 (Int)((byte & 8) ? 1 : 0),
1313 (Int)((byte & 4) ? 1 : 0),
1314 (Int)((byte & 2) ? 1 : 0),
1315 (Int)((byte & 1) ? 1 : 0)
1319 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1320 Word i;
1321 UChar validbits = descr_to_validbits(descr);
1322 HChar buf[128], buf2[128]; // large enough
1323 if (validbits == 0)
1324 goto bad;
1325 for (i = 0; i < 8; i++) {
1326 if (validbits & (1<<i)) {
1327 if (tree[i] == SVal_INVALID)
1328 goto bad;
1329 } else {
1330 if (tree[i] != SVal_INVALID)
1331 goto bad;
1334 return True;
1335 bad:
1336 sprintf_Descr( buf, descr );
1337 sprintf_Byte( buf2, validbits );
1338 VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1339 VG_(printf)(" validbits 0x%02lx %s\n", (UWord)validbits, buf2);
1340 VG_(printf)(" descr 0x%04lx %s\n", (UWord)descr, buf);
1341 for (i = 0; i < 8; i++)
1342 VG_(printf)(" [%ld] 0x%016llx\n", i, tree[i]);
1343 VG_(printf)("%s","}\n");
1344 return 0;
1347 static Bool is_sane_CacheLine ( CacheLine* cl )
1349 Word tno, cloff;
1351 if (!cl) goto bad;
1353 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1354 UShort descr = cl->descrs[tno];
1355 SVal* tree = &cl->svals[cloff];
1356 if (!is_sane_Descr_and_Tree(descr, tree))
1357 goto bad;
1359 tl_assert(cloff == N_LINE_ARANGE);
1360 return True;
1361 bad:
1362 pp_CacheLine(cl);
1363 return False;
1366 static UShort normalise_tree ( /*MOD*/SVal* tree )
1368 UShort descr;
1369 /* pre: incoming tree[0..7] does not have any invalid shvals, in
1370 particular no zeroes. */
1371 if (CHECK_ZSM
1372 && UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
1373 || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
1374 || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
1375 || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
1376 tl_assert(0);
1378 descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
1379 | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
1380 | TREE_DESCR_8_1 | TREE_DESCR_8_0;
1381 /* build 16-bit layer */
1382 if (tree[1] == tree[0]) {
1383 tree[1] = SVal_INVALID;
1384 descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
1385 descr |= TREE_DESCR_16_0;
1387 if (tree[3] == tree[2]) {
1388 tree[3] = SVal_INVALID;
1389 descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
1390 descr |= TREE_DESCR_16_1;
1392 if (tree[5] == tree[4]) {
1393 tree[5] = SVal_INVALID;
1394 descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
1395 descr |= TREE_DESCR_16_2;
1397 if (tree[7] == tree[6]) {
1398 tree[7] = SVal_INVALID;
1399 descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
1400 descr |= TREE_DESCR_16_3;
1402 /* build 32-bit layer */
1403 if (tree[2] == tree[0]
1404 && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
1405 tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
1406 descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
1407 descr |= TREE_DESCR_32_0;
1409 if (tree[6] == tree[4]
1410 && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
1411 tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
1412 descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
1413 descr |= TREE_DESCR_32_1;
1415 /* build 64-bit layer */
1416 if (tree[4] == tree[0]
1417 && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
1418 tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
1419 descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
1420 descr |= TREE_DESCR_64;
1422 return descr;
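/* Illustrative sketch (not part of the library): normalising a tree
   whose 8 leaves all hold the same valid SVal collapses it to a single
   64-bit node: only tree[0] remains valid, the other leaves are set to
   SVal_INVALID, and the descriptor becomes TREE_DESCR_64. */
__attribute__((unused))
static void example_normalise_identical_leaves ( SVal sv /* != SVal_INVALID */ )
{
   SVal   tree[8];
   Word   i;
   UShort descr;
   for (i = 0; i < 8; i++)
      tree[i] = sv;
   descr = normalise_tree( tree );
   tl_assert(descr == TREE_DESCR_64);
   for (i = 1; i < 8; i++)
      tl_assert(tree[i] == SVal_INVALID);
}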
1425 /* This takes a cacheline where all the data is at the leaves
1426 (w8[..]) and builds a correctly normalised tree. */
1427 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1429 Word tno, cloff;
1430 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1431 SVal* tree = &cl->svals[cloff];
1432 cl->descrs[tno] = normalise_tree( tree );
1434 tl_assert(cloff == N_LINE_ARANGE);
1435 if (CHECK_ZSM)
1436 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1437 stats__cline_normalises++;
1441 typedef struct { UChar count; SVal sval; } CountedSVal;
1443 static
1444 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
1445 /*OUT*/Word* dstUsedP,
1446 Word nDst, CacheLine* src )
1448 Word tno, cloff, dstUsed;
1450 tl_assert(nDst == N_LINE_ARANGE);
1451 dstUsed = 0;
1453 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1454 UShort descr = src->descrs[tno];
1455 SVal* tree = &src->svals[cloff];
1457 /* sequentialise the tree described by (descr,tree). */
1458 # define PUT(_n,_v) \
1459 do { dst[dstUsed ].count = (_n); \
1460 dst[dstUsed++].sval = (_v); \
1461 } while (0)
1463 /* byte 0 */
1464 if (descr & TREE_DESCR_64) PUT(8, tree[0]); else
1465 if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
1466 if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
1467 if (descr & TREE_DESCR_8_0) PUT(1, tree[0]);
1468 /* byte 1 */
1469 if (descr & TREE_DESCR_8_1) PUT(1, tree[1]);
1470 /* byte 2 */
1471 if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
1472 if (descr & TREE_DESCR_8_2) PUT(1, tree[2]);
1473 /* byte 3 */
1474 if (descr & TREE_DESCR_8_3) PUT(1, tree[3]);
1475 /* byte 4 */
1476 if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
1477 if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
1478 if (descr & TREE_DESCR_8_4) PUT(1, tree[4]);
1479 /* byte 5 */
1480 if (descr & TREE_DESCR_8_5) PUT(1, tree[5]);
1481 /* byte 6 */
1482 if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
1483 if (descr & TREE_DESCR_8_6) PUT(1, tree[6]);
1484 /* byte 7 */
1485 if (descr & TREE_DESCR_8_7) PUT(1, tree[7]);
1487 # undef PUT
1488 /* END sequentialise the tree described by (descr,tree). */
1491 tl_assert(cloff == N_LINE_ARANGE);
1492 tl_assert(dstUsed <= nDst);
1494 *dstUsedP = dstUsed;
1497 /* Write the cacheline 'wix' to backing store. Where it ends up
1498 is determined by its tag field. */
1499 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1501 Word i, j, k, m;
1502 Addr tag;
1503 SecMap* sm;
1504 CacheLine* cl;
1505 LineZ* lineZ;
1506 LineF* lineF;
1507 Word zix, fix, csvalsUsed;
1508 CountedSVal csvals[N_LINE_ARANGE];
1509 SVal sv;
1511 if (0)
1512 VG_(printf)("scache wback line %d\n", (Int)wix);
1514 tl_assert(wix >= 0 && wix < N_WAY_NENT);
1516 tag = cache_shmem.tags0[wix];
1517 cl = &cache_shmem.lyns0[wix];
1519 /* The cache line may have been invalidated; if so, ignore it. */
1520 if (!is_valid_scache_tag(tag))
1521 return;
1523 /* Where are we going to put it? */
1524 sm = NULL;
1525 lineZ = NULL;
1526 lineF = NULL;
1527 zix = fix = -1;
1529 /* find the Z line to write in and rcdec it or the associated F
1530 line. */
1531 find_Z_for_writing( &sm, &zix, tag );
1533 tl_assert(sm);
1534 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1535 lineZ = &sm->linesZ[zix];
1537 /* Generate the data to be stored */
1538 if (CHECK_ZSM)
1539 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1541 csvalsUsed = -1;
1542 sequentialise_CacheLine( csvals, &csvalsUsed,
1543 N_LINE_ARANGE, cl );
1544 tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1545 if (0) VG_(printf)("%ld ", csvalsUsed);
1547 lineZ->dict[0] = lineZ->dict[1]
1548 = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1550 /* i indexes actual shadow values, k is cursor in csvals */
1551 i = 0;
1552 for (k = 0; k < csvalsUsed; k++) {
1554 sv = csvals[k].sval;
1555 if (CHECK_ZSM)
1556 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1557 /* do we already have it? */
1558 if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1559 if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1560 if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1561 if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1562 /* no. look for a free slot. */
1563 if (CHECK_ZSM)
1564 tl_assert(sv != SVal_INVALID);
1565 if (lineZ->dict[0]
1566 == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1567 if (lineZ->dict[1]
1568 == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1569 if (lineZ->dict[2]
1570 == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1571 if (lineZ->dict[3]
1572 == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1573 break; /* we'll have to use the f rep */
1574 dict_ok:
1575 m = csvals[k].count;
1576 if (m == 8) {
1577 write_twobit_array( lineZ->ix2s, i+0, j );
1578 write_twobit_array( lineZ->ix2s, i+1, j );
1579 write_twobit_array( lineZ->ix2s, i+2, j );
1580 write_twobit_array( lineZ->ix2s, i+3, j );
1581 write_twobit_array( lineZ->ix2s, i+4, j );
1582 write_twobit_array( lineZ->ix2s, i+5, j );
1583 write_twobit_array( lineZ->ix2s, i+6, j );
1584 write_twobit_array( lineZ->ix2s, i+7, j );
1585 i += 8;
1587 else if (m == 4) {
1588 write_twobit_array( lineZ->ix2s, i+0, j );
1589 write_twobit_array( lineZ->ix2s, i+1, j );
1590 write_twobit_array( lineZ->ix2s, i+2, j );
1591 write_twobit_array( lineZ->ix2s, i+3, j );
1592 i += 4;
1594 else if (m == 1) {
1595 write_twobit_array( lineZ->ix2s, i+0, j );
1596 i += 1;
1598 else if (m == 2) {
1599 write_twobit_array( lineZ->ix2s, i+0, j );
1600 write_twobit_array( lineZ->ix2s, i+1, j );
1601 i += 2;
1603 else {
1604 tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1609 if (LIKELY(i == N_LINE_ARANGE)) {
1610 /* Construction of the compressed representation was
1611 successful. */
1612 rcinc_LineZ(lineZ);
1613 stats__cache_Z_wbacks++;
1614 } else {
1615 /* Cannot use the compressed(z) representation. Use the full(f)
1616 rep instead. */
1617 tl_assert(i >= 0 && i < N_LINE_ARANGE);
1618 lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1619 lineF = alloc_LineF_for_Z (lineZ);
1620 i = 0;
1621 for (k = 0; k < csvalsUsed; k++) {
1622 if (CHECK_ZSM)
1623 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1624 sv = csvals[k].sval;
1625 if (CHECK_ZSM)
1626 tl_assert(sv != SVal_INVALID);
1627 for (m = csvals[k].count; m > 0; m--) {
1628 lineF->w64s[i] = sv;
1629 i++;
1632 tl_assert(i == N_LINE_ARANGE);
1633 rcinc_LineF(lineF);
1634 stats__cache_F_wbacks++;
1638 /* Fetch the cacheline 'wix' from the backing store. The tag
1639 associated with 'wix' is assumed to have already been filled in;
1640 hence that is used to determine where in the backing store to read
1641 from. */
1642 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1644 Word i;
1645 Addr tag;
1646 CacheLine* cl;
1647 LineZ* lineZ;
1648 LineF* lineF;
1650 if (0)
1651 VG_(printf)("scache fetch line %d\n", (Int)wix);
1653 tl_assert(wix >= 0 && wix < N_WAY_NENT);
1655 tag = cache_shmem.tags0[wix];
1656 cl = &cache_shmem.lyns0[wix];
1658 /* reject nonsense requests */
1659 tl_assert(is_valid_scache_tag(tag));
1661 lineZ = NULL;
1662 lineF = NULL;
1663 find_ZF_for_reading( &lineZ, &lineF, tag );
1664 tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1666 /* expand the data into the bottom layer of the tree, then get
1667 normalise_CacheLine to build the descriptor array. */
1668 if (lineF) {
1669 for (i = 0; i < N_LINE_ARANGE; i++) {
1670 cl->svals[i] = lineF->w64s[i];
1672 stats__cache_F_fetches++;
1673 } else {
1674 for (i = 0; i < N_LINE_ARANGE; i++) {
1675 UWord ix = read_twobit_array( lineZ->ix2s, i );
1676 if (CHECK_ZSM) tl_assert(ix >= 0 && ix <= 3);
1677 cl->svals[i] = lineZ->dict[ix];
1678 if (CHECK_ZSM) tl_assert(cl->svals[i] != SVal_INVALID);
1680 stats__cache_Z_fetches++;
1682 normalise_CacheLine( cl );
1685 /* Invalidate the cachelines corresponding to the given range, which
1686 must start and end on a cacheline boundary. */
1687 static void shmem__invalidate_scache_range (Addr ga, SizeT szB)
1689 Word wix;
1691 /* ga must be on a cacheline boundary. */
1692 tl_assert (is_valid_scache_tag (ga));
1693 /* szB must be a multiple of cacheline size. */
1694 tl_assert (0 == (szB & (N_LINE_ARANGE - 1)));
1697 Word ga_ix = (ga >> N_LINE_BITS) & (N_WAY_NENT - 1);
1698 Word nwix = szB / N_LINE_ARANGE;
1700 if (nwix > N_WAY_NENT)
1701 nwix = N_WAY_NENT; // no need to check the same entry several times.
1703 for (wix = 0; wix < nwix; wix++) {
1704 if (address_in_range(cache_shmem.tags0[ga_ix], ga, szB))
1705 cache_shmem.tags0[ga_ix] = 1/*INVALID*/;
1706 ga_ix++;
1707 if (UNLIKELY(ga_ix == N_WAY_NENT))
1708 ga_ix = 0;
1713 static void shmem__flush_and_invalidate_scache ( void ) {
1714 Word wix;
1715 Addr tag;
1716 if (0) VG_(printf)("%s","scache flush and invalidate\n");
1717 tl_assert(!is_valid_scache_tag(1));
1718 for (wix = 0; wix < N_WAY_NENT; wix++) {
1719 tag = cache_shmem.tags0[wix];
1720 if (tag == 1/*INVALID*/) {
1721 /* already invalid; nothing to do */
1722 } else {
1723 tl_assert(is_valid_scache_tag(tag));
1724 cacheline_wback( wix );
1726 cache_shmem.tags0[wix] = 1/*INVALID*/;
1728 stats__cache_flushes_invals++;
1732 static inline Bool aligned16 ( Addr a ) {
1733 return 0 == (a & 1);
1735 static inline Bool aligned32 ( Addr a ) {
1736 return 0 == (a & 3);
1738 static inline Bool aligned64 ( Addr a ) {
1739 return 0 == (a & 7);
1741 static inline UWord get_cacheline_offset ( Addr a ) {
1742 return (UWord)(a & (N_LINE_ARANGE - 1));
1744 static inline Addr cacheline_ROUNDUP ( Addr a ) {
1745 return ROUNDUP(a, N_LINE_ARANGE);
1747 static inline Addr cacheline_ROUNDDN ( Addr a ) {
1748 return ROUNDDN(a, N_LINE_ARANGE);
1750 static inline UWord get_treeno ( Addr a ) {
1751 return get_cacheline_offset(a) >> 3;
1753 static inline UWord get_tree_offset ( Addr a ) {
1754 return a & 7;
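   /* Worked example for the helpers above, assuming a cache line covers
      64 bytes (N_LINE_ARANGE == 64, hence N_LINE_BITS == 6): for an
      address 'a' whose low 6 bits are 0b101010 (42), the in-line offset
      is 42, the tree number is 42 >> 3 == 5 (the sixth 8-byte tree in
      the line) and the tree offset is 42 & 7 == 2 (the third byte within
      that tree).  aligned16/32/64 test the low 1, 2 or 3 address bits
      respectively, so this 'a' satisfies aligned16 but neither aligned32
      nor aligned64. */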
1757 static __attribute__((noinline))
1758 CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1759 static inline CacheLine* get_cacheline ( Addr a )
1761 /* tag is 'a' with the in-line offset masked out,
1762 eg a[31]..a[4] 0000 */
1763 Addr tag = a & ~(N_LINE_ARANGE - 1);
1764 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1765 stats__cache_totrefs++;
1766 if (LIKELY(tag == cache_shmem.tags0[wix])) {
1767 return &cache_shmem.lyns0[wix];
1768 } else {
1769 return get_cacheline_MISS( a );
1773 static __attribute__((noinline))
1774 CacheLine* get_cacheline_MISS ( Addr a )
1776 /* tag is 'a' with the in-line offset masked out,
1777 eg a[31]..a[4] 0000 */
1779 CacheLine* cl;
1780 Addr* tag_old_p;
1781 Addr tag = a & ~(N_LINE_ARANGE - 1);
1782 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1784 tl_assert(tag != cache_shmem.tags0[wix]);
1786 /* Dump the old line into the backing store. */
1787 stats__cache_totmisses++;
1789 cl = &cache_shmem.lyns0[wix];
1790 tag_old_p = &cache_shmem.tags0[wix];
1792 if (is_valid_scache_tag( *tag_old_p )) {
1793 /* EXPENSIVE and REDUNDANT: callee does it */
1794 if (CHECK_ZSM)
1795 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1796 cacheline_wback( wix );
1798 /* and reload the new one */
1799 *tag_old_p = tag;
1800 cacheline_fetch( wix );
1801 if (CHECK_ZSM)
1802 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1803 return cl;
1806 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1807 stats__cline_64to32pulldown++;
1808 switch (toff) {
1809 case 0: case 4:
1810 tl_assert(descr & TREE_DESCR_64);
1811 tree[4] = tree[0];
1812 descr &= ~TREE_DESCR_64;
1813 descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1814 break;
1815 default:
1816 tl_assert(0);
1818 return descr;
1821 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1822 stats__cline_32to16pulldown++;
1823 switch (toff) {
1824 case 0: case 2:
1825 if (!(descr & TREE_DESCR_32_0)) {
1826 descr = pulldown_to_32(tree, 0, descr);
1828 tl_assert(descr & TREE_DESCR_32_0);
1829 tree[2] = tree[0];
1830 descr &= ~TREE_DESCR_32_0;
1831 descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1832 break;
1833 case 4: case 6:
1834 if (!(descr & TREE_DESCR_32_1)) {
1835 descr = pulldown_to_32(tree, 4, descr);
1837 tl_assert(descr & TREE_DESCR_32_1);
1838 tree[6] = tree[4];
1839 descr &= ~TREE_DESCR_32_1;
1840 descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1841 break;
1842 default:
1843 tl_assert(0);
1845 return descr;
1848 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1849 stats__cline_16to8pulldown++;
1850 switch (toff) {
1851 case 0: case 1:
1852 if (!(descr & TREE_DESCR_16_0)) {
1853 descr = pulldown_to_16(tree, 0, descr);
1855 tl_assert(descr & TREE_DESCR_16_0);
1856 tree[1] = tree[0];
1857 descr &= ~TREE_DESCR_16_0;
1858 descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1859 break;
1860 case 2: case 3:
1861 if (!(descr & TREE_DESCR_16_1)) {
1862 descr = pulldown_to_16(tree, 2, descr);
1864 tl_assert(descr & TREE_DESCR_16_1);
1865 tree[3] = tree[2];
1866 descr &= ~TREE_DESCR_16_1;
1867 descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1868 break;
1869 case 4: case 5:
1870 if (!(descr & TREE_DESCR_16_2)) {
1871 descr = pulldown_to_16(tree, 4, descr);
1873 tl_assert(descr & TREE_DESCR_16_2);
1874 tree[5] = tree[4];
1875 descr &= ~TREE_DESCR_16_2;
1876 descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1877 break;
1878 case 6: case 7:
1879 if (!(descr & TREE_DESCR_16_3)) {
1880 descr = pulldown_to_16(tree, 6, descr);
1882 tl_assert(descr & TREE_DESCR_16_3);
1883 tree[7] = tree[6];
1884 descr &= ~TREE_DESCR_16_3;
1885 descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1886 break;
1887 default:
1888 tl_assert(0);
1890 return descr;
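   /* Example pulldown trace: starting from a tree whose descriptor is
      just TREE_DESCR_64 (one valid 64-bit value in tree[0]), a byte
      access at toff 3 runs pulldown_to_8(,3,) -> pulldown_to_16(,2,)
      -> pulldown_to_32(,0,).  tree[0] gets copied to tree[4], then to
      tree[2], then to tree[3], and the descriptor ends up as
      TREE_DESCR_32_1 | TREE_DESCR_16_0 | TREE_DESCR_8_3 | TREE_DESCR_8_2:
      the 64-bit leaf has been split just far enough to expose byte 3. */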
1894 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1895 UShort mask;
1896 switch (toff) {
1897 case 0:
1898 mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1899 tl_assert( (descr & mask) == mask );
1900 descr &= ~mask;
1901 descr |= TREE_DESCR_16_0;
1902 break;
1903 case 2:
1904 mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1905 tl_assert( (descr & mask) == mask );
1906 descr &= ~mask;
1907 descr |= TREE_DESCR_16_1;
1908 break;
1909 case 4:
1910 mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1911 tl_assert( (descr & mask) == mask );
1912 descr &= ~mask;
1913 descr |= TREE_DESCR_16_2;
1914 break;
1915 case 6:
1916 mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1917 tl_assert( (descr & mask) == mask );
1918 descr &= ~mask;
1919 descr |= TREE_DESCR_16_3;
1920 break;
1921 default:
1922 tl_assert(0);
1924 return descr;
1927 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1928 UShort mask;
1929 switch (toff) {
1930 case 0:
1931 if (!(descr & TREE_DESCR_16_0))
1932 descr = pullup_descr_to_16(descr, 0);
1933 if (!(descr & TREE_DESCR_16_1))
1934 descr = pullup_descr_to_16(descr, 2);
1935 mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1936 tl_assert( (descr & mask) == mask );
1937 descr &= ~mask;
1938 descr |= TREE_DESCR_32_0;
1939 break;
1940 case 4:
1941 if (!(descr & TREE_DESCR_16_2))
1942 descr = pullup_descr_to_16(descr, 4);
1943 if (!(descr & TREE_DESCR_16_3))
1944 descr = pullup_descr_to_16(descr, 6);
1945 mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1946 tl_assert( (descr & mask) == mask );
1947 descr &= ~mask;
1948 descr |= TREE_DESCR_32_1;
1949 break;
1950 default:
1951 tl_assert(0);
1953 return descr;
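   /* Example pullup trace: with the four low byte leaves valid (descr
      contains TREE_DESCR_8_3 .. TREE_DESCR_8_0 and neither of
      TREE_DESCR_16_1/16_0), pullup_descr_to_32(descr, 0) first folds
      8_1|8_0 into TREE_DESCR_16_0 and 8_3|8_2 into TREE_DESCR_16_1, then
      folds those two into TREE_DESCR_32_0.  Only the descriptor changes;
      the pullup direction never touches the tree values themselves. */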
1956 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1957 switch (toff) {
1958 case 0: case 4:
1959 return 0 != (descr & TREE_DESCR_64);
1960 default:
1961 tl_assert(0);
1965 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1966 switch (toff) {
1967 case 0:
1968 return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1969 case 2:
1970 return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1971 case 4:
1972 return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1973 case 6:
1974 return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1975 default:
1976 tl_assert(0);
1980 /* ------------ Cache management ------------ */
1982 static void zsm_flush_cache ( void )
1984 shmem__flush_and_invalidate_scache();
1988 static void zsm_init ( void )
1990 tl_assert( sizeof(UWord) == sizeof(Addr) );
1992 tl_assert(map_shmem == NULL);
1993 map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1994 HG_(free),
1995 NULL/*unboxed UWord cmp*/);
1996 /* Invalidate all cache entries. */
1997 tl_assert(!is_valid_scache_tag(1));
1998 for (UWord wix = 0; wix < N_WAY_NENT; wix++) {
1999 cache_shmem.tags0[wix] = 1/*INVALID*/;
2002 LineF_pool_allocator = VG_(newPA) (
2003 sizeof(LineF),
2004 /* Nr elements/pool to fill a core arena block
2005 taking some arena overhead into account. */
2006 (4 * 1024 * 1024 - 200)/sizeof(LineF),
2007 HG_(zalloc),
2008 "libhb.LineF_storage.pool",
2009 HG_(free)
2012 /* a SecMap must contain an integral number of CacheLines */
2013 tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
2014 /* also ... a CacheLine holds an integral number of trees */
2015 tl_assert(0 == (N_LINE_ARANGE % 8));
2018 /////////////////////////////////////////////////////////////////
2019 /////////////////////////////////////////////////////////////////
2020 // //
2021 // SECTION END compressed shadow memory //
2022 // //
2023 /////////////////////////////////////////////////////////////////
2024 /////////////////////////////////////////////////////////////////
2028 /////////////////////////////////////////////////////////////////
2029 /////////////////////////////////////////////////////////////////
2030 // //
2031 // SECTION BEGIN vts primitives //
2032 // //
2033 /////////////////////////////////////////////////////////////////
2034 /////////////////////////////////////////////////////////////////
2037 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
2038 being compact stand-ins for Thr*'s. Use these functions to map
2039 between them. */
2040 static ThrID Thr__to_ThrID ( Thr* thr ); /* fwds */
2041 static Thr* Thr__from_ThrID ( ThrID thrid ); /* fwds */
2043 __attribute__((noreturn))
2044 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
2046 if (due_to_nThrs) {
2047 const HChar* s =
2048 "\n"
2049 "Helgrind: cannot continue, run aborted: too many threads.\n"
2050 "Sorry. Helgrind can only handle programs that create\n"
2051 "%'llu or fewer threads over their entire lifetime.\n"
2052 "\n";
2053 VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
2054 } else {
2055 const HChar* s =
2056 "\n"
2057 "Helgrind: cannot continue, run aborted: too many\n"
2058 "synchronisation events. Sorry. Helgrind can only handle\n"
2059 "programs which perform %'llu or fewer\n"
2060 "inter-thread synchronisation events (locks, unlocks, etc).\n"
2061 "\n";
2062 VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
2064 VG_(exit)(1);
2065 /*NOTREACHED*/
2066 tl_assert(0); /*wtf?!*/
2070 /* The dead thread (ThrID, actually) tables. A thread may only be
2071 listed here if we have been notified thereof by libhb_async_exit.
2072 New entries are added at the end. The order isn't important, but
2073 the ThrID values must be unique.
2074 verydead_thread_table_not_pruned lists the identity of the threads
2075 that died since the previous round of pruning.
2076 Once pruning is done, these ThrID are added in verydead_thread_table.
2077 We don't actually need to keep the set of threads that have ever died --
2078 only the threads that have died since the previous round of
2079 pruning. But it's useful for sanity check purposes to keep the
2080 entire set, so we do. */
2081 static XArray* /* of ThrID */ verydead_thread_table_not_pruned = NULL;
2082 static XArray* /* of ThrID */ verydead_thread_table = NULL;
2084 /* Arbitrary total ordering on ThrIDs. */
2085 static Int cmp__ThrID ( const void* v1, const void* v2 ) {
2086 ThrID id1 = *(const ThrID*)v1;
2087 ThrID id2 = *(const ThrID*)v2;
2088 if (id1 < id2) return -1;
2089 if (id1 > id2) return 1;
2090 return 0;
2093 static void verydead_thread_tables_init ( void )
2095 tl_assert(!verydead_thread_table);
2096 tl_assert(!verydead_thread_table_not_pruned);
2097 verydead_thread_table
2098 = VG_(newXA)( HG_(zalloc),
2099 "libhb.verydead_thread_table_init.1",
2100 HG_(free), sizeof(ThrID) );
2101 VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
2102 verydead_thread_table_not_pruned
2103 = VG_(newXA)( HG_(zalloc),
2104 "libhb.verydead_thread_table_init.2",
2105 HG_(free), sizeof(ThrID) );
2106 VG_(setCmpFnXA)(verydead_thread_table_not_pruned, cmp__ThrID);
2109 static void verydead_thread_table_sort_and_check (XArray* thrids)
2111 UWord i;
2113 VG_(sortXA)( thrids );
2114 /* Sanity check: check for unique ThrID values. */
2115 UWord nBT = VG_(sizeXA)( thrids );
2116 if (nBT > 0) {
2117 ThrID thrid1, thrid2;
2118 thrid2 = *(ThrID*)VG_(indexXA)( thrids, 0 );
2119 for (i = 1; i < nBT; i++) {
2120 thrid1 = thrid2;
2121 thrid2 = *(ThrID*)VG_(indexXA)( thrids, i );
2122 tl_assert(thrid1 < thrid2);
2125 /* Ok, so the dead thread table thrids has unique and in-order keys. */
2128 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
2129 a backlink for the caller's convenience. Since we have no idea
2130 what to set that to in the library, it always gets set to
2131 VtsID_INVALID. */
2132 typedef
2133 struct {
2134 VtsID id;
2135 UInt usedTS;
2136 UInt sizeTS;
2137 ScalarTS ts[0];
2139 VTS;
2141 /* Allocate a VTS capable of storing 'sizeTS' entries. */
2142 static VTS* VTS__new ( const HChar* who, UInt sizeTS );
2144 /* Make a clone of 'vts', sizing the new array to exactly match the
2145 number of ScalarTSs present. */
2146 static VTS* VTS__clone ( const HChar* who, VTS* vts );
2148 /* Make a clone of 'vts' with the thrids in 'thridsToDel' removed. The new
2149 array is sized exactly to hold the number of required elements.
2150 'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
2151 must be in strictly increasing order. */
2152 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );
2154 /* Delete this VTS in its entirety. */
2155 static void VTS__delete ( VTS* vts );
2157 /* Create a new singleton VTS in 'out'. Caller must have
2158 pre-allocated 'out' sufficiently big to hold the result in all
2159 possible cases. */
2160 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
2162 /* Create in 'out' a VTS which is the same as 'vts' except with
2163 vts[me]++, so to speak. Caller must have pre-allocated 'out'
2164 sufficiently big to hold the result in all possible cases. */
2165 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
2167 /* Create in 'out' a VTS which is the join (max) of 'a' and
2168 'b'. Caller must have pre-allocated 'out' sufficiently big to hold
2169 the result in all possible cases. */
2170 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
2172 /* Compute the partial ordering relation of the two args. Although we
2173 could be completely general and return an enumeration value (EQ,
2174 LT, GT, UN), in fact we only need LEQ, and so we may as well
2175 hardwire that fact.
2177 Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
2178 invalid ThrID). In the latter case, the returned ThrID indicates
2179 a point at which the LEQ relation fails to hold. There may be more
2180 than one such point, but we only care about seeing one of them, not
2181 all of them. This rather strange convention is used because
2182 sometimes we want to know the actual index at which they first
2183 differ. */
2184 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
2186 /* Compute an arbitrary structural (total) ordering on the two args,
2187 based on their VCs, so they can be looked up in a table, tree, etc.
2188 Returns -1, 0 or 1. */
2189 static Word VTS__cmp_structural ( VTS* a, VTS* b );
2191 /* Debugging only. Display the given VTS. */
2192 static void VTS__show ( const VTS* vts );
2194 /* Debugging only. Return vts[index], so to speak. */
2195 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
2197 /* Notify the VTS machinery that a thread has been declared
2198 comprehensively dead: that is, it has done an async exit AND it has
2199 been joined with. This should ensure that its local clocks (.viR
2200 and .viW) will never again change, and so all mentions of this
2201 thread from all VTSs in the system may be removed. */
2202 static void VTS__declare_thread_very_dead ( Thr* idx );
2204 /*--------------- to do with Vector Timestamps ---------------*/
2206 static Bool is_sane_VTS ( VTS* vts )
2208 UWord i, n;
2209 ScalarTS *st1, *st2;
2210 if (!vts) return False;
2211 if (vts->usedTS > vts->sizeTS) return False;
2212 n = vts->usedTS;
2213 if (n == 1) {
2214 st1 = &vts->ts[0];
2215 if (st1->tym == 0)
2216 return False;
2218 else
2219 if (n >= 2) {
2220 for (i = 0; i < n-1; i++) {
2221 st1 = &vts->ts[i];
2222 st2 = &vts->ts[i+1];
2223 if (st1->thrid >= st2->thrid)
2224 return False;
2225 if (st1->tym == 0 || st2->tym == 0)
2226 return False;
2229 return True;
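   /* For example, a VTS whose ts[] is { (thrid 1028, tym 3),
      (thrid 1031, tym 1) } is sane: the thrids strictly increase and no
      tym is zero.  Swapping the two entries, or letting any tym be 0,
      makes it insane.  (ThrIDs apparently start at 1024; see the
      assertion in VTS__cmpLEQ below.) */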
2233 /* Create a new, empty VTS.
2235 static VTS* VTS__new ( const HChar* who, UInt sizeTS )
2237 VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
2238 tl_assert(vts->usedTS == 0);
2239 vts->sizeTS = sizeTS;
2240 *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
2241 return vts;
2244 /* Clone this VTS.
2246 static VTS* VTS__clone ( const HChar* who, VTS* vts )
2248 tl_assert(vts);
2249 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2250 UInt nTS = vts->usedTS;
2251 VTS* clone = VTS__new(who, nTS);
2252 clone->id = vts->id;
2253 clone->sizeTS = nTS;
2254 clone->usedTS = nTS;
2255 UInt i;
2256 for (i = 0; i < nTS; i++) {
2257 clone->ts[i] = vts->ts[i];
2259 tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2260 return clone;
2264 /* Make a clone of a VTS with specified ThrIDs removed. 'thridsToDel'
2265 must be in strictly increasing order. We could obviously do this
2266 much more efficiently (in linear time) if necessary.
2268 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
2270 UInt i, j;
2271 tl_assert(vts);
2272 tl_assert(thridsToDel);
2273 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2274 UInt nTS = vts->usedTS;
2275 /* Figure out how many ScalarTSs will remain in the output. */
2276 UInt nReq = nTS;
2277 for (i = 0; i < nTS; i++) {
2278 ThrID thrid = vts->ts[i].thrid;
2279 if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2280 nReq--;
2282 tl_assert(nReq <= nTS);
2283 /* Copy the ones that will remain. */
2284 VTS* res = VTS__new(who, nReq);
2285 j = 0;
2286 for (i = 0; i < nTS; i++) {
2287 ThrID thrid = vts->ts[i].thrid;
2288 if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2289 continue;
2290 res->ts[j++] = vts->ts[i];
2292 tl_assert(j == nReq);
2293 tl_assert(j == res->sizeTS);
2294 res->usedTS = j;
2295 tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2296 return res;
2300 /* Delete this VTS in its entirety.
2302 static void VTS__delete ( VTS* vts )
2304 tl_assert(vts);
2305 tl_assert(vts->usedTS <= vts->sizeTS);
2306 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2307 HG_(free)(vts);
2311 /* Create a new singleton VTS.
2313 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2315 tl_assert(thr);
2316 tl_assert(tym >= 1);
2317 tl_assert(out);
2318 tl_assert(out->usedTS == 0);
2319 tl_assert(out->sizeTS >= 1);
2320 UInt hi = out->usedTS++;
2321 out->ts[hi].thrid = Thr__to_ThrID(thr);
2322 out->ts[hi].tym = tym;
2326 /* Return a new VTS in which vts[me]++, so to speak. 'vts' itself is
2327 not modified.
2329 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2331 UInt i, n;
2332 ThrID me_thrid;
2333 Bool found = False;
2335 stats__vts__tick++;
2337 tl_assert(out);
2338 tl_assert(out->usedTS == 0);
2339 if (vts->usedTS >= ThrID_MAX_VALID)
2340 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2341 tl_assert(out->sizeTS >= 1 + vts->usedTS);
2343 tl_assert(me);
2344 me_thrid = Thr__to_ThrID(me);
2345 tl_assert(is_sane_VTS(vts));
2346 n = vts->usedTS;
2348 /* Copy all entries which precede 'me'. */
2349 for (i = 0; i < n; i++) {
2350 ScalarTS* here = &vts->ts[i];
2351 if (UNLIKELY(here->thrid >= me_thrid))
2352 break;
2353 UInt hi = out->usedTS++;
2354 out->ts[hi] = *here;
2357 /* 'i' now indicates the next entry to copy, if any.
2358 There are 3 possibilities:
2359 (a) there is no next entry (we used them all up already):
2360 add (me_thrid,1) to the output, and quit
2361 (b) there is a next entry, and its thrid > me_thrid:
2362 add (me_thrid,1) to the output, then copy the remaining entries
2363 (c) there is a next entry, and its thrid == me_thrid:
2364 copy it to the output but increment its timestamp value.
2365 Then copy the remaining entries. (c) is the common case.
2367 tl_assert(i >= 0 && i <= n);
2368 if (i == n) { /* case (a) */
2369 UInt hi = out->usedTS++;
2370 out->ts[hi].thrid = me_thrid;
2371 out->ts[hi].tym = 1;
2372 } else {
2373 /* cases (b) and (c) */
2374 ScalarTS* here = &vts->ts[i];
2375 if (me_thrid == here->thrid) { /* case (c) */
2376 if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2377 /* We're hosed. We have to stop. */
2378 scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2380 UInt hi = out->usedTS++;
2381 out->ts[hi].thrid = here->thrid;
2382 out->ts[hi].tym = here->tym + 1;
2383 i++;
2384 found = True;
2385 } else { /* case (b) */
2386 UInt hi = out->usedTS++;
2387 out->ts[hi].thrid = me_thrid;
2388 out->ts[hi].tym = 1;
2390 /* And copy any remaining entries. */
2391 for (/*keepgoing*/; i < n; i++) {
2392 ScalarTS* here2 = &vts->ts[i];
2393 UInt hi = out->usedTS++;
2394 out->ts[hi] = *here2;
2398 tl_assert(is_sane_VTS(out));
2399 tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2400 tl_assert(out->usedTS <= out->sizeTS);
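   /* For example, with ThrID(T1) < ThrID(T2) < ThrID(T3): ticking
      [T1:3, T2:5] at T2 yields [T1:3, T2:6] (case (c), same size), while
      ticking it at T3, which is absent, yields [T1:3, T2:5, T3:1]
      (case (a), one entry bigger) -- exactly what the usedTS assertion
      just above demands. */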
2404 /* Return a new VTS constructed as the join (max) of the 2 args.
2405 Neither arg is modified.
2407 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2409 UInt ia, ib, useda, usedb;
2410 ULong tyma, tymb, tymMax;
2411 ThrID thrid;
2412 UInt ncommon = 0;
2414 stats__vts__join++;
2416 tl_assert(a);
2417 tl_assert(b);
2418 useda = a->usedTS;
2419 usedb = b->usedTS;
2421 tl_assert(out);
2422 tl_assert(out->usedTS == 0);
2423 /* overly conservative test, but doing better involves comparing
2424 the two VTSs, which we don't want to do at this point. */
2425 if (useda + usedb >= ThrID_MAX_VALID)
2426 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2427 tl_assert(out->sizeTS >= useda + usedb);
2429 ia = ib = 0;
2431 while (1) {
2433 /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2434 from a and b in order, where thrid is the next ThrID
2435 occurring in either a or b, and tyma/b are the relevant
2436 scalar timestamps, taking into account implicit zeroes. */
2437 tl_assert(ia >= 0 && ia <= useda);
2438 tl_assert(ib >= 0 && ib <= usedb);
2440 if (ia == useda && ib == usedb) {
2441 /* both empty - done */
2442 break;
2444 } else if (ia == useda && ib != usedb) {
2445 /* a empty, use up b */
2446 ScalarTS* tmpb = &b->ts[ib];
2447 thrid = tmpb->thrid;
2448 tyma = 0;
2449 tymb = tmpb->tym;
2450 ib++;
2452 } else if (ia != useda && ib == usedb) {
2453 /* b empty, use up a */
2454 ScalarTS* tmpa = &a->ts[ia];
2455 thrid = tmpa->thrid;
2456 tyma = tmpa->tym;
2457 tymb = 0;
2458 ia++;
2460 } else {
2461 /* both not empty; extract lowest-ThrID'd triple */
2462 ScalarTS* tmpa = &a->ts[ia];
2463 ScalarTS* tmpb = &b->ts[ib];
2464 if (tmpa->thrid < tmpb->thrid) {
2465 /* a has the lowest unconsidered ThrID */
2466 thrid = tmpa->thrid;
2467 tyma = tmpa->tym;
2468 tymb = 0;
2469 ia++;
2470 } else if (tmpa->thrid > tmpb->thrid) {
2471 /* b has the lowest unconsidered ThrID */
2472 thrid = tmpb->thrid;
2473 tyma = 0;
2474 tymb = tmpb->tym;
2475 ib++;
2476 } else {
2477 /* they both next mention the same ThrID */
2478 tl_assert(tmpa->thrid == tmpb->thrid);
2479 thrid = tmpa->thrid; /* == tmpb->thrid */
2480 tyma = tmpa->tym;
2481 tymb = tmpb->tym;
2482 ia++;
2483 ib++;
2484 ncommon++;
2488 /* having laboriously determined (thr, tyma, tymb), do something
2489 useful with it. */
2490 tymMax = tyma > tymb ? tyma : tymb;
2491 if (tymMax > 0) {
2492 UInt hi = out->usedTS++;
2493 out->ts[hi].thrid = thrid;
2494 out->ts[hi].tym = tymMax;
2499 tl_assert(is_sane_VTS(out));
2500 tl_assert(out->usedTS <= out->sizeTS);
2501 tl_assert(out->usedTS == useda + usedb - ncommon);
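   /* For example, with ThrID(T1) < ThrID(T2) < ThrID(T3): joining
      a = [T1:3, T2:5] with b = [T2:7, T3:1] walks both vectors in thrid
      order, taking the per-thread maximum (absent entries count as 0),
      and produces [T1:3, T2:7, T3:1].  Here ncommon == 1, so usedTS is
      2 + 2 - 1 == 3, matching the assertion above. */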
2505 /* Determine if 'a' <= 'b', in the partial ordering. Returns zero if
2506 they are, or the first ThrID for which they are not (no valid ThrID
2507 has the value zero). This rather strange convention is used
2508 because sometimes we want to know the actual index at which they
2509 first differ. */
2510 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2512 Word ia, ib, useda, usedb;
2513 ULong tyma, tymb;
2515 stats__vts__cmpLEQ++;
2517 tl_assert(a);
2518 tl_assert(b);
2519 useda = a->usedTS;
2520 usedb = b->usedTS;
2522 ia = ib = 0;
2524 while (1) {
2526 /* This logic is to enumerate doubles (tyma, tymb) drawn
2527 from a and b in order, and tyma/b are the relevant
2528 scalar timestamps, taking into account implicit zeroes. */
2529 ThrID thrid;
2531 tl_assert(ia >= 0 && ia <= useda);
2532 tl_assert(ib >= 0 && ib <= usedb);
2534 if (ia == useda && ib == usedb) {
2535 /* both empty - done */
2536 break;
2538 } else if (ia == useda && ib != usedb) {
2539 /* a empty, use up b */
2540 ScalarTS* tmpb = &b->ts[ib];
2541 tyma = 0;
2542 tymb = tmpb->tym;
2543 thrid = tmpb->thrid;
2544 ib++;
2546 } else if (ia != useda && ib == usedb) {
2547 /* b empty, use up a */
2548 ScalarTS* tmpa = &a->ts[ia];
2549 tyma = tmpa->tym;
2550 thrid = tmpa->thrid;
2551 tymb = 0;
2552 ia++;
2554 } else {
2555 /* both not empty; extract lowest-ThrID'd triple */
2556 ScalarTS* tmpa = &a->ts[ia];
2557 ScalarTS* tmpb = &b->ts[ib];
2558 if (tmpa->thrid < tmpb->thrid) {
2559 /* a has the lowest unconsidered ThrID */
2560 tyma = tmpa->tym;
2561 thrid = tmpa->thrid;
2562 tymb = 0;
2563 ia++;
2565 else
2566 if (tmpa->thrid > tmpb->thrid) {
2567 /* b has the lowest unconsidered ThrID */
2568 tyma = 0;
2569 tymb = tmpb->tym;
2570 thrid = tmpb->thrid;
2571 ib++;
2572 } else {
2573 /* they both next mention the same ThrID */
2574 tl_assert(tmpa->thrid == tmpb->thrid);
2575 tyma = tmpa->tym;
2576 thrid = tmpa->thrid;
2577 tymb = tmpb->tym;
2578 ia++;
2579 ib++;
2583 /* having laboriously determined (tyma, tymb), do something
2584 useful with it. */
2585 if (tyma > tymb) {
2586 /* not LEQ at this index. Quit, since the answer is
2587 determined already. */
2588 tl_assert(thrid >= 1024);
2589 return thrid;
2593 return 0; /* all points are LEQ => return an invalid ThrID */
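   /* For example, a = [T1:3] and b = [T1:4, T2:1] is LEQ: 3 <= 4 at T1
      and the implicit 0 <= 1 at T2, so the result is 0.  With the roles
      swapped, a = [T1:4, T2:1] vs b = [T1:3] fails at T1 (4 > 3), so the
      first failing ThrID -- T1's, which is >= 1024 and hence cannot be
      confused with the "is LEQ" result 0 -- is returned. */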
2597 /* Compute an arbitrary structural (total) ordering on the two args,
2598 based on their VCs, so they can be looked up in a table, tree, etc.
2599 Returns -1, 0 or 1. (really just 'deriving Ord' :-) This can be
2600 performance critical so there is some effort expended to make it as
2601 fast as possible.
2603 Word VTS__cmp_structural ( VTS* a, VTS* b )
2605 /* We just need to generate an arbitrary total ordering based on
2606 a->ts and b->ts. Preferably do it in a way which encounters likely
2607 differences relatively quickly. */
2608 Word i;
2609 Word useda = 0, usedb = 0;
2610 ScalarTS *ctsa = NULL, *ctsb = NULL;
2612 stats__vts__cmp_structural++;
2614 tl_assert(a);
2615 tl_assert(b);
2617 ctsa = &a->ts[0]; useda = a->usedTS;
2618 ctsb = &b->ts[0]; usedb = b->usedTS;
2620 if (LIKELY(useda == usedb)) {
2621 ScalarTS *tmpa = NULL, *tmpb = NULL;
2622 stats__vts__cmp_structural_slow++;
2623 /* Same length vectors. Find the first difference, if any, as
2624 fast as possible. */
2625 for (i = 0; i < useda; i++) {
2626 tmpa = &ctsa[i];
2627 tmpb = &ctsb[i];
2628 if (LIKELY(tmpa->tym == tmpb->tym
2629 && tmpa->thrid == tmpb->thrid))
2630 continue;
2631 else
2632 break;
2634 if (UNLIKELY(i == useda)) {
2635 /* They're identical. */
2636 return 0;
2637 } else {
2638 tl_assert(i >= 0 && i < useda);
2639 if (tmpa->tym < tmpb->tym) return -1;
2640 if (tmpa->tym > tmpb->tym) return 1;
2641 if (tmpa->thrid < tmpb->thrid) return -1;
2642 if (tmpa->thrid > tmpb->thrid) return 1;
2643 /* we just established them as non-identical, hence: */
2645 /*NOTREACHED*/
2646 tl_assert(0);
2649 if (useda < usedb) return -1;
2650 if (useda > usedb) return 1;
2651 /*NOTREACHED*/
2652 tl_assert(0);
2656 /* Debugging only. Display the given VTS.
2658 static void VTS__show ( const VTS* vts )
2660 Word i, n;
2661 tl_assert(vts);
2663 VG_(printf)("[");
2664 n = vts->usedTS;
2665 for (i = 0; i < n; i++) {
2666 const ScalarTS *st = &vts->ts[i];
2667 VG_(printf)(i < n-1 ? "%d:%llu " : "%d:%llu", st->thrid, (ULong)st->tym);
2669 VG_(printf)("]");
2673 /* Debugging only. Return vts[index], so to speak.
2675 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2677 UWord i, n;
2678 ThrID idx_thrid = Thr__to_ThrID(idx);
2679 stats__vts__indexat_slow++;
2680 tl_assert(vts);
2681 n = vts->usedTS;
2682 for (i = 0; i < n; i++) {
2683 ScalarTS* st = &vts->ts[i];
2684 if (st->thrid == idx_thrid)
2685 return st->tym;
2687 return 0;
2691 /* See comment on prototype above.
2693 static void VTS__declare_thread_very_dead ( Thr* thr )
2695 if (0) VG_(printf)("VTQ: tae %p\n", thr);
2697 tl_assert(thr->llexit_done);
2698 tl_assert(thr->joinedwith_done);
2700 ThrID nyu;
2701 nyu = Thr__to_ThrID(thr);
2702 VG_(addToXA)( verydead_thread_table_not_pruned, &nyu );
2704 /* We can only get here if we're assured that we'll never again
2705 need to look at this thread's ::viR or ::viW. Set them to
2706 VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2707 mostly so that we don't wind up pruning them (as that would be
2708 nonsensical: the only interesting ScalarTS entry for a dead
2709 thread is its own index, and the pruning will remove that). */
2710 VtsID__rcdec(thr->viR);
2711 VtsID__rcdec(thr->viW);
2712 thr->viR = VtsID_INVALID;
2713 thr->viW = VtsID_INVALID;
2717 /////////////////////////////////////////////////////////////////
2718 /////////////////////////////////////////////////////////////////
2719 // //
2720 // SECTION END vts primitives //
2721 // //
2722 /////////////////////////////////////////////////////////////////
2723 /////////////////////////////////////////////////////////////////
2727 /////////////////////////////////////////////////////////////////
2728 /////////////////////////////////////////////////////////////////
2729 // //
2730 // SECTION BEGIN main library //
2731 // //
2732 /////////////////////////////////////////////////////////////////
2733 /////////////////////////////////////////////////////////////////
2736 /////////////////////////////////////////////////////////
2737 // //
2738 // VTS set //
2739 // //
2740 /////////////////////////////////////////////////////////
2742 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2744 static void vts_set_init ( void )
2746 tl_assert(!vts_set);
2747 vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2748 HG_(free),
2749 (Word(*)(UWord,UWord))VTS__cmp_structural );
2752 /* Given a VTS, look in vts_set to see if we already have a
2753 structurally identical one. If yes, return the pair (True, pointer
2754 to the existing one). If no, clone this one, add the clone to the
2755 set, and return (False, pointer to the clone). */
2756 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2758 UWord keyW, valW;
2759 stats__vts_set__focaa++;
2760 tl_assert(cand->id == VtsID_INVALID);
2761 /* lookup cand (by value) */
2762 if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2763 /* found it */
2764 tl_assert(valW == 0);
2765 /* if this fails, cand (by ref) was already present (!) */
2766 tl_assert(keyW != (UWord)cand);
2767 *res = (VTS*)keyW;
2768 return True;
2769 } else {
2770 /* not present. Clone, add and return address of clone. */
2771 stats__vts_set__focaa_a++;
2772 VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2773 tl_assert(clone != cand);
2774 VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2775 *res = clone;
2776 return False;
2781 /////////////////////////////////////////////////////////
2782 // //
2783 // VTS table //
2784 // //
2785 /////////////////////////////////////////////////////////
2787 static void VtsID__invalidate_caches ( void ); /* fwds */
2789 /* A type to hold VTS table entries. Invariants:
2790 If .vts == NULL, then this entry is not in use, so:
2791 - .rc == 0
2792 - this entry is on the freelist (unfortunately, does not imply
2793 any constraints on value for u.freelink)
2794 If .vts != NULL, then this entry is in use:
2795 - .vts is findable in vts_set
2796 - .vts->id == this entry number
2797 - no specific value for .rc (even 0 is OK)
2798 - this entry is not on freelist, so u.freelink == VtsID_INVALID
2800 typedef
2801 struct {
2802 VTS* vts; /* vts, in vts_set */
2803 UWord rc; /* reference count - enough for entire aspace */
2804 union {
2805 VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2806 VtsID remap; /* used only during pruning, for used entries */
2807 } u;
2808 /* u.freelink only used when vts == NULL,
2809 u.remap only used when vts != NULL, during pruning. */
2811 VtsTE;
2813 /* The VTS table. */
2814 static XArray* /* of VtsTE */ vts_tab = NULL;
2816 /* An index into the VTS table, indicating the start of the list of
2817 free (available for use) entries. If the list is empty, this is
2818 VtsID_INVALID. */
2819 static VtsID vts_tab_freelist = VtsID_INVALID;
2821 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
2822 vts_tab equals or exceeds this size. After GC, the value here is
2823 set appropriately so as to check for the next GC point. */
2824 static Word vts_next_GC_at = 1000;
2826 static void vts_tab_init ( void )
2828 vts_tab = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2829 HG_(free), sizeof(VtsTE) );
2830 vts_tab_freelist = VtsID_INVALID;
2833 /* Add ii to the free list, checking that it looks out-of-use. */
2834 static void add_to_free_list ( VtsID ii )
2836 VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2837 tl_assert(ie->vts == NULL);
2838 tl_assert(ie->rc == 0);
2839 tl_assert(ie->u.freelink == VtsID_INVALID);
2840 ie->u.freelink = vts_tab_freelist;
2841 vts_tab_freelist = ii;
2844 /* Get an entry from the free list. This will return VtsID_INVALID if
2845 the free list is empty. */
2846 static VtsID get_from_free_list ( void )
2848 VtsID ii;
2849 VtsTE* ie;
2850 if (vts_tab_freelist == VtsID_INVALID)
2851 return VtsID_INVALID;
2852 ii = vts_tab_freelist;
2853 ie = VG_(indexXA)( vts_tab, ii );
2854 tl_assert(ie->vts == NULL);
2855 tl_assert(ie->rc == 0);
2856 vts_tab_freelist = ie->u.freelink;
2857 return ii;
2860 /* Produce a new VtsID that can be used, either by getting it from
2861 the freelist, or, if that is empty, by expanding vts_tab. */
2862 static VtsID get_new_VtsID ( void )
2864 VtsID ii;
2865 VtsTE te;
2866 ii = get_from_free_list();
2867 if (ii != VtsID_INVALID)
2868 return ii;
2869 te.vts = NULL;
2870 te.rc = 0;
2871 te.u.freelink = VtsID_INVALID;
2872 ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2873 return ii;
2877 /* Indirect callback from lib_zsm. */
2878 static void VtsID__rcinc ( VtsID ii )
2880 VtsTE* ie;
2881 /* VG_(indexXA) does a range check for us */
2882 ie = VG_(indexXA)( vts_tab, ii );
2883 tl_assert(ie->vts); /* else it's not in use */
2884 tl_assert(ie->rc < ~0UL); /* else we can't continue */
2885 tl_assert(ie->vts->id == ii);
2886 ie->rc++;
2889 /* Indirect callback from lib_zsm. */
2890 static void VtsID__rcdec ( VtsID ii )
2892 VtsTE* ie;
2893 /* VG_(indexXA) does a range check for us */
2894 ie = VG_(indexXA)( vts_tab, ii );
2895 tl_assert(ie->vts); /* else it's not in use */
2896 tl_assert(ie->rc > 0); /* else RC snafu */
2897 tl_assert(ie->vts->id == ii);
2898 ie->rc--;
2902 /* Look up 'cand' in our collection of VTSs. If present, return the
2903 VtsID for the pre-existing version. If not present, clone it, add
2904 the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2905 it, and return that. */
2906 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2908 VTS* in_tab = NULL;
2909 tl_assert(cand->id == VtsID_INVALID);
2910 Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2911 tl_assert(in_tab);
2912 if (already_have) {
2913 /* We already have a copy of 'cand'. Use that. */
2914 VtsTE* ie;
2915 tl_assert(in_tab->id != VtsID_INVALID);
2916 ie = VG_(indexXA)( vts_tab, in_tab->id );
2917 tl_assert(ie->vts == in_tab);
2918 return in_tab->id;
2919 } else {
2920 VtsID ii = get_new_VtsID();
2921 VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2922 ie->vts = in_tab;
2923 ie->rc = 0;
2924 ie->u.freelink = VtsID_INVALID;
2925 in_tab->id = ii;
2926 return ii;
2931 static void show_vts_stats ( const HChar* caller )
2933 UWord nSet, nTab, nLive;
2934 ULong totrc;
2935 UWord n, i;
2936 nSet = VG_(sizeFM)( vts_set );
2937 nTab = VG_(sizeXA)( vts_tab );
2938 totrc = 0;
2939 nLive = 0;
2940 n = VG_(sizeXA)( vts_tab );
2941 for (i = 0; i < n; i++) {
2942 VtsTE* ie = VG_(indexXA)( vts_tab, i );
2943 if (ie->vts) {
2944 nLive++;
2945 totrc += (ULong)ie->rc;
2946 } else {
2947 tl_assert(ie->rc == 0);
2950 VG_(printf)(" show_vts_stats %s\n", caller);
2951 VG_(printf)(" vts_tab size %4lu\n", nTab);
2952 VG_(printf)(" vts_tab live %4lu\n", nLive);
2953 VG_(printf)(" vts_set size %4lu\n", nSet);
2954 VG_(printf)(" total rc %4llu\n", totrc);
2958 /* --- Helpers for VtsID pruning --- */
2960 static
2961 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2962 /*MOD*/XArray* /* of VtsTE */ new_tab,
2963 VtsID* ii )
2965 VtsTE *old_te, *new_te;
2966 VtsID old_id, new_id;
2967 /* We're relying here on VG_(indexXA)'s range checking to assert on
2968 any stupid values, in particular *ii == VtsID_INVALID. */
2969 old_id = *ii;
2970 old_te = VG_(indexXA)( old_tab, old_id );
2971 old_te->rc--;
2972 new_id = old_te->u.remap;
2973 new_te = VG_(indexXA)( new_tab, new_id );
2974 new_te->rc++;
2975 *ii = new_id;
2978 static
2979 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2980 /*MOD*/XArray* /* of VtsTE */ new_tab,
2981 SVal* s )
2983 SVal old_sv, new_sv;
2984 old_sv = *s;
2985 if (SVal__isC(old_sv)) {
2986 VtsID rMin, wMin;
2987 rMin = SVal__unC_Rmin(old_sv);
2988 wMin = SVal__unC_Wmin(old_sv);
2989 remap_VtsID( old_tab, new_tab, &rMin );
2990 remap_VtsID( old_tab, new_tab, &wMin );
2991 new_sv = SVal__mkC( rMin, wMin );
2992 *s = new_sv;
2997 /* NOT TO BE CALLED FROM WITHIN libzsm. */
2998 __attribute__((noinline))
2999 static void vts_tab__do_GC ( Bool show_stats )
3001 UWord i, nTab, nLive, nFreed;
3003 /* ---------- BEGIN VTS GC ---------- */
3004 /* check this is actually necessary. */
3005 tl_assert(vts_tab_freelist == VtsID_INVALID);
3007 /* empty the caches for partial order checks and binary joins. We
3008 could do better and prune out the entries to be deleted, but it
3009 ain't worth the hassle. */
3010 VtsID__invalidate_caches();
3012 /* First, make the reference counts up to date. */
3013 zsm_flush_cache();
3015 nTab = VG_(sizeXA)( vts_tab );
3017 if (show_stats) {
3018 VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
3019 show_vts_stats("before GC");
3022 /* Now we can inspect the entire vts_tab. Any entries with zero
3023 .rc fields are now no longer in use and can be put back on the
3024 free list, removed from vts_set, and deleted. */
3025 nFreed = 0;
3026 for (i = 0; i < nTab; i++) {
3027 Bool present;
3028 UWord oldK = 0, oldV = 12345;
3029 VtsTE* te = VG_(indexXA)( vts_tab, i );
3030 if (te->vts == NULL) {
3031 tl_assert(te->rc == 0);
3032 continue; /* already on the free list (presumably) */
3034 if (te->rc > 0)
3035 continue; /* in use */
3036 /* Ok, we got one we can free. */
3037 tl_assert(te->vts->id == i);
3038 /* first, remove it from vts_set. */
3039 present = VG_(delFromFM)( vts_set,
3040 &oldK, &oldV, (UWord)te->vts );
3041 tl_assert(present); /* else it isn't in vts_set ?! */
3042 tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3043 tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
3044 /* now free the VTS itself */
3045 VTS__delete(te->vts);
3046 te->vts = NULL;
3047 /* and finally put this entry on the free list */
3048 tl_assert(te->u.freelink == VtsID_INVALID); /* can't already be on it */
3049 add_to_free_list( i );
3050 nFreed++;
3053 /* Now figure out when the next GC should be. We'll allow the
3054 number of VTSs to double before GCing again. Except of course
3055 that since we can't (or, at least, don't) shrink vts_tab, we
3056 can't set the threshold value smaller than it. */
3057 tl_assert(nFreed <= nTab);
3058 nLive = nTab - nFreed;
3059 tl_assert(nLive >= 0 && nLive <= nTab);
3060 vts_next_GC_at = 2 * nLive;
3061 if (vts_next_GC_at < nTab)
3062 vts_next_GC_at = nTab;
3064 if (show_stats) {
3065 show_vts_stats("after GC");
3066 VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
3069 stats__vts_tab_GC++;
3070 if (VG_(clo_stats)) {
3071 tl_assert(nTab > 0);
3072 VG_(message)(Vg_DebugMsg,
3073 "libhb: VTS GC: #%lu old size %lu live %lu (%2llu%%)\n",
3074 stats__vts_tab_GC,
3075 nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
3077 /* ---------- END VTS GC ---------- */
3079 /* Decide whether to do VTS pruning. We have one of three
3080 settings. */
3081 static UInt pruning_auto_ctr = 0; /* do not make non-static */
3083 Bool do_pruning = False;
3084 switch (HG_(clo_vts_pruning)) {
3085 case 0: /* never */
3086 break;
3087 case 1: /* auto */
3088 do_pruning = (++pruning_auto_ctr % 5) == 0;
3089 break;
3090 case 2: /* always */
3091 do_pruning = True;
3092 break;
3093 default:
3094 tl_assert(0);
3097 /* The rest of this routine only handles pruning, so we can
3098 quit at this point if it is not to be done. */
3099 if (!do_pruning)
3100 return;
3101 /* No need to do pruning if no thread died since the last pruning, as
3102 no VtsTE can be pruned. */
3103 if (VG_(sizeXA)( verydead_thread_table_not_pruned) == 0)
3104 return;
3106 /* ---------- BEGIN VTS PRUNING ---------- */
3107 /* Sort and check the very dead threads that died since the last pruning.
3108 Sorting is used for the check and so that we can quickly look
3109 up the dead-thread entries as we work through the VTSs. */
3110 verydead_thread_table_sort_and_check (verydead_thread_table_not_pruned);
3112 /* We will run through the old table, and create a new table and
3113 set, at the same time setting the u.remap entries in the old
3114 table to point to the new entries. Then, visit every VtsID in
3115 the system, and replace all of them with new ones, using the
3116 u.remap entries in the old table. Finally, we can delete the old
3117 table and set. */
3119 XArray* /* of VtsTE */ new_tab
3120 = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
3121 HG_(free), sizeof(VtsTE) );
3123 /* WordFM VTS* void */
3124 WordFM* new_set
3125 = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
3126 HG_(free),
3127 (Word(*)(UWord,UWord))VTS__cmp_structural );
3129 /* Visit each old VTS. For each one:
3131 * make a pruned version
3133 * search new_set for the pruned version, yielding either
3134 Nothing (not present) or the new VtsID for it.
3136 * if not present, allocate a new VtsID for it, insert (pruned
3137 VTS, new VtsID) in the tree, and set
3138 remap_table[old VtsID] = new VtsID.
3140 * if present, set remap_table[old VtsID] = new VtsID, where
3141 new VtsID was determined by the tree lookup. Then free up
3142 the clone.
3145 UWord nBeforePruning = 0, nAfterPruning = 0;
3146 UWord nSTSsBefore = 0, nSTSsAfter = 0;
3147 VtsID new_VtsID_ctr = 0;
3149 for (i = 0; i < nTab; i++) {
3151 /* For each old VTS .. */
3152 VtsTE* old_te = VG_(indexXA)( vts_tab, i );
3153 VTS* old_vts = old_te->vts;
3155 /* Skip it if not in use */
3156 if (old_te->rc == 0) {
3157 tl_assert(old_vts == NULL);
3158 continue;
3160 tl_assert(old_te->u.remap == VtsID_INVALID);
3161 tl_assert(old_vts != NULL);
3162 tl_assert(old_vts->id == i);
3163 tl_assert(old_vts->ts != NULL);
3165 /* It is in use. Make a pruned version. */
3166 nBeforePruning++;
3167 nSTSsBefore += old_vts->usedTS;
3168 VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
3169 old_vts, verydead_thread_table_not_pruned);
3170 tl_assert(new_vts->sizeTS == new_vts->usedTS);
3171 tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
3172 == 0x0ddC0ffeeBadF00dULL);
3174 /* Get rid of the old VTS and the tree entry. It's a bit more
3175 complex to incrementally delete the VTSs now than to nuke
3176 them all after we're done, but the upside is that we don't
3177 wind up temporarily storing potentially two complete copies
3178 of each VTS and hence spiking memory use. */
3179 UWord oldK = 0, oldV = 12345;
3180 Bool present = VG_(delFromFM)( vts_set,
3181 &oldK, &oldV, (UWord)old_vts );
3182 tl_assert(present); /* else it isn't in vts_set ?! */
3183 tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3184 tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
3185 /* now free the VTS itself */
3186 VTS__delete(old_vts);
3187 old_te->vts = NULL;
3188 old_vts = NULL;
3190 /* NO MENTIONS of old_vts allowed beyond this point. */
3192 /* Ok, we have the pruned copy in new_vts. See if a
3193 structurally identical version is already present in new_set.
3194 If so, delete the one we just made and move on; if not, add
3195 it. */
3196 VTS* identical_version = NULL;
3197 UWord valW = 12345;
3198 if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
3199 (UWord)new_vts)) {
3200 // already have it
3201 tl_assert(valW == 0);
3202 tl_assert(identical_version != NULL);
3203 tl_assert(identical_version != new_vts);
3204 VTS__delete(new_vts);
3205 new_vts = identical_version;
3206 tl_assert(new_vts->id != VtsID_INVALID);
3207 } else {
3208 tl_assert(valW == 12345);
3209 tl_assert(identical_version == NULL);
3210 new_vts->id = new_VtsID_ctr++;
3211 Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
3212 tl_assert(!b);
3213 VtsTE new_te;
3214 new_te.vts = new_vts;
3215 new_te.rc = 0;
3216 new_te.u.freelink = VtsID_INVALID;
3217 Word j = VG_(addToXA)( new_tab, &new_te );
3218 tl_assert(j <= i);
3219 tl_assert(j == new_VtsID_ctr - 1);
3220 // stats
3221 nAfterPruning++;
3222 nSTSsAfter += new_vts->usedTS;
3224 old_te->u.remap = new_vts->id;
3226 } /* for (i = 0; i < nTab; i++) */
3228 /* Move the very dead threads from verydead_thread_table_not_pruned to
3229 verydead_thread_table. Sort and check verydead_thread_table
3230 to verify a thread was reported very dead only once. */
3232 UWord nBT = VG_(sizeXA)( verydead_thread_table_not_pruned);
3234 for (i = 0; i < nBT; i++) {
3235 ThrID thrid =
3236 *(ThrID*)VG_(indexXA)( verydead_thread_table_not_pruned, i );
3237 VG_(addToXA)( verydead_thread_table, &thrid );
3239 verydead_thread_table_sort_and_check (verydead_thread_table);
3240 VG_(dropHeadXA) (verydead_thread_table_not_pruned, nBT);
3243 /* At this point, we have:
3244 * the old VTS table, with its u.remap entries set,
3245 and with all .vts == NULL.
3246 * the old VTS tree should be empty, since it and the old VTSs
3247 it contained have been incrementally deleted as we worked
3248 through the old table.
3249 * the new VTS table, with all .rc == 0, all u.freelink and u.remap
3250 == VtsID_INVALID.
3251 * the new VTS tree.
3253 tl_assert( VG_(sizeFM)(vts_set) == 0 );
3255 /* Now actually apply the mapping. */
3256 /* Visit all the VtsIDs in the entire system. Where do we expect
3257 to find them?
3258 (a) in shadow memory -- the LineZs and LineFs
3259 (b) in our collection of struct _Thrs.
3260 (c) in our collection of struct _SOs.
3261 Nowhere else, AFAICS. Not in the zsm cache, because that just
3262 got invalidated.
3264 Using the u.remap fields in vts_tab, map each old VtsID to a new
3265 VtsID. For each old VtsID, dec its rc; and for each new one,
3266 inc it. This sets up the new refcounts, and it also gives a
3267 cheap sanity check of the old ones: all old refcounts should be
3268 zero after this operation.
3271 /* Do the mappings for (a) above: iterate over the Primary shadow
3272 mem map (WordFM Addr SecMap*). */
3273 UWord secmapW = 0;
3274 VG_(initIterFM)( map_shmem );
3275 while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3276 UWord j;
3277 SecMap* sm = (SecMap*)secmapW;
3278 tl_assert(sm->magic == SecMap_MAGIC);
3279 /* Deal with the LineZs */
3280 for (i = 0; i < N_SECMAP_ZLINES; i++) {
3281 LineZ* lineZ = &sm->linesZ[i];
3282 if (lineZ->dict[0] != SVal_INVALID) {
3283 for (j = 0; j < 4; j++)
3284 remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3285 } else {
3286 LineF* lineF = SVal2Ptr (lineZ->dict[1]);
3287 for (j = 0; j < N_LINE_ARANGE; j++)
3288 remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3292 VG_(doneIterFM)( map_shmem );
3294 /* Do the mappings for (b) above: visit our collection of struct
3295 _Thrs. */
3296 Thread* hgthread = get_admin_threads();
3297 tl_assert(hgthread);
3298 while (hgthread) {
3299 Thr* hbthr = hgthread->hbthr;
3300 tl_assert(hbthr);
3301 /* Threads that are listed in the prunable set have their viR
3302 and viW set to VtsID_INVALID, so we can't mess with them. */
3303 if (hbthr->llexit_done && hbthr->joinedwith_done) {
3304 tl_assert(hbthr->viR == VtsID_INVALID);
3305 tl_assert(hbthr->viW == VtsID_INVALID);
3306 hgthread = hgthread->admin;
3307 continue;
3309 remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3310 remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3311 hgthread = hgthread->admin;
3314 /* Do the mappings for (c) above: visit the struct _SOs. */
3315 SO* so = admin_SO;
3316 while (so) {
3317 if (so->viR != VtsID_INVALID)
3318 remap_VtsID( vts_tab, new_tab, &so->viR );
3319 if (so->viW != VtsID_INVALID)
3320 remap_VtsID( vts_tab, new_tab, &so->viW );
3321 so = so->admin_next;
3324 /* So, we're nearly done (with this incredibly complex operation).
3325 Check the refcounts for the old VtsIDs all fell to zero, as
3326 expected. Any failure is serious. */
3327 for (i = 0; i < nTab; i++) {
3328 VtsTE* te = VG_(indexXA)( vts_tab, i );
3329 tl_assert(te->vts == NULL);
3330 /* This is the assert proper. Note we're also asserting
3331 zeroness for old entries which are unmapped. That's OK. */
3332 tl_assert(te->rc == 0);
3335 /* Install the new table and set. */
3336 VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3337 vts_set = new_set;
3338 VG_(deleteXA)( vts_tab );
3339 vts_tab = new_tab;
3341 /* The freelist of vts_tab entries is empty now, because we've
3342 compacted all of the live entries at the low end of the
3343 table. */
3344 vts_tab_freelist = VtsID_INVALID;
3346 /* Sanity check vts_set and vts_tab. */
3348 /* Because all the live entries got slid down to the bottom of vts_tab: */
3349 tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3351 /* Assert that the vts_tab and vts_set entries point at each other
3352 in the required way */
3353 UWord wordK = 0, wordV = 0;
3354 VG_(initIterFM)( vts_set );
3355 while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3356 tl_assert(wordK != 0);
3357 tl_assert(wordV == 0);
3358 VTS* vts = (VTS*)wordK;
3359 tl_assert(vts->id != VtsID_INVALID);
3360 VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3361 tl_assert(te->vts == vts);
3363 VG_(doneIterFM)( vts_set );
3365 /* Also iterate over the table, and check each entry is
3366 plausible. */
3367 nTab = VG_(sizeXA)( vts_tab );
3368 for (i = 0; i < nTab; i++) {
3369 VtsTE* te = VG_(indexXA)( vts_tab, i );
3370 tl_assert(te->vts);
3371 tl_assert(te->vts->id == i);
3372 tl_assert(te->rc > 0); /* 'cos we just GC'd */
3373 tl_assert(te->u.freelink == VtsID_INVALID); /* in use */
3374 /* value of te->u.remap not relevant */
3377 /* And we're done. Bwahahaha. Ha. Ha. Ha. */
3378 stats__vts_pruning++;
3379 if (VG_(clo_stats)) {
3380 tl_assert(nTab > 0);
3381 VG_(message)(
3382 Vg_DebugMsg,
3383 "libhb: VTS PR: #%lu before %lu (avg sz %lu) "
3384 "after %lu (avg sz %lu)\n",
3385 stats__vts_pruning,
3386 nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3387 nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3390 /* ---------- END VTS PRUNING ---------- */
3394 /////////////////////////////////////////////////////////
3395 // //
3396 // Vts IDs //
3397 // //
3398 /////////////////////////////////////////////////////////
3400 //////////////////////////
3401 /* A max-sized VTS which is used as a scratch temporary (the first
3402 argument) in VTS__singleton, VTS__tick and VTS__join operations. */
3403 static VTS* temp_max_sized_VTS = NULL;
3405 //////////////////////////
3406 static ULong stats__cmpLEQ_queries = 0;
3407 static ULong stats__cmpLEQ_misses = 0;
3408 static ULong stats__join2_queries = 0;
3409 static ULong stats__join2_misses = 0;
3411 static inline UInt ROL32 ( UInt w, Int n ) {
3412 w = (w << n) | (w >> (32-n));
3413 return w;
3415 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3416 UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3417 return hash % nTab;
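   /* The two caches below are direct-mapped.  For instance, for vi1 == 1,
      vi2 == 2 and nTab == 1023: ROL32(1,19) == 0x80000 and ROL32(2,13) ==
      0x4000, so hash == (0x80000 ^ 0x4000) % 1023 == 540672 % 1023 == 528,
      and that ordered pair always maps to slot 528. */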
3420 #define N_CMPLEQ_CACHE 1023
3421 static
3422 struct { VtsID vi1; VtsID vi2; Bool leq; }
3423 cmpLEQ_cache[N_CMPLEQ_CACHE];
3425 #define N_JOIN2_CACHE 1023
3426 static
3427 struct { VtsID vi1; VtsID vi2; VtsID res; }
3428 join2_cache[N_JOIN2_CACHE];
3430 static void VtsID__invalidate_caches ( void ) {
3431 Int i;
3432 for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3433 cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3434 cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3435 cmpLEQ_cache[i].leq = False;
3437 for (i = 0; i < N_JOIN2_CACHE; i++) {
3438 join2_cache[i].vi1 = VtsID_INVALID;
3439 join2_cache[i].vi2 = VtsID_INVALID;
3440 join2_cache[i].res = VtsID_INVALID;
3443 //////////////////////////
3445 //static Bool VtsID__is_valid ( VtsID vi ) {
3446 // VtsTE* ve;
3447 // if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3448 // return False;
3449 // ve = VG_(indexXA)( vts_tab, vi );
3450 // if (!ve->vts)
3451 // return False;
3452 // tl_assert(ve->vts->id == vi);
3453 // return True;
3456 static VTS* VtsID__to_VTS ( VtsID vi ) {
3457 VtsTE* te = VG_(indexXA)( vts_tab, vi );
3458 tl_assert(te->vts);
3459 return te->vts;
3462 static void VtsID__pp ( VtsID vi ) {
3463 VTS* vts = VtsID__to_VTS(vi);
3464 VTS__show( vts );
3467 /* compute partial ordering relation of vi1 and vi2. */
3468 __attribute__((noinline))
3469 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3470 UInt hash;
3471 Bool leq;
3472 VTS *v1, *v2;
3473 //if (vi1 == vi2) return True;
3474 tl_assert(vi1 != vi2);
3475 ////++
3476 stats__cmpLEQ_queries++;
3477 hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3478 if (cmpLEQ_cache[hash].vi1 == vi1
3479 && cmpLEQ_cache[hash].vi2 == vi2)
3480 return cmpLEQ_cache[hash].leq;
3481 stats__cmpLEQ_misses++;
3482 ////--
3483 v1 = VtsID__to_VTS(vi1);
3484 v2 = VtsID__to_VTS(vi2);
3485 leq = VTS__cmpLEQ( v1, v2 ) == 0;
3486 ////++
3487 cmpLEQ_cache[hash].vi1 = vi1;
3488 cmpLEQ_cache[hash].vi2 = vi2;
3489 cmpLEQ_cache[hash].leq = leq;
3490 ////--
3491 return leq;
3493 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3494 return LIKELY(vi1 == vi2) ? True : VtsID__cmpLEQ_WRK(vi1, vi2);
3497 /* compute binary join */
3498 __attribute__((noinline))
3499 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3500 UInt hash;
3501 VtsID res;
3502 VTS *vts1, *vts2;
3503 //if (vi1 == vi2) return vi1;
3504 tl_assert(vi1 != vi2);
3505 ////++
3506 stats__join2_queries++;
3507 hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3508 if (join2_cache[hash].vi1 == vi1
3509 && join2_cache[hash].vi2 == vi2)
3510 return join2_cache[hash].res;
3511 stats__join2_misses++;
3512 ////--
3513 vts1 = VtsID__to_VTS(vi1);
3514 vts2 = VtsID__to_VTS(vi2);
3515 temp_max_sized_VTS->usedTS = 0;
3516 VTS__join(temp_max_sized_VTS, vts1,vts2);
3517 res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3518 ////++
3519 join2_cache[hash].vi1 = vi1;
3520 join2_cache[hash].vi2 = vi2;
3521 join2_cache[hash].res = res;
3522 ////--
3523 return res;
3525 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3526 return LIKELY(vi1 == vi2) ? vi1 : VtsID__join2_WRK(vi1, vi2);
3529 /* create a singleton VTS, namely [thr:1] */
3530 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3531 temp_max_sized_VTS->usedTS = 0;
3532 VTS__singleton(temp_max_sized_VTS, thr,tym);
3533 return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3536 /* tick operation, creates value 1 if specified index is absent */
3537 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3538 VTS* vts = VtsID__to_VTS(vi);
3539 temp_max_sized_VTS->usedTS = 0;
3540 VTS__tick(temp_max_sized_VTS, idx,vts);
3541 return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3544 /* index into a VTS (only for assertions) */
3545 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3546 VTS* vts = VtsID__to_VTS(vi);
3547 return VTS__indexAt_SLOW( vts, idx );
3550 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
3551 any, really) element in vi1 which is pointwise greater-than the
3552 corresponding element in vi2. If no such element exists, return
3553 NULL. This needs to be fairly quick since it is called every time
3554 a race is detected. */
3555 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3557 VTS *vts1, *vts2;
3558 Thr* diffthr;
3559 ThrID diffthrid;
3560 tl_assert(vi1 != vi2);
3561 vts1 = VtsID__to_VTS(vi1);
3562 vts2 = VtsID__to_VTS(vi2);
3563 tl_assert(vts1 != vts2);
3564 diffthrid = VTS__cmpLEQ(vts1, vts2);
3565 diffthr = Thr__from_ThrID(diffthrid);
3566 tl_assert(diffthr); /* else they are LEQ ! */
3567 return diffthr;
3571 /////////////////////////////////////////////////////////
3572 // //
3573 // Filters //
3574 // //
3575 /////////////////////////////////////////////////////////
3577 /* Forget everything we know -- clear the filter and let everything
3578 through. This needs to be as fast as possible, since it is called
3579 every time the running thread changes, and every time a thread's
3580 vector clocks change, which can be quite frequent. The obvious
3581 fast way to do this is simply to stuff in tags which we know are
3582 not going to match anything, since they're not aligned to the start
3583 of a line. */
3584 static void Filter__clear ( Filter* fi, const HChar* who )
3586 UWord i;
3587 if (0) VG_(printf)(" Filter__clear(%p, %s)\n", fi, who);
3588 for (i = 0; i < FI_NUM_LINES; i += 8) {
3589 fi->tags[i+0] = 1; /* impossible value -- cannot match */
3590 fi->tags[i+1] = 1;
3591 fi->tags[i+2] = 1;
3592 fi->tags[i+3] = 1;
3593 fi->tags[i+4] = 1;
3594 fi->tags[i+5] = 1;
3595 fi->tags[i+6] = 1;
3596 fi->tags[i+7] = 1;
3598 tl_assert(i == FI_NUM_LINES);
3601 /* Clearing an arbitrary range in the filter. Unfortunately
3602 we have to do this due to core-supplied new/die-mem events. */
3604 static void Filter__clear_1byte ( Filter* fi, Addr a )
3606 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3607 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3608 FiLine* line = &fi->lines[lineno];
3609 UWord loff = (a - atag) / 8;
3610 UShort mask = 0x3 << (2 * (a & 7));
3611 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3612 if (LIKELY( fi->tags[lineno] == atag )) {
3613 /* hit. clear the bits. */
3614 UShort u16 = line->u16s[loff];
3615 line->u16s[loff] = u16 & ~mask; /* clear them */
3616 } else {
3617 /* miss. The filter doesn't hold this address, so ignore. */
3621 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3623 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3624 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3625 FiLine* line = &fi->lines[lineno];
3626 UWord loff = (a - atag) / 8;
3627 if (LIKELY( fi->tags[lineno] == atag )) {
3628 line->u16s[loff] = 0;
3629 } else {
3630 /* miss. The filter doesn't hold this address, so ignore. */
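/* An illustrative sketch (not part of the library) of how an address
   maps onto the 2-bits-per-byte filter state used by the routines
   above and below, assuming FI_LINE_SZB == 32 as asserted further
   down: each 32-byte line is described by 4 UShorts, one per aligned
   8-byte group, with two bits per byte at bit position 2*(a & 7). */
#if 0
static void example_filter_coords ( Addr a )
{
   Addr   atag   = FI_GET_TAG(a);     /* line-aligned address of 'a'    */
   UWord  lineno = FI_GET_LINENO(a);  /* index into fi->lines[]         */
   UWord  loff   = (a - atag) / 8;    /* which UShort in the line: 0..3 */
   UInt   shift  = 2 * (a & 7);       /* bit position of the 2-bit pair */
   UShort rd     = 0x2 << shift;      /* the bit checked by ..._crd08   */
   UShort rdwr   = 0x3 << shift;      /* the bits checked by ..._cwr08  */
   VG_(printf)("line %lu tag %#lx u16 %lu R-mask %04x RW-mask %04x\n",
               lineno, atag, loff, (UInt)rd, (UInt)rdwr);
}
#endif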
3634 /* Only used to verify the fast Filter__clear_range */
3635 __attribute__((unused))
3636 static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
3638 tl_assert (CHECK_ZSM);
3640 /* slowly do part preceding 8-alignment */
3641 while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3642 Filter__clear_1byte( fi, a );
3643 a++;
3644 len--;
3646 /* vector loop */
3647 while (len >= 8) {
3648 Filter__clear_8bytes_aligned( fi, a );
3649 a += 8;
3650 len -= 8;
3652 /* slowly do tail */
3653 while (UNLIKELY(len > 0)) {
3654 Filter__clear_1byte( fi, a );
3655 a++;
3656 len--;
3660 static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
3662 # if CHECK_ZSM > 0
3663 /* We check the more complex algorithm below against the simple one.
3664 This check is very expensive: we first do it the slow way on a
3665 copy of the data, then do it the fast way. On RETURN, we check
3666 that the two results are equal. */
3667 Filter fi_check = *fi;
3668 Filter__clear_range_SLOW(&fi_check, a, len);
3669 # define RETURN goto check_and_return
3670 # else
3671 # define RETURN return
3672 # endif
3674 Addr begtag = FI_GET_TAG(a); /* tag of range begin */
3676 Addr end = a + len - 1;
3677 Addr endtag = FI_GET_TAG(end); /* tag of range end. */
3679 UWord rlen = len; /* remaining length to clear */
3681 Addr c = a; /* Current position we are clearing. */
3682 UWord clineno = FI_GET_LINENO(c); /* Current lineno we are clearing */
3683 FiLine* cline; /* Current line we are clearing */
3684 UWord cloff; /* Current offset in line we are clearing, when clearing
3685 partial lines. */
3687 UShort u16;
3689 STATIC_ASSERT (FI_LINE_SZB == 32);
3690 // Below assumes filter lines are 32 bytes
3692 if (LIKELY(fi->tags[clineno] == begtag)) {
3693 /* LIKELY for the heavy caller VG_(unknown_SP_update). */
3694 /* First filter line matches begtag.
3695 If c is not at the filter line begin, the below will clear
3696 the filter line bytes starting from c. */
3697 cline = &fi->lines[clineno];
3698 cloff = (c - begtag) / 8;
3700 /* First the byte(s) needed to reach 8-alignment */
3701 if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
3702 /* hiB is the nr of bytes (higher addresses) from c to reach
3703 8-alignment. */
3704 UWord hiB = 8 - (c & 7);
3705 /* Compute 2-bit/byte mask representing hiB bytes [c..c+hiB[
3706 mask is C000 , F000, FC00, FF00, FFC0, FFF0 or FFFC for the byte
3707 range 7..7 6..7 5..7 4..7 3..7 2..7 1..7 */
3708 UShort mask = 0xFFFF << (16 - 2*hiB);
3710 u16 = cline->u16s[cloff];
3711 if (LIKELY(rlen >= hiB)) {
3712 cline->u16s[cloff] = u16 & ~mask; /* clear all hiB from c */
3713 rlen -= hiB;
3714 c += hiB;
3715 cloff += 1;
3716 } else {
3717 /* Only have the bits for rlen bytes. */
3718 mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
3719 cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c. */
3720 RETURN; // We have cleared all that we can.
3723 /* c is now 8 aligned. Clear by 8 aligned bytes,
3724 till c is filter-line aligned */
3725 while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
3726 cline->u16s[cloff] = 0;
3727 c += 8;
3728 rlen -= 8;
3729 cloff += 1;
3731 } else {
3732 c = begtag + FI_LINE_SZB;
3733 if (c > end)
3734 RETURN; // We have cleared all that we can.
3735 rlen -= c - a;
3737 // We have changed c, so re-establish clineno.
3738 clineno = FI_GET_LINENO(c);
3740 if (rlen >= FI_LINE_SZB) {
3741 /* Here, c is filter line-aligned. Clear all the full filter
3742 lines that overlap with the range starting at c. */
3743 UWord nfull = rlen / FI_LINE_SZB;
3744 UWord full_len = nfull * FI_LINE_SZB;
3745 rlen -= full_len;
3746 if (nfull > FI_NUM_LINES)
3747 nfull = FI_NUM_LINES; // no need to check the same entry several times.
3749 for (UWord n = 0; n < nfull; n++) {
3750 if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
3751 cline = &fi->lines[clineno];
3752 cline->u16s[0] = 0;
3753 cline->u16s[1] = 0;
3754 cline->u16s[2] = 0;
3755 cline->u16s[3] = 0;
3756 STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
3758 clineno++;
3759 if (UNLIKELY(clineno == FI_NUM_LINES))
3760 clineno = 0;
3763 c += full_len;
3764 clineno = FI_GET_LINENO(c);
3767 if (CHECK_ZSM) {
3768 tl_assert(VG_IS_8_ALIGNED(c));
3769 tl_assert(clineno == FI_GET_LINENO(c));
3772 /* Do the last filter line, if it was not cleared as a full filter line */
3773 if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
3774 cline = &fi->lines[clineno];
3775 cloff = (c - endtag) / 8;
3776 if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);
3778 /* c is 8 aligned. Clear by 8 aligned bytes, till we have less than
3779 8 bytes. */
3780 while (rlen >= 8) {
3781 cline->u16s[cloff] = 0;
3782 c += 8;
3783 rlen -= 8;
3784 cloff += 1;
3786 /* Then the remaining byte(s) */
3787 if (rlen > 0) {
3788 /* nr of bytes from c to reach end. */
3789 UWord loB = rlen;
3790 /* Compute mask representing loB bytes [c..c+loB[ :
3791 mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF */
3792 UShort mask = 0xFFFF >> (16 - 2*loB);
3794 u16 = cline->u16s[cloff];
3795 cline->u16s[cloff] = u16 & ~mask; /* clear all loB from c */
3799 # if CHECK_ZSM > 0
3800 check_and_return:
3801 tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
3802 # endif
3803 # undef RETURN
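/* The shape of the CHECK_ZSM cross-check above, pulled out into a
   hypothetical standalone helper for clarity (sketch only; the real
   code inlines this via the RETURN macro so that every early exit of
   the fast path gets compared against the slow reference version): */
#if 0
static void example_clear_range_checked ( Filter* fi, Addr a, UWord len )
{
   Filter reference = *fi;                          /* copy the state    */
   Filter__clear_range_SLOW( &reference, a, len );  /* trusted version   */
   Filter__clear_range( fi, a, len );               /* optimised version */
   tl_assert( VG_(memcmp)( &reference, fi, sizeof reference ) == 0 );
}
#endif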
3806 /* ------ Read handlers for the filter. ------ */
3808 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3810 if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3811 return False;
3813 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3814 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3815 FiLine* line = &fi->lines[lineno];
3816 UWord loff = (a - atag) / 8;
3817 UShort mask = 0xAAAA;
3818 if (LIKELY( fi->tags[lineno] == atag )) {
3819 /* hit. check line and update. */
3820 UShort u16 = line->u16s[loff];
3821 Bool ok = (u16 & mask) == mask; /* all R bits set? */
3822 line->u16s[loff] = u16 | mask; /* set them */
3823 return ok;
3824 } else {
3825 /* miss. nuke existing line and re-use it. */
3826 UWord i;
3827 fi->tags[lineno] = atag;
3828 for (i = 0; i < FI_LINE_SZB / 8; i++)
3829 line->u16s[i] = 0;
3830 line->u16s[loff] = mask;
3831 return False;
3836 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3838 if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3839 return False;
3841 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3842 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3843 FiLine* line = &fi->lines[lineno];
3844 UWord loff = (a - atag) / 8;
3845 UShort mask = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3846 if (LIKELY( fi->tags[lineno] == atag )) {
3847 /* hit. check line and update. */
3848 UShort u16 = line->u16s[loff];
3849 Bool ok = (u16 & mask) == mask; /* 4 x R bits set? */
3850 line->u16s[loff] = u16 | mask; /* set them */
3851 return ok;
3852 } else {
3853 /* miss. nuke existing line and re-use it. */
3854 UWord i;
3855 fi->tags[lineno] = atag;
3856 for (i = 0; i < FI_LINE_SZB / 8; i++)
3857 line->u16s[i] = 0;
3858 line->u16s[loff] = mask;
3859 return False;
3864 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3866 if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3867 return False;
3869 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3870 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3871 FiLine* line = &fi->lines[lineno];
3872 UWord loff = (a - atag) / 8;
3873 UShort mask = 0xA << (2 * (a & 6));
3874 /* mask is A000, 0A00, 00A0 or 000A */
3875 if (LIKELY( fi->tags[lineno] == atag )) {
3876 /* hit. check line and update. */
3877 UShort u16 = line->u16s[loff];
3878 Bool ok = (u16 & mask) == mask; /* 2 x R bits set? */
3879 line->u16s[loff] = u16 | mask; /* set them */
3880 return ok;
3881 } else {
3882 /* miss. nuke existing line and re-use it. */
3883 UWord i;
3884 fi->tags[lineno] = atag;
3885 for (i = 0; i < FI_LINE_SZB / 8; i++)
3886 line->u16s[i] = 0;
3887 line->u16s[loff] = mask;
3888 return False;
3893 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3896 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3897 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3898 FiLine* line = &fi->lines[lineno];
3899 UWord loff = (a - atag) / 8;
3900 UShort mask = 0x2 << (2 * (a & 7));
3901 /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3902 if (LIKELY( fi->tags[lineno] == atag )) {
3903 /* hit. check line and update. */
3904 UShort u16 = line->u16s[loff];
3905 Bool ok = (u16 & mask) == mask; /* 1 x R bits set? */
3906 line->u16s[loff] = u16 | mask; /* set them */
3907 return ok;
3908 } else {
3909 /* miss. nuke existing line and re-use it. */
3910 UWord i;
3911 fi->tags[lineno] = atag;
3912 for (i = 0; i < FI_LINE_SZB / 8; i++)
3913 line->u16s[i] = 0;
3914 line->u16s[loff] = mask;
3915 return False;
3921 /* ------ Write handlers for the filter. ------ */
3923 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3925 if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3926 return False;
3928 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3929 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3930 FiLine* line = &fi->lines[lineno];
3931 UWord loff = (a - atag) / 8;
3932 UShort mask = 0xFFFF;
3933 if (LIKELY( fi->tags[lineno] == atag )) {
3934 /* hit. check line and update. */
3935 UShort u16 = line->u16s[loff];
3936 Bool ok = (u16 & mask) == mask; /* all R & W bits set? */
3937 line->u16s[loff] = u16 | mask; /* set them */
3938 return ok;
3939 } else {
3940 /* miss. nuke existing line and re-use it. */
3941 UWord i;
3942 fi->tags[lineno] = atag;
3943 for (i = 0; i < FI_LINE_SZB / 8; i++)
3944 line->u16s[i] = 0;
3945 line->u16s[loff] = mask;
3946 return False;
3951 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3953 if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3954 return False;
3956 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3957 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3958 FiLine* line = &fi->lines[lineno];
3959 UWord loff = (a - atag) / 8;
3960 UShort mask = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3961 if (LIKELY( fi->tags[lineno] == atag )) {
3962 /* hit. check line and update. */
3963 UShort u16 = line->u16s[loff];
3964 Bool ok = (u16 & mask) == mask; /* 4 x R & W bits set? */
3965 line->u16s[loff] = u16 | mask; /* set them */
3966 return ok;
3967 } else {
3968 /* miss. nuke existing line and re-use it. */
3969 UWord i;
3970 fi->tags[lineno] = atag;
3971 for (i = 0; i < FI_LINE_SZB / 8; i++)
3972 line->u16s[i] = 0;
3973 line->u16s[loff] = mask;
3974 return False;
3979 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3981 if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3982 return False;
3984 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3985 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3986 FiLine* line = &fi->lines[lineno];
3987 UWord loff = (a - atag) / 8;
3988 UShort mask = 0xF << (2 * (a & 6));
3989 /* mask is F000, 0F00, 00F0 or 000F */
3990 if (LIKELY( fi->tags[lineno] == atag )) {
3991 /* hit. check line and update. */
3992 UShort u16 = line->u16s[loff];
3993 Bool ok = (u16 & mask) == mask; /* 2 x R & W bits set? */
3994 line->u16s[loff] = u16 | mask; /* set them */
3995 return ok;
3996 } else {
3997 /* miss. nuke existing line and re-use it. */
3998 UWord i;
3999 fi->tags[lineno] = atag;
4000 for (i = 0; i < FI_LINE_SZB / 8; i++)
4001 line->u16s[i] = 0;
4002 line->u16s[loff] = mask;
4003 return False;
4008 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
4011 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
4012 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
4013 FiLine* line = &fi->lines[lineno];
4014 UWord loff = (a - atag) / 8;
4015 UShort mask = 0x3 << (2 * (a & 7));
4016 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
4017 if (LIKELY( fi->tags[lineno] == atag )) {
4018 /* hit. check line and update. */
4019 UShort u16 = line->u16s[loff];
4020 Bool ok = (u16 & mask) == mask; /* 1 x R & W bits set? */
4021 line->u16s[loff] = u16 | mask; /* set them */
4022 return ok;
4023 } else {
4024 /* miss. nuke existing line and re-use it. */
4025 UWord i;
4026 fi->tags[lineno] = atag;
4027 for (i = 0; i < FI_LINE_SZB / 8; i++)
4028 line->u16s[i] = 0;
4029 line->u16s[loff] = mask;
4030 return False;
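/* A usage sketch for the skip-filters (the wrapper shown here is
   hypothetical; the real fast-path callers live elsewhere in this
   file). The idea: if the current thread has already processed a
   covering access of these bytes since its filter was last cleared,
   the whole MSM update can be skipped. */
#if 0
static void example_cread32 ( Thr* thr, Addr a )
{
   if (LIKELY( Filter__ok_to_skip_crd32( thr->filter, a ) )) {
      /* nothing new to learn from this 4-byte read */
      return;
   }
   /* otherwise fall through to the full MSM read machinery */
}
#endif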
4036 /////////////////////////////////////////////////////////
4037 // //
4038 // Threads //
4039 // //
4040 /////////////////////////////////////////////////////////
4042 /* Maps ThrID values to their Thr*s (which contain ThrID values that
4043 should point back to the relevant slot in the array). Lowest
4044 numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
4045 static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
4047 /* And a counter to dole out ThrID values. For rationale/background,
4048 see comments on definition of ScalarTS (far) above. */
4049 static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
4051 static ThrID Thr__to_ThrID ( Thr* thr ) {
4052 return thr->thrid;
4054 static Thr* Thr__from_ThrID ( UInt thrid ) {
4055 Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
4056 tl_assert(thr->thrid == thrid);
4057 return thr;
4060 /* True if the cached rcec for thr is valid and can be used to build the
4061 current stack trace just by changing its innermost frame (frames[0]) to the current IP. */
4062 static inline Bool cached_rcec_valid(Thr *thr)
4064 UWord cached_stackvalid = VG_(get_SP_s1) (thr->hgthread->coretid);
4065 return cached_stackvalid != 0;
4067 /* Set the validity of the cached rcec of thr. */
4068 static inline void set_cached_rcec_validity(Thr *thr, Bool valid)
4070 VG_(set_SP_s1) (thr->hgthread->coretid, valid);
4073 static Thr* Thr__new ( void )
4075 Thr* thr = HG_(zalloc)
4076 ( "libhb.Thr__new.1",
4077 sizeof(Thr) + HG_(clo_history_backtrace_size) * sizeof(UWord));
4078 // We need to add the size of the frames in the cached_rcec (last member of
4079 // _Thr).
4081 thr->viR = VtsID_INVALID;
4082 thr->viW = VtsID_INVALID;
4083 thr->llexit_done = False;
4084 thr->joinedwith_done = False;
4085 thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
4086 if (HG_(clo_history_level) == 1)
4087 thr->local_Kws_n_stacks
4088 = VG_(newXA)( HG_(zalloc),
4089 "libhb.Thr__new.3 (local_Kws_and_stacks)",
4090 HG_(free), sizeof(ULong_n_EC) );
4091 /* Make an 'empty' cached rcec in thr. */
4092 thr->cached_rcec.magic = RCEC_MAGIC;
4093 thr->cached_rcec.rc = 0;
4094 thr->cached_rcec.rcX = 0;
4095 thr->cached_rcec.next = NULL;
4097 /* Add this Thr* <-> ThrID binding to the mapping, and
4098 cross-check */
4099 if (!thrid_to_thr_map) {
4100 thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
4101 HG_(free), sizeof(Thr*) );
4104 if (thrid_counter >= ThrID_MAX_VALID) {
4105 /* We're hosed. We have to stop. */
4106 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
4109 thr->thrid = thrid_counter++;
4110 Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
4111 tl_assert(ix + 1024 == thr->thrid);
4113 return thr;
4116 static void note_local_Kw_n_stack_for ( Thr* thr )
4118 Word nPresent;
4119 ULong_n_EC pair;
4120 tl_assert(thr);
4122 // We only collect this info at history level 1 (approx)
4123 if (HG_(clo_history_level) != 1)
4124 return;
4126 /* This is the scalar Kw for thr. */
4127 pair.ull = VtsID__indexAt( thr->viW, thr );
4128 pair.ec = main_get_EC( thr );
4129 tl_assert(pair.ec);
4130 tl_assert(thr->local_Kws_n_stacks);
4132 /* check that we're not adding duplicates */
4133 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4135 /* Throw away old stacks, if necessary. We can't accumulate stuff
4136 indefinitely. */
4137 if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
4138 VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
4139 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4140 if (0)
4141 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p (!!! gc !!!)\n",
4142 thr, pair.ull, pair.ec );
4145 if (nPresent > 0) {
4146 ULong_n_EC* prevPair
4147 = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
4148 tl_assert( prevPair->ull <= pair.ull );
4151 if (nPresent == 0)
4152 pair.ec = NULL;
4154 VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
4156 if (0)
4157 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p\n",
4158 thr, pair.ull, pair.ec );
4159 if (0)
4160 VG_(pp_ExeContext)(pair.ec);
4163 static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
4164 const ULong_n_EC* pair2 )
4166 if (pair1->ull < pair2->ull) return -1;
4167 if (pair1->ull > pair2->ull) return 1;
4168 return 0;
4172 /////////////////////////////////////////////////////////
4173 // //
4174 // Shadow Values //
4175 // //
4176 /////////////////////////////////////////////////////////
4178 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
4179 // hb_zsm.h. We have to do everything else here.
4181 /* SVal is 64 bit unsigned int.
4183 <---------30---------> <---------30--------->
4184 00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X C(Rmin,Wmin)
4185 10 X--------------------X XX X--------------------X A: SVal_NOACCESS
4186 11 0--------------------0 00 0--------------------0 A: SVal_INVALID
4189 #define SVAL_TAGMASK (3ULL << 62)
4191 static inline Bool SVal__isC ( SVal s ) {
4192 return (0ULL << 62) == (s & SVAL_TAGMASK);
4194 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
4195 //tl_assert(VtsID__is_valid(rmini));
4196 //tl_assert(VtsID__is_valid(wmini));
4197 return (((ULong)rmini) << 32) | ((ULong)wmini);
4199 static inline VtsID SVal__unC_Rmin ( SVal s ) {
4200 tl_assert(SVal__isC(s));
4201 return (VtsID)(s >> 32);
4203 static inline VtsID SVal__unC_Wmin ( SVal s ) {
4204 tl_assert(SVal__isC(s));
4205 return (VtsID)(s & 0xFFFFFFFFULL);
4208 static inline Bool SVal__isA ( SVal s ) {
4209 return (2ULL << 62) == (s & SVAL_TAGMASK);
4211 __attribute__((unused))
4212 static inline SVal SVal__mkA ( void ) {
4213 return 2ULL << 62;
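/* A worked sketch of the encoding above (illustrative only): packing
   two VtsIDs of at most 30 bits gives tag bits 00, i.e. a 'C' value,
   while SVal__mkA() carries tag bits 10. */
#if 0
static void example_sval_roundtrip ( VtsID rmin, VtsID wmin )
{
   SVal c = SVal__mkC( rmin, wmin );
   tl_assert( SVal__isC(c) );
   tl_assert( SVal__unC_Rmin(c) == rmin );
   tl_assert( SVal__unC_Wmin(c) == wmin );
   tl_assert( SVal__isA( SVal__mkA() ) && !SVal__isC( SVal__mkA() ) );
}
#endif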
4216 /* Direct callback from lib_zsm. */
4217 static inline void SVal__rcinc ( SVal s ) {
4218 if (SVal__isC(s)) {
4219 VtsID__rcinc( SVal__unC_Rmin(s) );
4220 VtsID__rcinc( SVal__unC_Wmin(s) );
4224 /* Direct callback from lib_zsm. */
4225 static inline void SVal__rcdec ( SVal s ) {
4226 if (SVal__isC(s)) {
4227 VtsID__rcdec( SVal__unC_Rmin(s) );
4228 VtsID__rcdec( SVal__unC_Wmin(s) );
4232 static inline void *SVal2Ptr (SVal s)
4234 return (void*)(UWord)s;
4237 static inline SVal Ptr2SVal (void* ptr)
4239 return (SVal)(UWord)ptr;
4244 /////////////////////////////////////////////////////////
4245 // //
4246 // Change-event map2 //
4247 // //
4248 /////////////////////////////////////////////////////////
4250 /* This is in two parts:
4252 1. A hash table of RCECs. This is a set of reference-counted stack
4253 traces. When the reference count of a stack trace becomes zero,
4254 it is removed from the set and freed up. The intent is to have
4255 a set of stack traces which can be referred to from (2), but to
4256 only represent each one once. The set is indexed/searched by
4257 ordering on the stack trace vectors.
4259 2. A Hash table of OldRefs. These store information about each old
4260 ref that we need to record. Hash table key is the address of the
4261 location for which the information is recorded. For LRU
4262 purposes, each OldRef in the hash table is also on a doubly
4263 linked list maintaining the order in which the OldRef were most
4264 recently accessed.
4265 Each OldRef also maintains the stamp at which it was last accessed.
4266 With these stamps, we can quickly check which of 2 OldRef is the
4267 'newest', without having to scan the full list of LRU OldRef.
4269 The important part of an OldRef is, however, its acc component.
4270 This binds a TSW triple (thread, size, R/W) to an RCEC.
4272 We allocate a maximum of VG_(clo_conflict_cache_size) OldRef.
4273 Then we do exact LRU discarding. For each discarded OldRef we must
4274 of course decrement the reference count on the RCEC it
4275 refers to, in order that entries from (1) eventually get
4276 discarded too.
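/* A usage sketch tying parts (1) and (2) together, using the two
   entry points defined further down in this section (illustrative
   only; 'a' and the size are made-up values): */
#if 0
static void example_conflict_query ( Thr* thr, Addr a )
{
   /* Record a 4-byte write at 'a' by 'thr'; this creates or refreshes
      an OldRef whose acc.rcec points into the RCEC table. */
   event_map_bind( a, 4, True/*isW*/, thr );

   /* Later, when a race on 'a' is reported, ask for the most recent
      overlapping access made by some other thread. */
   ExeContext* where; Thr* other; SizeT szB; Bool isW; WordSetID locks;
   if (libhb_event_map_lookup( &where, &other, &szB, &isW, &locks,
                               thr, a, 4, True/*isW*/ ))
      VG_(pp_ExeContext)( where );
}
#endif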
4279 static UWord stats__evm__lookup_found = 0;
4280 static UWord stats__evm__lookup_notfound = 0;
4282 static UWord stats__ctxt_eq_tsw_eq_rcec = 0;
4283 static UWord stats__ctxt_eq_tsw_neq_rcec = 0;
4284 static UWord stats__ctxt_neq_tsw_neq_rcec = 0;
4285 static UWord stats__ctxt_rcdec_calls = 0;
4286 static UWord stats__ctxt_rcec_gc_discards = 0;
4288 static UWord stats__ctxt_tab_curr = 0;
4289 static UWord stats__ctxt_tab_max = 0;
4291 static UWord stats__ctxt_tab_qs = 0;
4292 static UWord stats__ctxt_tab_cmps = 0;
4295 ///////////////////////////////////////////////////////
4296 //// Part (1): A hash table of RCECs
4299 //#define N_RCEC_TAB 98317 /* prime */
4300 #define N_RCEC_TAB 196613 /* prime */
4302 //////////// BEGIN RCEC pool allocator
4303 static PoolAlloc* rcec_pool_allocator;
4304 static RCEC* alloc_RCEC ( void ) {
4305 return VG_(allocEltPA) ( rcec_pool_allocator );
4308 static void free_RCEC ( RCEC* rcec ) {
4309 tl_assert(rcec->magic == RCEC_MAGIC);
4310 VG_(freeEltPA)( rcec_pool_allocator, rcec );
4312 //////////// END RCEC pool allocator
4314 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
4316 /* Count of allocated RCEC having ref count > 0 */
4317 static UWord RCEC_referenced = 0;
4319 /* True if the frames of ec1 and ec2 are different. */
4320 static Bool RCEC__differs_by_frames ( RCEC* ec1, RCEC* ec2 ) {
4321 Word i;
4322 if (CHECK_CEM) {
4323 tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
4324 tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
4326 if (ec1->frames_hash != ec2->frames_hash) return True;
4327 for (i = 0; i < HG_(clo_history_backtrace_size); i++) {
4328 if (ec1->frames[i] != ec2->frames[i]) return True;
4330 return False;
4333 /* Dec the ref of this RCEC. */
4334 static void ctxt__rcdec ( RCEC* ec )
4336 stats__ctxt_rcdec_calls++;
4337 if (CHECK_CEM)
4338 tl_assert(ec && ec->magic == RCEC_MAGIC);
4339 tl_assert(ec->rc > 0);
4340 ec->rc--;
4341 if (ec->rc == 0)
4342 RCEC_referenced--;
4345 static void ctxt__rcinc ( RCEC* ec )
4347 if (CHECK_CEM)
4348 tl_assert(ec && ec->magic == RCEC_MAGIC);
4349 if (ec->rc == 0)
4350 RCEC_referenced++;
4351 ec->rc++;
4355 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
4356 move it one step closer to the front of the list, so as to make
4357 subsequent searches for it cheaper. */
4358 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
4360 RCEC *ec0, *ec1, *ec2;
4361 if (ec == *headp)
4362 tl_assert(0); /* already at head of list */
4363 tl_assert(ec != NULL);
4364 ec0 = *headp;
4365 ec1 = NULL;
4366 ec2 = NULL;
4367 while (True) {
4368 if (ec0 == NULL || ec0 == ec) break;
4369 ec2 = ec1;
4370 ec1 = ec0;
4371 ec0 = ec0->next;
4373 tl_assert(ec0 == ec);
4374 if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
4375 RCEC* tmp;
4376 /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
4377 predecessor. Swap ec0 and ec1, that is, move ec0 one step
4378 closer to the start of the list. */
4379 tl_assert(ec2->next == ec1);
4380 tl_assert(ec1->next == ec0);
4381 tmp = ec0->next;
4382 ec2->next = ec0;
4383 ec0->next = ec1;
4384 ec1->next = tmp;
4386 else
4387 if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
4388 /* it's second in the list. */
4389 tl_assert(*headp == ec1);
4390 tl_assert(ec1->next == ec0);
4391 ec1->next = ec0->next;
4392 ec0->next = ec1;
4393 *headp = ec0;
4398 /* Find the given RCEC in the tree, and return a pointer to it. Or,
4399 if not present, add the given one to the tree (by making a copy of
4400 it, so the caller can immediately deallocate the original) and
4401 return a pointer to the copy. The caller can safely have 'example'
4402 on its stack, since we will always return a pointer to a copy of
4403 it, not to the original. Note that the inserted node will have .rc
4404 of zero and so the caller must immediately increment it. */
4405 __attribute__((noinline))
4406 static RCEC* ctxt__find_or_add ( RCEC* example )
4408 UWord hent;
4409 RCEC* copy;
4411 if (CHECK_CEM) {
4412 /* Note that the single caller of ctxt__find_or_add always provides
4413 &thr->cached_rcec as argument. The sanity of thr->cached_rcec is always
4414 checked when a thread terminates. */
4415 tl_assert(example && example->magic == RCEC_MAGIC);
4416 tl_assert(example->rc == 0);
4419 /* Search the hash table to see if we already have it. */
4420 stats__ctxt_tab_qs++;
4421 hent = example->frames_hash % N_RCEC_TAB;
4422 copy = contextTab[hent];
4423 while (1) {
4424 if (!copy) break;
4425 if (CHECK_CEM)
4426 tl_assert(copy->magic == RCEC_MAGIC);
4427 stats__ctxt_tab_cmps++;
4428 if (!RCEC__differs_by_frames(copy, example)) break;
4429 copy = copy->next;
4432 if (copy) {
4433 tl_assert(copy != example);
4434 /* optimisation: if it's not at the head of its list, move 1
4435 step fwds, to make future searches cheaper */
4436 if (copy != contextTab[hent]) {
4437 move_RCEC_one_step_forward( &contextTab[hent], copy );
4439 } else {
4440 copy = alloc_RCEC();
4441 tl_assert(copy != example);
4442 *copy = *example;
4443 for (Word i = 0; i < HG_(clo_history_backtrace_size); i++)
4444 copy->frames[i] = example->frames[i];
4445 copy->next = contextTab[hent];
4446 contextTab[hent] = copy;
4447 stats__ctxt_tab_curr++;
4448 if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4449 stats__ctxt_tab_max = stats__ctxt_tab_curr;
4451 return copy;
4454 static inline UWord ROLW ( UWord w, Int n )
4456 Int bpw = 8 * sizeof(UWord);
4457 w = (w << n) | (w >> (bpw-n));
4458 return w;
4461 static UWord stats__cached_rcec_identical = 0;
4462 static UWord stats__cached_rcec_updated = 0;
4463 static UWord stats__cached_rcec_fresh = 0;
4464 static UWord stats__cached_rcec_diff = 0;
4465 static UWord stats__cached_rcec_diff_known_reason = 0;
4467 /* Check if the cached rcec in thr corresponds to the current
4468 stacktrace of the thread. Returns True if ok, False otherwise.
4469 This is just used for debugging the cached rcec logic, activated
4470 using --hg-sanity-flags=xx1xxx i.e. SCE_ACCESS flag.
4471 When this flag is activated, a call to this function will happen each time
4472 a stack trace is needed for a memory access. */
4473 __attribute__((noinline))
4474 static Bool check_cached_rcec_ok (Thr* thr, Addr previous_frame0)
4476 Bool ok = True;
4477 UInt i;
4478 UWord frames[HG_(clo_history_backtrace_size)];
4479 UWord sps[HG_(clo_history_backtrace_size)];
4480 UWord fps[HG_(clo_history_backtrace_size)];
4481 const DiEpoch cur_ep = VG_(current_DiEpoch)();
4483 for (i = 0; i < HG_(clo_history_backtrace_size); i++)
4484 frames[i] = sps[i] = fps[i] = 0;
4485 VG_(get_StackTrace)( thr->hgthread->coretid, &frames[0],
4486 HG_(clo_history_backtrace_size),
4487 &sps[0], &fps[0], 0);
4488 for (i = 0; i < HG_(clo_history_backtrace_size); i++) {
4489 if ( thr->cached_rcec.frames[i] != frames[i] ) {
4490 /* There are a bunch of "normal" reasons for which a stack
4491 derived from the cached rcec differs from frames. */
4492 const HChar *reason = NULL;
4494 /* Old linkers (e.g. RHEL5) gave no cfi unwind information in the PLT
4495 section (fix was added in binutils around June 2011).
4496 Without PLT unwind info, stacktrace in the PLT section are
4497 missing an entry. E.g. the cached stacktrace is:
4498 ==4463== at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4499 ==4463== by 0x33B7F9: __libc_thread_freeres
4500 (in /lib/libc-2.11.2.so)
4501 ==4463== by 0x39BA4F: start_thread (pthread_create.c:307)
4502 ==4463== by 0x2F107D: clone (clone.S:130)
4503 while the 'check stacktrace' is
4504 ==4463== at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4505 ==4463== by 0x33B82D: strerror_thread_freeres
4506 (in /lib/libc-2.11.2.so)
4507 ==4463== by 0x33B7F9: __libc_thread_freeres
4508 (in /lib/libc-2.11.2.so)
4509 ==4463== by 0x39BA4F: start_thread (pthread_create.c:307)
4510 ==4463== by 0x2F107D: clone (clone.S:130)
4511 No cheap/easy way to detect or fix that. */
4513 /* It seems that sometimes, the CFI unwind info looks wrong
4514 for a 'ret' instruction. E.g. here is the unwind info
4515 for a 'retq' on gcc20 (amd64, Debian 7)
4516 [0x4e3ddfe .. 0x4e3ddfe]: let cfa=oldSP+48 in RA=*(cfa+-8)
4517 SP=cfa+0 BP=*(cfa+-24)
4518 This unwind info looks doubtful, as the RA should be at oldSP.
4519 No easy way to detect this problem.
4520 This gives a difference between cached rcec and
4521 current stack trace: the cached rcec is correct. */
4523 /* When returning from main, unwind info becomes erratic.
4524 So, by default, only report errors for main and above,
4525 unless asked to show below main. */
4526 if (reason == NULL) {
4527 UInt fr_main;
4528 Vg_FnNameKind fr_kind = Vg_FnNameNormal;
4529 for (fr_main = 0;
4530 fr_main < HG_(clo_history_backtrace_size);
4531 fr_main++) {
4532 fr_kind = VG_(get_fnname_kind_from_IP)
4533 (cur_ep, frames[fr_main]);
4534 if (fr_kind == Vg_FnNameMain || fr_kind == Vg_FnNameBelowMain)
4535 break;
4537 UInt kh_main;
4538 Vg_FnNameKind kh_kind = Vg_FnNameNormal;
4539 for (kh_main = 0;
4540 kh_main < HG_(clo_history_backtrace_size);
4541 kh_main++) {
4542 kh_kind = VG_(get_fnname_kind_from_IP)
4543 (cur_ep, thr->cached_rcec.frames[kh_main]);
4544 if (kh_kind == Vg_FnNameMain || kh_kind == Vg_FnNameBelowMain)
4545 break;
4547 if (kh_main == fr_main
4548 && kh_kind == fr_kind
4549 && (kh_main < i || (kh_main == i
4550 && kh_kind == Vg_FnNameBelowMain))) {
4551 // found main or below main before the difference
4552 reason = "Below main";
4556 /* We have places where the stack is missing some internal
4557 pthread functions. For such stacktraces, GDB reports only
4558 one function, telling:
4559 #0 0xf7fa81fe in _L_unlock_669 ()
4560 from /lib/i386-linux-gnu/libpthread.so.0
4561 Backtrace stopped: previous frame identical to
4562 this frame (corrupt stack?)
4564 This is when sps and fps are identical.
4565 The cached stack trace is then
4566 ==3336== at 0x40641FE: _L_unlock_669
4567 (pthread_mutex_unlock.c:310)
4568 ==3336== by 0x40302BE: pthread_mutex_unlock
4569 (hg_intercepts.c:710)
4570 ==3336== by 0x80486AF: main (cond_timedwait_test.c:14)
4571 while the 'check stacktrace' is
4572 ==3336== at 0x40641FE: _L_unlock_669
4573 (pthread_mutex_unlock.c:310)
4574 ==3336== by 0x4064206: _L_unlock_669
4575 (pthread_mutex_unlock.c:310)
4576 ==3336== by 0x4064132: __pthread_mutex_unlock_usercnt
4577 (pthread_mutex_unlock.c:57)
4578 ==3336== by 0x40302BE: pthread_mutex_unlock
4579 (hg_intercepts.c:710)
4580 ==3336== by 0x80486AF: main (cond_timedwait_test.c:14) */
4581 if (reason == NULL) {
4582 if ((i > 0
4583 && sps[i] == sps[i-1] && fps[i] == fps[i-1])
4584 || (i < HG_(clo_history_backtrace_size)-1
4585 && sps[i] == sps[i+1] && fps[i] == fps[i+1])) {
4586 reason = "previous||next frame: identical sp and fp";
4589 if (reason == NULL) {
4590 if ((i > 0
4591 && fps[i] == fps[i-1])
4592 || (i < HG_(clo_history_backtrace_size)-1
4593 && fps[i] == fps[i+1])) {
4594 reason = "previous||next frame: identical fp";
4598 /* When we have a read or write 'in the middle of a push instruction',
4599 then the normal backtrace is not very good, while the helgrind
4600 stacktrace is better, as it undoes the not yet fully finished
4601 push instruction before getting the stacktrace. */
4602 if (reason == NULL && thr->hgthread->first_sp_delta != 0) {
4603 reason = "fixupSP probably needed for check stacktrace";
4606 /* Unwinding becomes hectic when running the exit handlers.
4607 None of GDB, cached stacktrace and check stacktrace corresponds.
4608 So, if we find __run_exit_handlers, ignore the difference. */
4609 if (reason == NULL) {
4610 const HChar *fnname;
4611 for (UInt f = 0; f < HG_(clo_history_backtrace_size); f++) {
4612 if (VG_(get_fnname)( cur_ep, frames[f], &fnname)
4613 && VG_(strcmp) ("__run_exit_handlers", fnname) == 0) {
4614 reason = "exit handlers";
4615 break;
4620 // Show what we have found for this difference
4621 if (reason == NULL) {
4622 ok = False;
4623 stats__cached_rcec_diff++;
4624 } else {
4625 ok = True;
4626 stats__cached_rcec_diff_known_reason++;
4628 if (!ok || VG_(clo_verbosity) > 2) {
4629 Bool save_show_below_main = VG_(clo_show_below_main);
4630 VG_(clo_show_below_main) = True;
4631 /* The below error msg reports an unexpected diff in 'frame %d'.
4632 The (maybe wrong) pc found in the cached stacktrace is
4633 'cached_pc %p' while an unwind gives the (maybe wrong)
4634 'check_pc %p'.
4635 After, 'previous_frame0 %p' tells where the cached stacktrace
4636 was taken.
4637 This is then followed by the full resulting cache stack trace
4638 and the full stack trace found doing unwind.
4639 Such a diff can have various origins:
4640 * a bug in the unwinder, when the cached stack trace was taken
4641 at 'previous_frame0'
4642 * a bug in the unwinder, when the check stack trace was taken
4643 (i.e. at current pc).
4644 * a missing 'invalidate cache stack trace' somewhere in the
4645 instructions between 'previous_frame0' and current_pc.
4646 To investigate the last case, typically disassemble the range of
4647 instructions where an 'invalidate cached stack trace' might be missing. */
4648 VG_(printf)("%s diff tid %u frame %u "
4649 "cached_pc %p check_pc %p\n",
4650 reason ? reason : "unexpected",
4651 thr->hgthread->coretid,
4653 (void*)thr->cached_rcec.frames[i],
4654 (void*)frames[i]);
4655 VG_(printf)("cached stack trace previous_frame0 %p\n",
4656 (void*)previous_frame0);
4657 VG_(pp_StackTrace)(cur_ep, &previous_frame0, 1);
4658 VG_(printf)("resulting cached stack trace:\n");
4659 VG_(pp_StackTrace)(cur_ep, thr->cached_rcec.frames,
4660 HG_(clo_history_backtrace_size));
4661 VG_(printf)("check stack trace:\n");
4662 VG_(pp_StackTrace)(cur_ep, frames, HG_(clo_history_backtrace_size));
4664 VG_(show_sched_status) (False, // host_stacktrace
4665 False, // stack_usage
4666 False); // exited_threads
4667 if (VG_(clo_vgdb_error) == 1234567890) // HACK TO ALLOW TO DEBUG
4668 VG_(gdbserver) ( thr->hgthread->coretid );
4669 VG_(clo_show_below_main) = save_show_below_main;
4671 break; // Stop giving more errors for this stacktrace.
4674 return ok;
4677 __attribute__((noinline))
4678 static RCEC* get_RCEC ( Thr* thr )
4680 UInt i;
4681 UWord hash;
4682 Addr previous_frame0 = 0; // Assignment needed to silence gcc
4683 RCEC *res;
4684 const Bool thr_cached_rcec_valid = cached_rcec_valid(thr);
4685 const Addr cur_ip = VG_(get_IP)(thr->hgthread->coretid);
4687 if (DEBUG_CACHED_RCEC)
4688 VG_(printf)("get rcec tid %u at IP %p SP %p"
4689 " first_sp_delta %ld cached valid %d\n",
4690 thr->hgthread->coretid,
4691 (void*)cur_ip,
4692 (void*)VG_(get_SP)(thr->hgthread->coretid),
4693 thr->hgthread->first_sp_delta, thr_cached_rcec_valid);
4695 /* If we have a valid cached rcec, derive the new rcec from the cached one
4696 and update the cached one.
4697 Otherwise, compute a fresh rcec. */
4699 if (thr_cached_rcec_valid) {
4700 /* Update the stacktrace of the cached rcec with the current IP */
4701 previous_frame0 = thr->cached_rcec.frames[0];
4702 thr->cached_rcec.frames[0] = cur_ip;
4704 # if defined(VGP_x86_linux)
4705 // See m_stacktrace.c kludge
4706 extern Addr VG_(client__dl_sysinfo_int80);
4707 /// #include pub_core_clientstate needed for the above ????
4708 /// or move the above into a pub_tool_??? tool_stacktrace.h maybe ????
4709 if (VG_(client__dl_sysinfo_int80) != 0 /* we know its address */
4710 && cur_ip >= VG_(client__dl_sysinfo_int80)
4711 && cur_ip < VG_(client__dl_sysinfo_int80)+3
4713 thr->cached_rcec.frames[0]
4714 = (ULong) *(Addr*)(UWord)VG_(get_SP)(thr->hgthread->coretid);
4716 # endif
4718 if (previous_frame0 == thr->cached_rcec.frames[0])
4719 stats__cached_rcec_identical++;
4720 else
4721 stats__cached_rcec_updated++;
4722 } else {
4723 /* Compute a fresh stacktrace. */
4724 main_get_stacktrace( thr, &thr->cached_rcec.frames[0],
4725 HG_(clo_history_backtrace_size) );
4726 if (DEBUG_CACHED_RCEC) {
4727 Bool save_show_below_main = VG_(clo_show_below_main);
4728 VG_(clo_show_below_main) = True;
4729 VG_(printf)("caching stack trace:\n");
4730 VG_(pp_StackTrace)(VG_(current_DiEpoch)(),
4731 &thr->cached_rcec.frames[0],
4732 HG_(clo_history_backtrace_size));
4733 VG_(clo_show_below_main) = save_show_below_main;
4735 stats__cached_rcec_fresh++;
4738 hash = 0;
4739 for (i = 0; i < HG_(clo_history_backtrace_size); i++) {
4740 hash ^= thr->cached_rcec.frames[i];
4741 hash = ROLW(hash, 19);
4743 thr->cached_rcec.frames_hash = hash;
4744 res = ctxt__find_or_add( &thr->cached_rcec );
4746 if (UNLIKELY(HG_(clo_sanity_flags) & SCE_ACCESS)
4747 && thr_cached_rcec_valid) {
4748 /* In case the cached and check differ, invalidate the cached rcec.
4749 Fewer duplicated diffs are then reported afterwards. */
4750 if (!check_cached_rcec_ok (thr, previous_frame0))
4751 set_cached_rcec_validity(thr, False);
4752 } else {
4753 if (HG_(clo_delta_stacktrace) && !thr_cached_rcec_valid)
4754 set_cached_rcec_validity(thr, True);
4757 return res;
4760 ///////////////////////////////////////////////////////
4761 //// Part (2):
4762 /// A hashtable guest-addr -> OldRef, that refers to (1)
4763 /// Note: we use the guest address as key. This means that the entries
4764 /// for multiple threads accessing the same address will land in the same
4765 /// bucket. It might be nice to have a better distribution of the
4766 /// OldRef in the hashtable by using as key the guest address ^ tsw.
4767 /// The problem is that when a race is reported on a ga, we need to retrieve
4768 /// efficiently the accesses to ga by other threads, only using the ga.
4769 /// Measurements on firefox have shown that the chain length is reasonable.
4771 /* Records an access: a thread, a context (size & writeness) and the
4772 number of held locks. The size (1,2,4,8) is stored as is in szB.
4773 Note that szB uses more bits than needed to store a size up to 8.
4774 This allows a TSW to be used as a fully initialised UInt, e.g. in
4775 cmp_oldref_tsw. If needed, a more compact representation of szB
4776 is possible (e.g. use only 4 bits, or use only 2 bits and encode the
4777 size (1,2,4,8) as 00 = 1, 01 = 2, 10 = 4, 11 = 8). */
4778 typedef
4779 struct {
4780 UInt thrid : SCALARTS_N_THRBITS;
4781 UInt szB : 32 - SCALARTS_N_THRBITS - 1;
4782 UInt isW : 1;
4783 } TSW; // Thread+Size+Writeness
4784 typedef
4785 struct {
4786 TSW tsw;
4787 WordSetID locksHeldW;
4788 RCEC* rcec;
4790 Thr_n_RCEC;
4792 typedef
4793 struct OldRef {
4794 struct OldRef *ht_next; // to link hash table nodes together.
4795 UWord ga; // hash_table key, == address for which we record an access.
4796 struct OldRef *prev; // to refs older than this one
4797 struct OldRef *next; // to refs newer that this one
4798 UWord stamp; // allows to order (by time of access) 2 OldRef
4799 Thr_n_RCEC acc;
4801 OldRef;
4803 /* Returns the or->tsw as a UInt */
4804 static inline UInt oldref_tsw (const OldRef* or)
4806 return *(const UInt*)(&or->acc.tsw);
4809 /* Compare the tsw component for 2 OldRef.
4810 Used for OldRef hashtable (which already verifies equality of the
4811 'key' part). */
4812 static Word cmp_oldref_tsw (const void* node1, const void* node2 )
4814 const UInt tsw1 = oldref_tsw(node1);
4815 const UInt tsw2 = oldref_tsw(node2);
4817 if (tsw1 < tsw2) return -1;
4818 if (tsw1 > tsw2) return 1;
4819 return 0;
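/* Sketch: TSW is sized so that it occupies exactly one UInt
   (SCALARTS_N_THRBITS + (32 - SCALARTS_N_THRBITS - 1) + 1 == 32 bits),
   which is what lets oldref_tsw/cmp_oldref_tsw compare the whole
   (thread, size, writeness) triple with a single integer compare: */
#if 0
static void example_tsw_compare ( ThrID thrid )
{
   STATIC_ASSERT( sizeof(TSW) == sizeof(UInt) );
   TSW t1 = (TSW){ .thrid = thrid, .szB = 4, .isW = 1 };
   TSW t2 = (TSW){ .thrid = thrid, .szB = 4, .isW = 1 };
   tl_assert( *(UInt*)&t1 == *(UInt*)&t2 ); /* equal fields, equal UInt */
}
#endif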
4823 //////////// BEGIN OldRef pool allocator
4824 static PoolAlloc* oldref_pool_allocator;
4825 // Note: We only allocate elements in this pool allocator, we never free them.
4826 // We stop allocating elements at VG_(clo_conflict_cache_size).
4827 //////////// END OldRef pool allocator
4829 static OldRef mru;
4830 static OldRef lru;
4831 // A doubly linked list, chaining all OldRef in mru/lru order.
4832 // mru/lru are sentinel nodes.
4833 // Whenever an oldref is re-used, its position is changed as the most recently
4834 // used (i.e. pointed to by mru.prev).
4835 // When a new oldref is needed, it is allocated from the pool
4836 // if we have not yet reached --conflict-cache-size.
4837 // Otherwise, if all oldref have already been allocated,
4838 // the least recently used (i.e. pointed to by lru.next) is re-used.
4839 // When an OldRef is used, it is moved as the most recently used entry
4840 // (i.e. pointed to by mru.prev).
4842 // Removes r from the double linked list
4843 // Note: we do not need to test for special cases such as
4844 // NULL next or prev pointers, because we have sentinel nodes
4845 // at both sides of the list. So, a node is always forward and
4846 // backward linked.
4847 static inline void OldRef_unchain(OldRef *r)
4849 r->next->prev = r->prev;
4850 r->prev->next = r->next;
4853 // Insert new as the newest OldRef
4854 // Similarly to OldRef_unchain, no need to test for NULL
4855 // pointers, as e.g. mru.prev is always guaranteed to point
4856 // to a non NULL node (lru when the list is empty).
4857 static inline void OldRef_newest(OldRef *new)
4859 new->next = &mru;
4860 new->prev = mru.prev;
4861 mru.prev = new;
4862 new->prev->next = new;
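/* Sketch of the 'touch' operation performed by event_map_bind below
   when an existing OldRef is hit again: unchain it from wherever it
   currently sits and re-insert it at the MRU end. Thanks to the
   sentinel nodes, no NULL checks are needed. */
#if 0
static inline void example_OldRef_touch ( OldRef* r )
{
   OldRef_unchain( r );
   OldRef_newest( r );
}
#endif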
4866 static VgHashTable* oldrefHT = NULL; /* Hash table* OldRef* */
4867 static UWord oldrefHTN = 0; /* # elems in oldrefHT */
4868 /* Note: the nr of refs in the oldrefHT will always be equal to
4869 the nr of elements that were allocated from the OldRef pool allocator,
4870 as we never free an OldRef: we just re-use them. */
4873 /* Allocates a new OldRef, or re-uses the lru one if all allowed OldRefs
4874 have already been allocated. */
4875 static OldRef* alloc_or_reuse_OldRef ( void )
4877 if (oldrefHTN < HG_(clo_conflict_cache_size)) {
4878 oldrefHTN++;
4879 return VG_(allocEltPA) ( oldref_pool_allocator );
4880 } else {
4881 OldRef *oldref_ht;
4882 OldRef *oldref = lru.next;
4884 OldRef_unchain(oldref);
4885 oldref_ht = VG_(HT_gen_remove) (oldrefHT, oldref, cmp_oldref_tsw);
4886 tl_assert (oldref == oldref_ht);
4887 ctxt__rcdec( oldref->acc.rcec );
4888 return oldref;
4893 inline static UInt min_UInt ( UInt a, UInt b ) {
4894 return a < b ? a : b;
4897 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2). Return -1 if the
4898 first interval is lower, 1 if the first interval is higher, and 0
4899 if there is any overlap. Redundant paranoia with casting is there
4900 following what looked distinctly like a bug in gcc-4.1.2, in which
4901 some of the comparisons were done signedly instead of
4902 unsignedly. */
4903 /* Copied from exp-ptrcheck/sg_main.c */
4904 static inline Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4905 Addr a2, SizeT n2 ) {
4906 UWord a1w = (UWord)a1;
4907 UWord n1w = (UWord)n1;
4908 UWord a2w = (UWord)a2;
4909 UWord n2w = (UWord)n2;
4910 tl_assert(n1w > 0 && n2w > 0);
4911 if (a1w + n1w <= a2w) return -1L;
4912 if (a2w + n2w <= a1w) return 1L;
4913 return 0;
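/* Worked examples for cmp_nonempty_intervals (sketch only): */
#if 0
static void example_interval_cmp ( void )
{
   tl_assert( cmp_nonempty_intervals( 0x1000, 4, 0x1004, 4 ) == -1 );
   tl_assert( cmp_nonempty_intervals( 0x1004, 4, 0x1000, 4 ) ==  1 );
   /* [0x1000,0x1004) and [0x1003,0x1005) share byte 0x1003: overlap */
   tl_assert( cmp_nonempty_intervals( 0x1000, 4, 0x1003, 2 ) ==  0 );
}
#endif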
4916 static UWord event_map_stamp = 0; // Used to stamp each OldRef when touched.
4918 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4920 OldRef example;
4921 OldRef* ref;
4922 RCEC* rcec;
4924 tl_assert(thr);
4925 ThrID thrid = thr->thrid;
4926 tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4928 WordSetID locksHeldW = thr->hgthread->locksetW;
4930 rcec = get_RCEC( thr );
4932 /* Look in the oldrefHT to see if we already have a record for this
4933 address/thr/sz/isW. */
4934 example.ga = a;
4935 example.acc.tsw = (TSW) {.thrid = thrid,
4936 .szB = szB,
4937 .isW = (UInt)(isW & 1)};
4938 ref = VG_(HT_gen_lookup) (oldrefHT, &example, cmp_oldref_tsw);
4940 if (ref) {
4941 /* We already have a record for this address and this (thrid, R/W,
4942 size) triple. */
4943 tl_assert (ref->ga == a);
4945 /* thread 'thr' has an entry. Update its RCEC, if it differs. */
4946 if (rcec == ref->acc.rcec)
4947 stats__ctxt_eq_tsw_eq_rcec++;
4948 else {
4949 stats__ctxt_eq_tsw_neq_rcec++;
4950 ctxt__rcdec( ref->acc.rcec );
4951 ctxt__rcinc(rcec);
4952 ref->acc.rcec = rcec;
4954 tl_assert(ref->acc.tsw.thrid == thrid);
4955 /* Update the stamp, RCEC and the W-held lockset. */
4956 ref->stamp = event_map_stamp;
4957 ref->acc.locksHeldW = locksHeldW;
4959 OldRef_unchain(ref);
4960 OldRef_newest(ref);
4962 } else {
4963 tl_assert (szB == 4 || szB == 8 || szB == 1 || szB == 2);
4964 // We only need to check the size the first time we insert a ref.
4965 // Check for most frequent cases first
4966 // Note: we could support a szB up to 1 << (32 - SCALARTS_N_THRBITS - 1)
4968 /* We don't have a record for this address+triple. Create a new one. */
4969 stats__ctxt_neq_tsw_neq_rcec++;
4970 ref = alloc_or_reuse_OldRef();
4971 ref->ga = a;
4972 ref->acc.tsw = (TSW) {.thrid = thrid,
4973 .szB = szB,
4974 .isW = (UInt)(isW & 1)};
4975 ref->stamp = event_map_stamp;
4976 ref->acc.locksHeldW = locksHeldW;
4977 ref->acc.rcec = rcec;
4978 ctxt__rcinc(rcec);
4980 VG_(HT_add_node) ( oldrefHT, ref );
4981 OldRef_newest (ref);
4983 event_map_stamp++;
4987 /* Extract info from the conflicting-access machinery.
4988 Returns the most recent conflicting access with thr/[a, a+szB[/isW. */
4989 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4990 /*OUT*/Thr** resThr,
4991 /*OUT*/SizeT* resSzB,
4992 /*OUT*/Bool* resIsW,
4993 /*OUT*/WordSetID* locksHeldW,
4994 Thr* thr, Addr a, SizeT szB, Bool isW )
4996 Word i, j;
4997 OldRef *ref = NULL;
4998 SizeT ref_szB = 0;
5000 OldRef *cand_ref;
5001 SizeT cand_ref_szB;
5002 Addr cand_a;
5004 Addr toCheck[15];
5005 Int nToCheck = 0;
5007 tl_assert(thr);
5008 tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
5010 ThrID thrid = thr->thrid;
5012 toCheck[nToCheck++] = a;
5013 for (i = -7; i < (Word)szB; i++) {
5014 if (i != 0)
5015 toCheck[nToCheck++] = a + i;
5017 tl_assert(nToCheck <= 15);
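/* Example (illustrative numbers): for a == 0x1000 and szB == 4,
   toCheck holds 0x1000, then 0x0FF9 .. 0x0FFF and 0x1001 .. 0x1003.
   Since recorded accesses are at most 8 bytes long, any OldRef
   overlapping [a, a+szB) must have its ga in [a-7, a+szB-1], so the
   15-entry array always suffices. */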
5019 /* Now see if we can find a suitable matching event for
5020 any of the addresses in toCheck[0 .. nToCheck-1]. */
5021 for (j = 0; j < nToCheck; j++) {
5023 cand_a = toCheck[j];
5024 // VG_(printf)("test %ld %p\n", j, cand_a);
5026 /* Find the first HT element for this address.
5027 We might have several of these. They will be linked via ht_next.
5028 We however need to check various elements as the list contains
5029 all elements that map to the same bucket. */
5030 for (cand_ref = VG_(HT_lookup)( oldrefHT, cand_a );
5031 cand_ref; cand_ref = cand_ref->ht_next) {
5032 if (cand_ref->ga != cand_a)
5033 /* OldRef for another address in this HT bucket. Ignore. */
5034 continue;
5036 if (cand_ref->acc.tsw.thrid == thrid)
5037 /* This is an access by the same thread, but we're only
5038 interested in accesses from other threads. Ignore. */
5039 continue;
5041 if ((!cand_ref->acc.tsw.isW) && (!isW))
5042 /* We don't want to report a read racing against another
5043 read; that's stupid. So in this case move on. */
5044 continue;
5046 cand_ref_szB = cand_ref->acc.tsw.szB;
5047 if (cmp_nonempty_intervals(a, szB, cand_a, cand_ref_szB) != 0)
5048 /* No overlap with the access we're asking about. Ignore. */
5049 continue;
5051 /* We have a match. Keep this match if it is newer than
5052 the previous match. Note that stamps are unsigned words, and
5053 for long-running applications, event_map_stamp might have wrapped around.
5054 So 'roll' each stamp using event_map_stamp to keep the stamps
5055 in the right order even if event_map_stamp has wrapped. */
5056 if (!ref
5057 || (ref->stamp - event_map_stamp)
5058 < (cand_ref->stamp - event_map_stamp)) {
5059 ref = cand_ref;
5060 ref_szB = cand_ref_szB;
5064 if (ref) {
5065 /* return with success */
5066 Int n, maxNFrames;
5067 RCEC* ref_rcec = ref->acc.rcec;
5068 tl_assert(ref->acc.tsw.thrid);
5069 tl_assert(ref_rcec);
5070 tl_assert(ref_rcec->magic == RCEC_MAGIC);
5071 tl_assert(ref_szB >= 1);
5072 /* Count how many non-zero frames we have. */
5073 maxNFrames = min_UInt(HG_(clo_history_backtrace_size),
5074 VG_(clo_backtrace_size));
5075 for (n = 0; n < maxNFrames; n++) {
5076 if (0 == ref_rcec->frames[n]) break;
5078 *resEC = VG_(make_ExeContext_from_StackTrace)(&ref_rcec->frames[0],
5080 *resThr = Thr__from_ThrID(ref->acc.tsw.thrid);
5081 *resSzB = ref_szB;
5082 *resIsW = ref->acc.tsw.isW;
5083 *locksHeldW = ref->acc.locksHeldW;
5084 stats__evm__lookup_found++;
5085 return True;
5088 /* consider next address in toCheck[] */
5089 } /* for (j = 0; j < nToCheck; j++) */
5091 /* really didn't find anything. */
5092 stats__evm__lookup_notfound++;
5093 return False;
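/* Why the 'rolled' stamp comparison above is wrap-safe (sketch with
   made-up numbers): event_map_stamp is the value that the *next*
   access will receive, so for any live OldRef the unsigned difference
   (stamp - event_map_stamp) is effectively minus its age, modulo the
   word size. A larger rolled value therefore means a smaller age,
   i.e. a more recent access, even across a counter wrap. */
#if 0
static void example_rolled_stamps ( void )
{
   UWord now       = 5;          /* counter shortly after wrapping   */
   UWord pre_wrap  = (UWord)-3;  /* stamped 8 accesses before 'now'  */
   UWord post_wrap = 2;          /* stamped 3 accesses before 'now'  */
   tl_assert( (pre_wrap - now) < (post_wrap - now) ); /* older < newer */
}
#endif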
5097 void libhb_event_map_access_history ( Addr a, SizeT szB, Access_t fn )
5099 OldRef *ref = lru.next;
5100 SizeT ref_szB;
5101 Int n;
5103 while (ref != &mru) {
5104 ref_szB = ref->acc.tsw.szB;
5105 if (cmp_nonempty_intervals(a, szB, ref->ga, ref_szB) == 0) {
5106 RCEC* ref_rcec = ref->acc.rcec;
5107 for (n = 0; n < HG_(clo_history_backtrace_size); n++) {
5108 if (0 == ref_rcec->frames[n]) {
5109 break;
5112 (*fn)(&ref_rcec->frames[0], n,
5113 Thr__from_ThrID(ref->acc.tsw.thrid),
5114 ref->ga,
5115 ref_szB,
5116 ref->acc.tsw.isW,
5117 ref->acc.locksHeldW);
5119 tl_assert (ref->next == &mru
5120 || ((ref->stamp - event_map_stamp)
5121 < ref->next->stamp - event_map_stamp));
5122 ref = ref->next;
5126 static void event_map_init ( void )
5128 Word i;
5130 /* Context (RCEC) pool allocator */
5131 rcec_pool_allocator
5132 = VG_(newPA) (
5133 sizeof(RCEC) + 2 * HG_(clo_history_backtrace_size) * sizeof(UWord),
5134 1000 /* RCECs per pool */,
5135 HG_(zalloc),
5136 "libhb.event_map_init.1 (RCEC pools)",
5137 HG_(free)
5140 /* Context table */
5141 tl_assert(!contextTab);
5142 contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
5143 N_RCEC_TAB * sizeof(RCEC*) );
5144 for (i = 0; i < N_RCEC_TAB; i++)
5145 contextTab[i] = NULL;
5147 /* Oldref pool allocator */
5148 oldref_pool_allocator = VG_(newPA)(
5149 sizeof(OldRef),
5150 1000 /* OldRefs per pool */,
5151 HG_(zalloc),
5152 "libhb.event_map_init.3 (OldRef pools)",
5153 HG_(free)
5156 /* Oldref hashtable */
5157 tl_assert(!oldrefHT);
5158 oldrefHT = VG_(HT_construct) ("libhb.event_map_init.4 (oldref hashtable)");
5160 oldrefHTN = 0;
5161 mru.prev = &lru;
5162 mru.next = NULL;
5163 lru.prev = NULL;
5164 lru.next = &mru;
5165 mru.acc = (Thr_n_RCEC) {.tsw = {.thrid = 0,
5166 .szB = 0,
5167 .isW = 0},
5168 .locksHeldW = 0,
5169 .rcec = NULL};
5170 lru.acc = mru.acc;
5173 static void event_map__check_reference_counts ( void )
5175 RCEC* rcec;
5176 OldRef* oldref;
5177 Word i;
5178 UWord nEnts = 0;
5180 /* Set the 'check' reference counts to zero. Also, optionally
5181 check that the real reference counts are non-zero. We allow
5182 these to fall to zero before a GC, but the GC must get rid of
5183 all those that are zero, hence none should be zero after a
5184 GC. */
5185 for (i = 0; i < N_RCEC_TAB; i++) {
5186 for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5187 nEnts++;
5188 tl_assert(rcec);
5189 tl_assert(rcec->magic == RCEC_MAGIC);
5190 rcec->rcX = 0;
5194 /* check that the stats are sane */
5195 tl_assert(nEnts == stats__ctxt_tab_curr);
5196 tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
5198 /* visit all the referencing points, inc check ref counts */
5199 VG_(HT_ResetIter)( oldrefHT );
5200 oldref = VG_(HT_Next)( oldrefHT );
5201 while (oldref) {
5202 tl_assert (oldref->acc.tsw.thrid);
5203 tl_assert (oldref->acc.rcec);
5204 tl_assert (oldref->acc.rcec->magic == RCEC_MAGIC);
5205 oldref->acc.rcec->rcX++;
5206 oldref = VG_(HT_Next)( oldrefHT );
5209 /* compare check ref counts with actual */
5210 for (i = 0; i < N_RCEC_TAB; i++) {
5211 for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5212 tl_assert(rcec->rc == rcec->rcX);
5217 __attribute__((noinline))
5218 static void do_RCEC_GC ( void )
5220 UInt i;
5222 if (VG_(clo_stats)) {
5223 static UInt ctr = 1;
5224 VG_(message)(Vg_DebugMsg,
5225 "libhb: RCEC GC: #%u %lu slots,"
5226 " %lu cur ents(ref'd %lu),"
5227 " %lu max ents\n",
5228 ctr++,
5229 (UWord)N_RCEC_TAB,
5230 stats__ctxt_tab_curr, RCEC_referenced,
5231 stats__ctxt_tab_max );
5233 tl_assert (stats__ctxt_tab_curr > RCEC_referenced);
5235 /* Throw away all RCECs with zero reference counts */
5236 for (i = 0; i < N_RCEC_TAB; i++) {
5237 RCEC** pp = &contextTab[i];
5238 RCEC* p = *pp;
5239 while (p) {
5240 if (p->rc == 0) {
5241 *pp = p->next;
5242 free_RCEC(p);
5243 p = *pp;
5244 tl_assert(stats__ctxt_tab_curr > 0);
5245 stats__ctxt_rcec_gc_discards++;
5246 stats__ctxt_tab_curr--;
5247 } else {
5248 pp = &p->next;
5249 p = p->next;
5254 tl_assert (stats__ctxt_tab_curr == RCEC_referenced);
5257 /////////////////////////////////////////////////////////
5258 // //
5259 // Core MSM //
5260 // //
5261 /////////////////////////////////////////////////////////
5263 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
5264 Nov 08, and again after [...],
5265 June 09. */
5267 static ULong stats__msmcread = 0;
5268 static ULong stats__msmcread_change = 0;
5269 static ULong stats__msmcwrite = 0;
5270 static ULong stats__msmcwrite_change = 0;
5272 /* Some notes on the H1 history mechanism:
5274 Transition rules are:
5276 read_{Kr,Kw}(Cr,Cw) = (Cr, Cr `join` Kw)
5277 write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
5279 After any access by a thread T to a location L, L's constraint pair
5280 (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
5282 After a race by thread T conflicting with some previous access by
5283 some other thread U, for a location with constraint (before
5284 processing the later access) (Cr,Cw), then Cw[U] is the segment in
5285 which the previous access lies.
5287 Hence in record_race_info, we pass in Cfailed and Kfailed, which
5288 are compared so as to find out which thread(s) this access
5289 conflicts with. Once that is established, we also require the
5290 pre-update Cw for the location, so we can index into it for those
5291 threads, to get the scalar clock values for the point at which the
5292 former accesses were made. (In fact we only bother to do any of
5293 this for an arbitrarily chosen one of the conflicting threads:
5294 that is simpler, it avoids flooding the user with vast amounts of
5295 mostly useless information, and the program is wrong if it
5296 contains any races at all -- so we don't really need to show every
5297 conflicting access pair up front, so long as we show at least one
5298 whenever any exist.)
5302 The claim above, that after any access by a thread T we have Cw[T] == T's Kw[T], requires the auxiliary proof that
5304 (Cr `join` Kw)[T] == Kw[T]
5306 Why should that be true? Because for any thread T, Kw[T] >= the
5307 scalar clock value for T known by any other thread. In other
5308 words, because T's value for its own scalar clock is at least as up
5309 to date as the value for it known by any other thread (that is true
5310 for both the R- and W- scalar clocks). Hence no other thread will
5311 be able to feed in a value for that element (indirectly via a
5312 constraint) which will exceed Kw[T], and hence the join cannot
5313 cause that particular element to advance.
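/* Editor's note -- a worked instance of the rules above (a sketch,
   not from the original sources).  Suppose location L has constraint
   (Cr,Cw) = ({T1:3}, {T1:3}) and thread T2 has Kr = Kw = {T1:1, T2:8}.

      read  by T2:  (Cr,Cw) := (Cr, Cr `join` Kw)
                             = ({T1:3}, {T1:3, T2:8})
      write by T2:  (Cr,Cw) := (Cr `join` Kw, Cr `join` Kw)
                             = ({T1:3, T2:8}, {T1:3, T2:8})

   In both cases Cw[T2] == 8 == Kw[T2], which is the "Cw[T] == T's
   Kw[T]" property stated above; the auxiliary proof that
   (Cr `join` Kw)[T] == Kw[T] is exactly what makes this hold in
   general. */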
5316 __attribute__((noinline))
5317 static void record_race_info ( Thr* acc_thr,
5318 Addr acc_addr, SizeT szB, Bool isWrite,
5319 VtsID Cfailed,
5320 VtsID Kfailed,
5321 VtsID Cw )
5323 /* Call here to report a race. We just hand it onwards to
5324 HG_(record_error_Race). If that in turn discovers that the
5325 error is going to be collected, then, at history_level 2, that
5326 queries the conflicting-event map. The alternative would be to
5327 query it right here. But that causes a lot of pointless queries
5328 for errors which will shortly be discarded as duplicates, and
5329 can become a performance overhead; so we defer the query until
5330 we know the error is not a duplicate. */
5332 /* Stacks for the bounds of the (or one of the) conflicting
5333 segment(s). These are only set at history_level 1. */
5334 ExeContext* hist1_seg_start = NULL;
5335 ExeContext* hist1_seg_end = NULL;
5336 Thread* hist1_conf_thr = NULL;
5338 tl_assert(acc_thr);
5339 tl_assert(acc_thr->hgthread);
5340 tl_assert(acc_thr->hgthread->hbthr == acc_thr);
5341 tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
5343 if (HG_(clo_history_level) == 1) {
5344 Bool found;
5345 Word firstIx, lastIx;
5346 ULong_n_EC key;
5348 /* At history_level 1, we must round up the relevant stack-pair
5349 for the conflicting segment right now. This is because
5350 deferring it is complex; we can't (easily) put Kfailed and
5351 Cfailed into the XError and wait for later without
5352 getting tied up in difficulties with VtsID reference
5353 counting. So just do it now. */
5354 Thr* confThr;
5355 ULong confTym = 0;
5356 /* Which thread are we in conflict with? There may be more than
5357 one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
5358 (in fact it's the one with the lowest Thr* value). */
5359 confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
5360 /* This must exist, since if it were NULL then there's no
5361 conflict (semantics of return value of
5362 VtsID__findFirst_notLEQ), and msmc{read,write}, which has
5363 called us, just checked exactly this -- that there was in
5364 fact a race. */
5365 tl_assert(confThr);
5367 /* Get the scalar clock value that the conflicting thread
5368 introduced into the constraint. A careful examination of the
5369 base machine rules shows that this must be the same as the
5370 conflicting thread's scalar clock when it created this
5371 constraint. Hence we know the scalar clock of the
5372 conflicting thread when the conflicting access was made. */
5373 confTym = VtsID__indexAt( Cfailed, confThr );
5375 /* Using this scalar clock, index into the conflicting thread's
5376 collection of stack traces made each time its vector clock
5377 (hence its scalar clock) changed. This gives the stack
5378 traces at the start and end of the conflicting segment (well,
5379 as per comment just above, of one of the conflicting
5380 segments, if there are more than one). */
5381 key.ull = confTym;
5382 key.ec = NULL;
5383 /* tl_assert(confThr); -- asserted just above */
5384 tl_assert(confThr->local_Kws_n_stacks);
5385 firstIx = lastIx = 0;
5386 found = VG_(lookupXA_UNSAFE)(
5387 confThr->local_Kws_n_stacks,
5388 &key, &firstIx, &lastIx,
5389 (XACmpFn_t)cmp__ULong_n_EC__by_ULong
5391 if (0) VG_(printf)("record_race_info %u %u %u confThr %p "
5392 "confTym %llu found %d (%ld,%ld)\n",
5393 Cfailed, Kfailed, Cw,
5394 confThr, confTym, found, firstIx, lastIx);
5395 /* We can't indefinitely collect stack traces at VTS
5396 transitions, since we'd eventually run out of memory. Hence
5397 note_local_Kw_n_stack_for will eventually throw away old
5398 ones, which in turn means we might fail to find index value
5399 confTym in the array. */
5400 if (found) {
5401 ULong_n_EC *pair_start, *pair_end;
5402 pair_start
5403 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
5404 hist1_seg_start = pair_start->ec;
5405 if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
5406 pair_end
5407 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
5408 lastIx+1 );
5409 /* from properties of VG_(lookupXA) and the comparison fn used: */
5410 tl_assert(pair_start->ull < pair_end->ull);
5411 hist1_seg_end = pair_end->ec;
5412 /* Could do a bit better here. It may be that pair_end
5413 doesn't have a stack, but the following entries in the
5414 array have the same scalar Kw and do have a stack. So
5415 we should search a bit further along the array than
5416 lastIx+1 if hist1_seg_end is NULL. */
5417 } else {
5418 if (!confThr->llexit_done)
5419 hist1_seg_end = main_get_EC( confThr );
5421 // seg_start could be NULL iff this is the first stack in the thread
5422 //if (seg_start) VG_(pp_ExeContext)(seg_start);
5423 //if (seg_end) VG_(pp_ExeContext)(seg_end);
5424 hist1_conf_thr = confThr->hgthread;
5428 HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
5429 szB, isWrite,
5430 hist1_conf_thr, hist1_seg_start, hist1_seg_end );
5433 static Bool is_sane_SVal_C ( SVal sv ) {
5434 Bool leq;
5435 if (!SVal__isC(sv)) return True;
5436 leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
5437 return leq;
5441 /* Compute new state following a read */
5442 static inline SVal msmcread ( SVal svOld,
5443 /* The following are only needed for
5444 creating error reports. */
5445 Thr* acc_thr,
5446 Addr acc_addr, SizeT szB )
5448 SVal svNew = SVal_INVALID;
5449 stats__msmcread++;
5451 /* Redundant sanity check on the constraints */
5452 if (CHECK_MSM) {
5453 tl_assert(is_sane_SVal_C(svOld));
5456 if (LIKELY(SVal__isC(svOld))) {
5457 VtsID tviR = acc_thr->viR;
5458 VtsID tviW = acc_thr->viW;
5459 VtsID rmini = SVal__unC_Rmin(svOld);
5460 VtsID wmini = SVal__unC_Wmin(svOld);
5461 Bool leq = VtsID__cmpLEQ(rmini,tviR);
5462 if (LIKELY(leq)) {
5463 /* no race */
5464 /* Note: RWLOCK subtlety: use tviW, not tviR */
5465 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5466 goto out;
5467 } else {
5468 /* assert on sanity of constraints. */
5469 Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5470 tl_assert(leqxx);
5471 // same as in non-race case
5472 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5473 record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
5474 rmini, /* Cfailed */
5475 tviR, /* Kfailed */
5476 wmini /* Cw */ );
5477 goto out;
5480 if (SVal__isA(svOld)) {
5481 /* reading no-access memory (sigh); leave unchanged */
5482 /* check for no pollution */
5483 tl_assert(svOld == SVal_NOACCESS);
5484 svNew = SVal_NOACCESS;
5485 goto out;
5487 if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
5488 tl_assert(0);
5490 out:
5491 if (CHECK_MSM) {
5492 tl_assert(is_sane_SVal_C(svNew));
5494 if (UNLIKELY(svNew != svOld)) {
5495 tl_assert(svNew != SVal_INVALID);
5496 if (HG_(clo_history_level) >= 2
5497 && SVal__isC(svOld) && SVal__isC(svNew)) {
5498 event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5499 stats__msmcread_change++;
5502 return svNew;
5506 /* Compute new state following a write */
5507 static inline SVal msmcwrite ( SVal svOld,
5508 /* The following are only needed for
5509 creating error reports. */
5510 Thr* acc_thr,
5511 Addr acc_addr, SizeT szB )
5513 SVal svNew = SVal_INVALID;
5514 stats__msmcwrite++;
5516 /* Redundant sanity check on the constraints */
5517 if (CHECK_MSM) {
5518 tl_assert(is_sane_SVal_C(svOld));
5521 if (LIKELY(SVal__isC(svOld))) {
5522 VtsID tviW = acc_thr->viW;
5523 VtsID wmini = SVal__unC_Wmin(svOld);
5524 Bool leq = VtsID__cmpLEQ(wmini,tviW);
5525 if (LIKELY(leq)) {
5526 /* no race */
5527 svNew = SVal__mkC( tviW, tviW );
5528 goto out;
5529 } else {
5530 VtsID rmini = SVal__unC_Rmin(svOld);
5531 /* assert on sanity of constraints. */
5532 Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5533 tl_assert(leqxx);
5534 // same as in non-race case
5535 // proof: in the non-race case, we have
5536 // rmini <= wmini (invar on constraints)
5537 // tviW <= tviR (invar on thread clocks)
5538 // wmini <= tviW (from run-time check)
5539 // hence from transitivity of <= we have
5540 // rmini <= wmini <= tviW
5541 // and so join(rmini,tviW) == tviW
5542 // and join(wmini,tviW) == tviW
5543 // qed.
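         // Editor's sketch of a concrete (non-race) instance, not from
         // the original sources: if at some index T we have rmini[T]=3,
         // wmini[T]=5 and tviW[T]=7, then
         //    join(rmini,tviW)[T] = max(3,7) = 7 = tviW[T]
         //    join(wmini,tviW)[T] = max(5,7) = 7 = tviW[T]
         // exactly as the transitivity argument above predicts.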
5544 svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5545 VtsID__join2(wmini, tviW) );
5546 record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5547 wmini, /* Cfailed */
5548 tviW, /* Kfailed */
5549 wmini /* Cw */ );
5550 goto out;
5553 if (SVal__isA(svOld)) {
5554 /* writing no-access memory (sigh); leave unchanged */
5555 /* check for no pollution */
5556 tl_assert(svOld == SVal_NOACCESS);
5557 svNew = SVal_NOACCESS;
5558 goto out;
5560 if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5561 tl_assert(0);
5563 out:
5564 if (CHECK_MSM) {
5565 tl_assert(is_sane_SVal_C(svNew));
5567 if (UNLIKELY(svNew != svOld)) {
5568 tl_assert(svNew != SVal_INVALID);
5569 if (HG_(clo_history_level) >= 2
5570 && SVal__isC(svOld) && SVal__isC(svNew)) {
5571 event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5572 stats__msmcwrite_change++;
5575 return svNew;
5579 /////////////////////////////////////////////////////////
5580 // //
5581 // Apply core MSM to specific memory locations //
5582 // //
5583 /////////////////////////////////////////////////////////
5585 /*------------- ZSM accesses: 8 bit sapply ------------- */
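/* Editor's sketch (not part of the original file): how the address
   decomposition used in the functions below is assumed to work.  The
   concrete values N_LINE_ARANGE == 64 and 8-SVal trees per line are
   assumptions here; the helper is purely illustrative and kept disabled. */
#if 0
static void sketch_addr_decomposition ( Addr a )
{
   UWord cloff = a & 63;      /* assumed: offset within a 64-byte cache line */
   UWord tno   = cloff >> 3;  /* assumed: which of the 8 trees in the line   */
   UWord toff  = a & 7;       /* assumed: offset within that tree, 0 .. 7    */
   VG_(printf)("a=%#lx -> cloff=%lu tno=%lu toff=%lu\n", a, cloff, tno, toff);
}
#endif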
5587 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5588 CacheLine* cl;
5589 UWord cloff, tno, toff;
5590 SVal svOld, svNew;
5591 UShort descr;
5592 stats__cline_cread08s++;
5593 cl = get_cacheline(a);
5594 cloff = get_cacheline_offset(a);
5595 tno = get_treeno(a);
5596 toff = get_tree_offset(a); /* == 0 .. 7 */
5597 descr = cl->descrs[tno];
5598 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5599 SVal* tree = &cl->svals[tno << 3];
5600 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5601 if (CHECK_ZSM)
5602 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5604 svOld = cl->svals[cloff];
5605 svNew = msmcread( svOld, thr,a,1 );
5606 if (CHECK_ZSM)
5607 tl_assert(svNew != SVal_INVALID);
5608 cl->svals[cloff] = svNew;
5611 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5612 CacheLine* cl;
5613 UWord cloff, tno, toff;
5614 SVal svOld, svNew;
5615 UShort descr;
5616 stats__cline_cwrite08s++;
5617 cl = get_cacheline(a);
5618 cloff = get_cacheline_offset(a);
5619 tno = get_treeno(a);
5620 toff = get_tree_offset(a); /* == 0 .. 7 */
5621 descr = cl->descrs[tno];
5622 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5623 SVal* tree = &cl->svals[tno << 3];
5624 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5625 if (CHECK_ZSM)
5626 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5628 svOld = cl->svals[cloff];
5629 svNew = msmcwrite( svOld, thr,a,1 );
5630 if (CHECK_ZSM)
5631 tl_assert(svNew != SVal_INVALID);
5632 cl->svals[cloff] = svNew;
5635 /*------------- ZSM accesses: 16 bit sapply ------------- */
5637 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5638 CacheLine* cl;
5639 UWord cloff, tno, toff;
5640 SVal svOld, svNew;
5641 UShort descr;
5642 stats__cline_cread16s++;
5643 if (UNLIKELY(!aligned16(a))) goto slowcase;
5644 cl = get_cacheline(a);
5645 cloff = get_cacheline_offset(a);
5646 tno = get_treeno(a);
5647 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5648 descr = cl->descrs[tno];
5649 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5650 if (valid_value_is_below_me_16(descr, toff)) {
5651 goto slowcase;
5652 } else {
5653 SVal* tree = &cl->svals[tno << 3];
5654 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5656 if (CHECK_ZSM)
5657 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5659 svOld = cl->svals[cloff];
5660 svNew = msmcread( svOld, thr,a,2 );
5661 if (CHECK_ZSM)
5662 tl_assert(svNew != SVal_INVALID);
5663 cl->svals[cloff] = svNew;
5664 return;
5665 slowcase: /* misaligned, or must go further down the tree */
5666 stats__cline_16to8splits++;
5667 zsm_sapply08__msmcread( thr, a + 0 );
5668 zsm_sapply08__msmcread( thr, a + 1 );
5671 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5672 CacheLine* cl;
5673 UWord cloff, tno, toff;
5674 SVal svOld, svNew;
5675 UShort descr;
5676 stats__cline_cwrite16s++;
5677 if (UNLIKELY(!aligned16(a))) goto slowcase;
5678 cl = get_cacheline(a);
5679 cloff = get_cacheline_offset(a);
5680 tno = get_treeno(a);
5681 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5682 descr = cl->descrs[tno];
5683 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5684 if (valid_value_is_below_me_16(descr, toff)) {
5685 goto slowcase;
5686 } else {
5687 SVal* tree = &cl->svals[tno << 3];
5688 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5690 if (CHECK_ZSM)
5691 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5693 svOld = cl->svals[cloff];
5694 svNew = msmcwrite( svOld, thr,a,2 );
5695 if (CHECK_ZSM)
5696 tl_assert(svNew != SVal_INVALID);
5697 cl->svals[cloff] = svNew;
5698 return;
5699 slowcase: /* misaligned, or must go further down the tree */
5700 stats__cline_16to8splits++;
5701 zsm_sapply08__msmcwrite( thr, a + 0 );
5702 zsm_sapply08__msmcwrite( thr, a + 1 );
5705 /*------------- ZSM accesses: 32 bit sapply ------------- */
5707 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5708 CacheLine* cl;
5709 UWord cloff, tno, toff;
5710 SVal svOld, svNew;
5711 UShort descr;
5712 stats__cline_cread32s++;
5713 if (UNLIKELY(!aligned32(a))) goto slowcase;
5714 cl = get_cacheline(a);
5715 cloff = get_cacheline_offset(a);
5716 tno = get_treeno(a);
5717 toff = get_tree_offset(a); /* == 0 or 4 */
5718 descr = cl->descrs[tno];
5719 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5720 if (valid_value_is_above_me_32(descr, toff)) {
5721 SVal* tree = &cl->svals[tno << 3];
5722 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5723 } else {
5724 goto slowcase;
5726 if (CHECK_ZSM)
5727 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5729 svOld = cl->svals[cloff];
5730 svNew = msmcread( svOld, thr,a,4 );
5731 if (CHECK_ZSM)
5732 tl_assert(svNew != SVal_INVALID);
5733 cl->svals[cloff] = svNew;
5734 return;
5735 slowcase: /* misaligned, or must go further down the tree */
5736 stats__cline_32to16splits++;
5737 zsm_sapply16__msmcread( thr, a + 0 );
5738 zsm_sapply16__msmcread( thr, a + 2 );
5741 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5742 CacheLine* cl;
5743 UWord cloff, tno, toff;
5744 SVal svOld, svNew;
5745 UShort descr;
5746 stats__cline_cwrite32s++;
5747 if (UNLIKELY(!aligned32(a))) goto slowcase;
5748 cl = get_cacheline(a);
5749 cloff = get_cacheline_offset(a);
5750 tno = get_treeno(a);
5751 toff = get_tree_offset(a); /* == 0 or 4 */
5752 descr = cl->descrs[tno];
5753 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5754 if (valid_value_is_above_me_32(descr, toff)) {
5755 SVal* tree = &cl->svals[tno << 3];
5756 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5757 } else {
5758 goto slowcase;
5760 if (CHECK_ZSM)
5761 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5763 svOld = cl->svals[cloff];
5764 svNew = msmcwrite( svOld, thr,a,4 );
5765 if (CHECK_ZSM)
5766 tl_assert(svNew != SVal_INVALID);
5767 cl->svals[cloff] = svNew;
5768 return;
5769 slowcase: /* misaligned, or must go further down the tree */
5770 stats__cline_32to16splits++;
5771 zsm_sapply16__msmcwrite( thr, a + 0 );
5772 zsm_sapply16__msmcwrite( thr, a + 2 );
5775 /*------------- ZSM accesses: 64 bit sapply ------------- */
5777 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5778 CacheLine* cl;
5779 UWord cloff, tno;
5780 //UWord toff;
5781 SVal svOld, svNew;
5782 UShort descr;
5783 stats__cline_cread64s++;
5784 if (UNLIKELY(!aligned64(a))) goto slowcase;
5785 cl = get_cacheline(a);
5786 cloff = get_cacheline_offset(a);
5787 tno = get_treeno(a);
5788 //toff = get_tree_offset(a); /* == 0, unused */
5789 descr = cl->descrs[tno];
5790 if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5791 goto slowcase;
5793 svOld = cl->svals[cloff];
5794 svNew = msmcread( svOld, thr,a,8 );
5795 if (CHECK_ZSM)
5796 tl_assert(svNew != SVal_INVALID);
5797 cl->svals[cloff] = svNew;
5798 return;
5799 slowcase: /* misaligned, or must go further down the tree */
5800 stats__cline_64to32splits++;
5801 zsm_sapply32__msmcread( thr, a + 0 );
5802 zsm_sapply32__msmcread( thr, a + 4 );
5805 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5806 CacheLine* cl;
5807 UWord cloff, tno;
5808 //UWord toff;
5809 SVal svOld, svNew;
5810 UShort descr;
5811 stats__cline_cwrite64s++;
5812 if (UNLIKELY(!aligned64(a))) goto slowcase;
5813 cl = get_cacheline(a);
5814 cloff = get_cacheline_offset(a);
5815 tno = get_treeno(a);
5816 //toff = get_tree_offset(a); /* == 0, unused */
5817 descr = cl->descrs[tno];
5818 if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5819 goto slowcase;
5821 svOld = cl->svals[cloff];
5822 svNew = msmcwrite( svOld, thr,a,8 );
5823 if (CHECK_ZSM)
5824 tl_assert(svNew != SVal_INVALID);
5825 cl->svals[cloff] = svNew;
5826 return;
5827 slowcase: /* misaligned, or must go further down the tree */
5828 stats__cline_64to32splits++;
5829 zsm_sapply32__msmcwrite( thr, a + 0 );
5830 zsm_sapply32__msmcwrite( thr, a + 4 );
5833 /*--------------- ZSM accesses: 8 bit swrite --------------- */
5835 static
5836 void zsm_swrite08 ( Addr a, SVal svNew ) {
5837 CacheLine* cl;
5838 UWord cloff, tno, toff;
5839 UShort descr;
5840 stats__cline_swrite08s++;
5841 cl = get_cacheline(a);
5842 cloff = get_cacheline_offset(a);
5843 tno = get_treeno(a);
5844 toff = get_tree_offset(a); /* == 0 .. 7 */
5845 descr = cl->descrs[tno];
5846 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5847 SVal* tree = &cl->svals[tno << 3];
5848 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5849 if (CHECK_ZSM)
5850 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5852 tl_assert(svNew != SVal_INVALID);
5853 cl->svals[cloff] = svNew;
5856 /*--------------- ZSM accesses: 16 bit swrite --------------- */
5858 static
5859 void zsm_swrite16 ( Addr a, SVal svNew ) {
5860 CacheLine* cl;
5861 UWord cloff, tno, toff;
5862 UShort descr;
5863 stats__cline_swrite16s++;
5864 if (UNLIKELY(!aligned16(a))) goto slowcase;
5865 cl = get_cacheline(a);
5866 cloff = get_cacheline_offset(a);
5867 tno = get_treeno(a);
5868 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5869 descr = cl->descrs[tno];
5870 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5871 if (valid_value_is_below_me_16(descr, toff)) {
5872 /* Writing at this level. Need to fix up 'descr'. */
5873 cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5874 /* At this point, the tree does not match cl->descrs[tno] any
5875 more. The assignments below will fix it up. */
5876 } else {
5877 /* We can't indiscriminately write on the w16 node as in the
5878 w64 case, as that might make the node inconsistent with
5879 its parent. So first, pull down to this level. */
5880 SVal* tree = &cl->svals[tno << 3];
5881 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5882 if (CHECK_ZSM)
5883 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5886 tl_assert(svNew != SVal_INVALID);
5887 cl->svals[cloff + 0] = svNew;
5888 cl->svals[cloff + 1] = SVal_INVALID;
5889 return;
5890 slowcase: /* misaligned */
5891 stats__cline_16to8splits++;
5892 zsm_swrite08( a + 0, svNew );
5893 zsm_swrite08( a + 1, svNew );
5896 /*--------------- ZSM accesses: 32 bit swrite --------------- */
5898 static
5899 void zsm_swrite32 ( Addr a, SVal svNew ) {
5900 CacheLine* cl;
5901 UWord cloff, tno, toff;
5902 UShort descr;
5903 stats__cline_swrite32s++;
5904 if (UNLIKELY(!aligned32(a))) goto slowcase;
5905 cl = get_cacheline(a);
5906 cloff = get_cacheline_offset(a);
5907 tno = get_treeno(a);
5908 toff = get_tree_offset(a); /* == 0 or 4 */
5909 descr = cl->descrs[tno];
5910 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5911 if (valid_value_is_above_me_32(descr, toff)) {
5912 /* We can't indiscriminately write on the w32 node as in the
5913 w64 case, as that might make the node inconsistent with
5914 its parent. So first, pull down to this level. */
5915 SVal* tree = &cl->svals[tno << 3];
5916 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5917 if (CHECK_ZSM)
5918 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5919 } else {
5920 /* Writing at this level. Need to fix up 'descr'. */
5921 cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5922 /* At this point, the tree does not match cl->descrs[tno] any
5923 more. The assignments below will fix it up. */
5926 tl_assert(svNew != SVal_INVALID);
5927 cl->svals[cloff + 0] = svNew;
5928 cl->svals[cloff + 1] = SVal_INVALID;
5929 cl->svals[cloff + 2] = SVal_INVALID;
5930 cl->svals[cloff + 3] = SVal_INVALID;
5931 return;
5932 slowcase: /* misaligned */
5933 stats__cline_32to16splits++;
5934 zsm_swrite16( a + 0, svNew );
5935 zsm_swrite16( a + 2, svNew );
5938 /*--------------- ZSM accesses: 64 bit swrite --------------- */
5940 static
5941 void zsm_swrite64 ( Addr a, SVal svNew ) {
5942 CacheLine* cl;
5943 UWord cloff, tno;
5944 //UWord toff;
5945 stats__cline_swrite64s++;
5946 if (UNLIKELY(!aligned64(a))) goto slowcase;
5947 cl = get_cacheline(a);
5948 cloff = get_cacheline_offset(a);
5949 tno = get_treeno(a);
5950 //toff = get_tree_offset(a); /* == 0, unused */
5951 cl->descrs[tno] = TREE_DESCR_64;
5952 if (CHECK_ZSM)
5953 tl_assert(svNew != SVal_INVALID); /* EXPENSIVE */
5954 cl->svals[cloff + 0] = svNew;
5955 cl->svals[cloff + 1] = SVal_INVALID;
5956 cl->svals[cloff + 2] = SVal_INVALID;
5957 cl->svals[cloff + 3] = SVal_INVALID;
5958 cl->svals[cloff + 4] = SVal_INVALID;
5959 cl->svals[cloff + 5] = SVal_INVALID;
5960 cl->svals[cloff + 6] = SVal_INVALID;
5961 cl->svals[cloff + 7] = SVal_INVALID;
5962 return;
5963 slowcase: /* misaligned */
5964 stats__cline_64to32splits++;
5965 zsm_swrite32( a + 0, svNew );
5966 zsm_swrite32( a + 4, svNew );
5969 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5971 static
5972 SVal zsm_sread08 ( Addr a ) {
5973 CacheLine* cl;
5974 UWord cloff, tno, toff;
5975 UShort descr;
5976 stats__cline_sread08s++;
5977 cl = get_cacheline(a);
5978 cloff = get_cacheline_offset(a);
5979 tno = get_treeno(a);
5980 toff = get_tree_offset(a); /* == 0 .. 7 */
5981 descr = cl->descrs[tno];
5982 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5983 SVal* tree = &cl->svals[tno << 3];
5984 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5986 return cl->svals[cloff];
5989 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5990 SVal sv;
5991 stats__cline_scopy08s++;
5992 sv = zsm_sread08( src );
5993 zsm_swrite08( dst, sv );
5997 /* Block-copy states (needed for implementing realloc()). Note this
5998 doesn't change the filtering arrangements. The caller of
5999 zsm_scopy_range needs to attend to that. */
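/* Editor's sketch (not from the original sources): how a realloc-style
   caller might be expected to use zsm_scopy_range, defined just below.
   The names old_payload, new_payload, old_szB and new_szB are
   hypothetical; the snippet is kept disabled. */
#if 0
static void sketch_realloc_shadow ( Addr old_payload, Addr new_payload,
                                    SizeT old_szB, SizeT new_szB )
{
   /* copy the shadow state for the overlapping prefix of the two blocks */
   SizeT copy_szB = old_szB < new_szB ? old_szB : new_szB;
   /* the two payloads are assumed not to overlap, as zsm_scopy_range requires */
   zsm_scopy_range( old_payload, new_payload, copy_szB );
   /* note: the filtering arrangements still need separate attention */
}
#endif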
6001 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
6003 SizeT i;
6004 if (len == 0)
6005 return;
6007 /* assert for non-overlappingness */
6008 tl_assert(src+len <= dst || dst+len <= src);
6010 /* To be simple, just copy byte by byte. But so as not to wreck
6011 performance for later accesses to dst[0 .. len-1], normalise
6012 destination lines as we finish with them, and also normalise the
6013 line containing the first and last address. */
6014 for (i = 0; i < len; i++) {
6015 Bool normalise
6016 = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
6017 || i == 0 /* first in range */
6018 || i == len-1; /* last in range */
6019 zsm_scopy08( src+i, dst+i, normalise );
6024 /* For setting address ranges to a given value. Has considerable
6025 sophistication so as to avoid generating large numbers of pointless
6026 cache loads/writebacks for large ranges. */
6028 /* Do small ranges in-cache, in the obvious way. */
6029 static
6030 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
6032 /* fast track a couple of common cases */
6033 if (len == 4 && aligned32(a)) {
6034 zsm_swrite32( a, svNew );
6035 return;
6037 if (len == 8 && aligned64(a)) {
6038 zsm_swrite64( a, svNew );
6039 return;
6042 /* be completely general (but as efficient as possible) */
6043 if (len == 0) return;
6045 if (!aligned16(a) && len >= 1) {
6046 zsm_swrite08( a, svNew );
6047 a += 1;
6048 len -= 1;
6049 tl_assert(aligned16(a));
6051 if (len == 0) return;
6053 if (!aligned32(a) && len >= 2) {
6054 zsm_swrite16( a, svNew );
6055 a += 2;
6056 len -= 2;
6057 tl_assert(aligned32(a));
6059 if (len == 0) return;
6061 if (!aligned64(a) && len >= 4) {
6062 zsm_swrite32( a, svNew );
6063 a += 4;
6064 len -= 4;
6065 tl_assert(aligned64(a));
6067 if (len == 0) return;
6069 if (len >= 8) {
6070 tl_assert(aligned64(a));
6071 while (len >= 8) {
6072 zsm_swrite64( a, svNew );
6073 a += 8;
6074 len -= 8;
6076 tl_assert(aligned64(a));
6078 if (len == 0) return;
6080 if (len >= 4)
6081 tl_assert(aligned32(a));
6082 if (len >= 4) {
6083 zsm_swrite32( a, svNew );
6084 a += 4;
6085 len -= 4;
6087 if (len == 0) return;
6089 if (len >= 2)
6090 tl_assert(aligned16(a));
6091 if (len >= 2) {
6092 zsm_swrite16( a, svNew );
6093 a += 2;
6094 len -= 2;
6096 if (len == 0) return;
6098 if (len >= 1) {
6099 zsm_swrite08( a, svNew );
6100 //a += 1;
6101 len -= 1;
6103 tl_assert(len == 0);
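/* Editor's worked example (a sketch, not from the original sources):
   for a = 0x1003 and len = 21 the peeling above issues
      zsm_swrite08( 0x1003 )     1 byte, now 16-bit (and 32-bit) aligned
      zsm_swrite32( 0x1004 )     4 bytes, now 64-bit aligned
      zsm_swrite64( 0x1008 )
      zsm_swrite64( 0x1010 )     2 x 8 bytes, leaving len == 0
   that is, 1 + 4 + 8 + 8 == 21 bytes in four shadow stores. */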
6107 /* If we're doing a small range, hand off to zsm_sset_range_SMALL. But
6108 for larger ranges, try to operate directly on the out-of-cache
6109 representation, rather than dragging lines into the cache,
6110 overwriting them, and forcing them out. This turns out to be an
6111 important performance optimisation.
6113 Note that this doesn't change the filtering arrangements. The
6114 caller of zsm_sset_range needs to attend to that. */
6116 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
6118 tl_assert(svNew != SVal_INVALID);
6119 stats__cache_make_New_arange += (ULong)len;
6121 if (0 && len > 500)
6122 VG_(printf)("make New ( %#lx, %lu )\n", a, len );
6124 if (0) {
6125 static UWord n_New_in_cache = 0;
6126 static UWord n_New_not_in_cache = 0;
6127 /* tag is 'a' with the in-line offset masked out,
6128 eg a[31]..a[4] 0000 */
6129 Addr tag = a & ~(N_LINE_ARANGE - 1);
6130 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
6131 if (LIKELY(tag == cache_shmem.tags0[wix])) {
6132 n_New_in_cache++;
6133 } else {
6134 n_New_not_in_cache++;
6136 if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
6137 VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
6138 n_New_in_cache, n_New_not_in_cache );
6141 if (LIKELY(len < 2 * N_LINE_ARANGE)) {
6142 zsm_sset_range_SMALL( a, len, svNew );
6143 } else {
6144 Addr before_start = a;
6145 Addr aligned_start = cacheline_ROUNDUP(a);
6146 Addr after_start = cacheline_ROUNDDN(a + len);
6147 UWord before_len = aligned_start - before_start;
6148 UWord aligned_len = after_start - aligned_start;
6149 UWord after_len = a + len - after_start;
6150 tl_assert(before_start <= aligned_start);
6151 tl_assert(aligned_start <= after_start);
6152 tl_assert(before_len < N_LINE_ARANGE);
6153 tl_assert(after_len < N_LINE_ARANGE);
6154 tl_assert(get_cacheline_offset(aligned_start) == 0);
6155 if (get_cacheline_offset(a) == 0) {
6156 tl_assert(before_len == 0);
6157 tl_assert(a == aligned_start);
6159 if (get_cacheline_offset(a+len) == 0) {
6160 tl_assert(after_len == 0);
6161 tl_assert(after_start == a+len);
6163 if (before_len > 0) {
6164 zsm_sset_range_SMALL( before_start, before_len, svNew );
6166 if (after_len > 0) {
6167 zsm_sset_range_SMALL( after_start, after_len, svNew );
6169 stats__cache_make_New_inZrep += (ULong)aligned_len;
6171 while (1) {
6172 Addr tag;
6173 UWord wix;
6174 if (aligned_start >= after_start)
6175 break;
6176 tl_assert(get_cacheline_offset(aligned_start) == 0);
6177 tag = aligned_start & ~(N_LINE_ARANGE - 1);
6178 wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
6179 if (tag == cache_shmem.tags0[wix]) {
6180 UWord i;
6181 for (i = 0; i < N_LINE_ARANGE / 8; i++)
6182 zsm_swrite64( aligned_start + i * 8, svNew );
6183 } else {
6184 UWord i;
6185 Word zix;
6186 SecMap* sm;
6187 LineZ* lineZ;
6188 /* This line is not in the cache. Do not force it in; instead
6189 modify it in-place. */
6190 /* find the Z line to write in and rcdec it or the
6191 associated F line. */
6192 find_Z_for_writing( &sm, &zix, tag );
6193 tl_assert(sm);
6194 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
6195 lineZ = &sm->linesZ[zix];
6196 lineZ->dict[0] = svNew;
6197 lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
6198 for (i = 0; i < N_LINE_ARANGE/4; i++)
6199 lineZ->ix2s[i] = 0; /* all refer to dict[0] */
6200 rcinc_LineZ(lineZ);
6202 aligned_start += N_LINE_ARANGE;
6203 aligned_len -= N_LINE_ARANGE;
6205 tl_assert(aligned_start == after_start);
6206 tl_assert(aligned_len == 0);
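/* Editor's worked example of the splitting above (a sketch; the value
   N_LINE_ARANGE == 64 is an assumption here, not taken from this
   excerpt): a = 0x10030, len = 0x200 gives
      before_start = 0x10030  aligned_start = 0x10040  after_start = 0x10200
      before_len   = 0x10     aligned_len   = 0x1C0    after_len   = 0x30
   so 0x10 bytes in front and 0x30 bytes behind go via
   zsm_sset_range_SMALL, while the 7 whole cache lines in between are
   set either in the cache or directly in their Z representation. */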
6211 /////////////////////////////////////////////////////////
6212 // //
6213 // Front-filtering accesses //
6214 // //
6215 /////////////////////////////////////////////////////////
6217 static UWord stats__f_ac = 0;
6218 static UWord stats__f_sk = 0;
6220 #if 0
6221 # define STATS__F_SHOW \
6222 do { \
6223 if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
6224 VG_(printf)("filters: ac %lu sk %lu\n", \
6225 stats__f_ac, stats__f_sk); \
6226 } while (0)
6227 #else
6228 # define STATS__F_SHOW /* */
6229 #endif
6231 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
6232 stats__f_ac++;
6233 STATS__F_SHOW;
6234 if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
6235 stats__f_sk++;
6236 return;
6238 zsm_sapply08__msmcwrite(thr, a);
6241 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
6242 stats__f_ac++;
6243 STATS__F_SHOW;
6244 if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
6245 stats__f_sk++;
6246 return;
6248 zsm_sapply16__msmcwrite(thr, a);
6251 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
6252 stats__f_ac++;
6253 STATS__F_SHOW;
6254 if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
6255 stats__f_sk++;
6256 return;
6258 zsm_sapply32__msmcwrite(thr, a);
6261 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
6262 stats__f_ac++;
6263 STATS__F_SHOW;
6264 if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
6265 stats__f_sk++;
6266 return;
6268 zsm_sapply64__msmcwrite(thr, a);
6271 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
6273 /* fast track a couple of common cases */
6274 if (len == 4 && aligned32(a)) {
6275 zsm_sapply32_f__msmcwrite( thr, a );
6276 return;
6278 if (len == 8 && aligned64(a)) {
6279 zsm_sapply64_f__msmcwrite( thr, a );
6280 return;
6283 /* be completely general (but as efficient as possible) */
6284 if (len == 0) return;
6286 if (!aligned16(a) && len >= 1) {
6287 zsm_sapply08_f__msmcwrite( thr, a );
6288 a += 1;
6289 len -= 1;
6290 tl_assert(aligned16(a));
6292 if (len == 0) return;
6294 if (!aligned32(a) && len >= 2) {
6295 zsm_sapply16_f__msmcwrite( thr, a );
6296 a += 2;
6297 len -= 2;
6298 tl_assert(aligned32(a));
6300 if (len == 0) return;
6302 if (!aligned64(a) && len >= 4) {
6303 zsm_sapply32_f__msmcwrite( thr, a );
6304 a += 4;
6305 len -= 4;
6306 tl_assert(aligned64(a));
6308 if (len == 0) return;
6310 if (len >= 8) {
6311 tl_assert(aligned64(a));
6312 while (len >= 8) {
6313 zsm_sapply64_f__msmcwrite( thr, a );
6314 a += 8;
6315 len -= 8;
6317 tl_assert(aligned64(a));
6319 if (len == 0) return;
6321 if (len >= 4)
6322 tl_assert(aligned32(a));
6323 if (len >= 4) {
6324 zsm_sapply32_f__msmcwrite( thr, a );
6325 a += 4;
6326 len -= 4;
6328 if (len == 0) return;
6330 if (len >= 2)
6331 tl_assert(aligned16(a));
6332 if (len >= 2) {
6333 zsm_sapply16_f__msmcwrite( thr, a );
6334 a += 2;
6335 len -= 2;
6337 if (len == 0) return;
6339 if (len >= 1) {
6340 zsm_sapply08_f__msmcwrite( thr, a );
6341 //a += 1;
6342 len -= 1;
6344 tl_assert(len == 0);
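/* Editor's sketch (not from the original sources): the filtered entry
   points above are assumed to be what the tool calls for each guest
   write; a hypothetical instrumentation callback would simply funnel
   the access through zsm_sapplyNN_f__msmcwrite and let the per-thread
   filter decide whether the full MSM machinery needs to run. */
#if 0
static void sketch_on_guest_write ( Thr* thr, Addr a, SizeT len )
{
   /* the cheap filter check happens inside; most repeated accesses skip */
   zsm_sapplyNN_f__msmcwrite( thr, a, len );
}
#endif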
6347 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
6348 stats__f_ac++;
6349 STATS__F_SHOW;
6350 if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
6351 stats__f_sk++;
6352 return;
6354 zsm_sapply08__msmcread(thr, a);
6357 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
6358 stats__f_ac++;
6359 STATS__F_SHOW;
6360 if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
6361 stats__f_sk++;
6362 return;
6364 zsm_sapply16__msmcread(thr, a);
6367 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
6368 stats__f_ac++;
6369 STATS__F_SHOW;
6370 if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
6371 stats__f_sk++;
6372 return;
6374 zsm_sapply32__msmcread(thr, a);
6377 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
6378 stats__f_ac++;
6379 STATS__F_SHOW;
6380 if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
6381 stats__f_sk++;
6382 return;
6384 zsm_sapply64__msmcread(thr, a);
6387 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
6389 /* fast track a couple of common cases */
6390 if (len == 4 && aligned32(a)) {
6391 zsm_sapply32_f__msmcread( thr, a );
6392 return;
6394 if (len == 8 && aligned64(a)) {
6395 zsm_sapply64_f__msmcread( thr, a );
6396 return;
6399 /* be completely general (but as efficient as possible) */
6400 if (len == 0) return;
6402 if (!aligned16(a) && len >= 1) {
6403 zsm_sapply08_f__msmcread( thr, a );
6404 a += 1;
6405 len -= 1;
6406 tl_assert(aligned16(a));
6408 if (len == 0) return;
6410 if (!aligned32(a) && len >= 2) {
6411 zsm_sapply16_f__msmcread( thr, a );
6412 a += 2;
6413 len -= 2;
6414 tl_assert(aligned32(a));
6416 if (len == 0) return;
6418 if (!aligned64(a) && len >= 4) {
6419 zsm_sapply32_f__msmcread( thr, a );
6420 a += 4;
6421 len -= 4;
6422 tl_assert(aligned64(a));
6424 if (len == 0) return;
6426 if (len >= 8) {
6427 tl_assert(aligned64(a));
6428 while (len >= 8) {
6429 zsm_sapply64_f__msmcread( thr, a );
6430 a += 8;
6431 len -= 8;
6433 tl_assert(aligned64(a));
6435 if (len == 0) return;
6437 if (len >= 4)
6438 tl_assert(aligned32(a));
6439 if (len >= 4) {
6440 zsm_sapply32_f__msmcread( thr, a );
6441 a += 4;
6442 len -= 4;
6444 if (len == 0) return;
6446 if (len >= 2)
6447 tl_assert(aligned16(a));
6448 if (len >= 2) {
6449 zsm_sapply16_f__msmcread( thr, a );
6450 a += 2;
6451 len -= 2;
6453 if (len == 0) return;
6455 if (len >= 1) {
6456 zsm_sapply08_f__msmcread( thr, a );
6457 //a += 1;
6458 len -= 1;
6460 tl_assert(len == 0);
6463 void libhb_Thr_resumes ( Thr* thr )
6465 if (0) VG_(printf)("resume %p\n", thr);
6466 tl_assert(thr);
6467 tl_assert(!thr->llexit_done);
6468 Filter__clear(thr->filter, "libhb_Thr_resumes");
6469 /* A kludge, but .. if this thread doesn't have any marker stacks
6470 at all, get one right now. This is easier than figuring out
6471 exactly when at thread startup we can and can't take a stack
6472 snapshot. */
6473 if (HG_(clo_history_level) == 1) {
6474 tl_assert(thr->local_Kws_n_stacks);
6475 if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
6476 note_local_Kw_n_stack_for(thr);
6481 /////////////////////////////////////////////////////////
6482 // //
6483 // Synchronisation objects //
6484 // //
6485 /////////////////////////////////////////////////////////
6487 /* A double linked list of all the SO's. */
6488 SO* admin_SO = NULL;
6490 static SO* SO__Alloc ( void )
6492 SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6493 so->viR = VtsID_INVALID;
6494 so->viW = VtsID_INVALID;
6495 so->magic = SO_MAGIC;
6496 /* Add to double linked list */
6497 if (admin_SO) {
6498 tl_assert(admin_SO->admin_prev == NULL);
6499 admin_SO->admin_prev = so;
6500 so->admin_next = admin_SO;
6501 } else {
6502 so->admin_next = NULL;
6504 so->admin_prev = NULL;
6505 admin_SO = so;
6506 /* */
6507 return so;
6510 static void SO__Dealloc ( SO* so )
6512 tl_assert(so);
6513 tl_assert(so->magic == SO_MAGIC);
6514 if (so->viR == VtsID_INVALID) {
6515 tl_assert(so->viW == VtsID_INVALID);
6516 } else {
6517 tl_assert(so->viW != VtsID_INVALID);
6518 VtsID__rcdec(so->viR);
6519 VtsID__rcdec(so->viW);
6521 so->magic = 0;
6522 /* Del from double linked list */
6523 if (so->admin_prev)
6524 so->admin_prev->admin_next = so->admin_next;
6525 if (so->admin_next)
6526 so->admin_next->admin_prev = so->admin_prev;
6527 if (so == admin_SO)
6528 admin_SO = so->admin_next;
6529 /* */
6530 HG_(free)( so );
6534 /////////////////////////////////////////////////////////
6535 // //
6536 // Top Level API //
6537 // //
6538 /////////////////////////////////////////////////////////
6540 static void show_thread_state ( const HChar* str, Thr* t )
6542 if (1) return;
6543 if (t->viR == t->viW) {
6544 VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6545 VtsID__pp( t->viR );
6546 VG_(printf)("%s","\n");
6547 } else {
6548 VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6549 VtsID__pp( t->viR );
6550 VG_(printf)(" viW %u==", t->viW);
6551 VtsID__pp( t->viW );
6552 VG_(printf)("%s","\n");
6557 Thr* libhb_init (
6558 void (*get_stacktrace)( Thr*, Addr*, UWord ),
6559 ExeContext* (*get_EC)( Thr* )
6562 Thr* thr;
6563 VtsID vi;
6565 // We will have to store a large number of these,
6566 // so make sure they're the size we expect them to be.
6567 STATIC_ASSERT(sizeof(ScalarTS) == 8);
6569 /* because first 1024 unusable */
6570 STATIC_ASSERT(SCALARTS_N_THRBITS >= 11);
6571 /* so as to fit in a UInt w/ 5 bits to spare (see defn of
6572 Thr_n_RCEC and TSW). */
6573 STATIC_ASSERT(SCALARTS_N_THRBITS <= 27);
6575 /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6576 (32-bit). It's not correctness-critical, but there are a lot of
6577 them, so it's important from a space viewpoint. Unfortunately
6578 we simply can't pack it into 2 words on a 32-bit target. */
6579 STATIC_ASSERT( (sizeof(UWord) == 8 && sizeof(Thr_n_RCEC) == 16)
6580 || (sizeof(UWord) == 4 && sizeof(Thr_n_RCEC) == 12));
6581 STATIC_ASSERT(sizeof(TSW) == sizeof(UInt));
6583 /* Word sets really are 32 bits. Even on a 64 bit target. */
6584 STATIC_ASSERT(sizeof(WordSetID) == 4);
6585 STATIC_ASSERT(sizeof(WordSet) == sizeof(WordSetID));
6587 tl_assert(get_stacktrace);
6588 tl_assert(get_EC);
6589 main_get_stacktrace = get_stacktrace;
6590 main_get_EC = get_EC;
6592 // No need to initialise hg_wordfm.
6593 // No need to initialise hg_wordset.
6595 /* Allocated once and never deallocated. Used as a temporary in
6596 VTS singleton, tick and join operations. */
6597 temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6598 temp_max_sized_VTS->id = VtsID_INVALID;
6599 verydead_thread_tables_init();
6600 vts_set_init();
6601 vts_tab_init();
6602 event_map_init();
6603 VtsID__invalidate_caches();
6605 // initialise shadow memory
6606 zsm_init( );
6608 thr = Thr__new();
6609 vi = VtsID__mk_Singleton( thr, 1 );
6610 thr->viR = vi;
6611 thr->viW = vi;
6612 VtsID__rcinc(thr->viR);
6613 VtsID__rcinc(thr->viW);
6615 show_thread_state(" root", thr);
6616 return thr;
6620 Thr* libhb_create ( Thr* parent )
6622 /* The child's VTSs are copies of the parent's VTSs, but ticked at
6623 the child's index. Since the child's index is guaranteed
6624 unique, it has never been seen before, so the implicit value
6625 before the tick is zero and after that is one. */
6626 Thr* child = Thr__new();
6628 child->viR = VtsID__tick( parent->viR, child );
6629 child->viW = VtsID__tick( parent->viW, child );
6630 Filter__clear(child->filter, "libhb_create(child)");
6631 VtsID__rcinc(child->viR);
6632 VtsID__rcinc(child->viW);
6633 /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6634 early for that - it may not have a valid TId yet. So, let
6635 libhb_Thr_resumes pick it up the first time the thread runs. */
6637 tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6638 tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6640 /* and the parent has to move along too */
6641 VtsID__rcdec(parent->viR);
6642 VtsID__rcdec(parent->viW);
6643 parent->viR = VtsID__tick( parent->viR, parent );
6644 parent->viW = VtsID__tick( parent->viW, parent );
6645 Filter__clear(parent->filter, "libhb_create(parent)");
6646 VtsID__rcinc(parent->viR);
6647 VtsID__rcinc(parent->viW);
6648 note_local_Kw_n_stack_for( parent );
6650 show_thread_state(" child", child);
6651 show_thread_state("parent", parent);
6653 return child;
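/* Editor's note -- an illustrative trace of the above (a sketch, not
   from the original sources).  If the parent P enters with
   viR = viW = { P:5 }, then afterwards
      child  C:  viR = viW = { P:5, C:1 }   (parent's VTSs ticked at C)
      parent P:  viR = viW = { P:6 }        (parent ticked at its own slot)
   which is why the asserts above can rely on the child's own index
   being exactly 1. */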
6656 /* Shut down the library, and print stats (in fact that's _all_
6657 this is for). */
6658 void libhb_shutdown ( Bool show_stats )
6660 if (show_stats) {
6661 VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6662 VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6663 stats__secmaps_allocd,
6664 stats__secmap_ga_space_covered);
6665 VG_(printf)(" linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6666 stats__secmap_linesZ_allocd,
6667 stats__secmap_linesZ_bytes);
6668 VG_(printf)(" linesF: %'10lu allocd (%'12lu bytes occupied)"
6669 " (%'10lu used)\n",
6670 VG_(sizePA) (LineF_pool_allocator),
6671 VG_(sizePA) (LineF_pool_allocator) * sizeof(LineF),
6672 shmem__SecMap_used_linesF());
6673 VG_(printf)(" secmaps: %'10lu in map (can be scanGCed %'5lu)"
6674 " #%lu scanGC \n",
6675 stats__secmaps_in_map_shmem,
6676 shmem__SecMap_do_GC(False /* really do GC */),
6677 stats__secmaps_scanGC);
6678 tl_assert (VG_(sizeFM) (map_shmem) == stats__secmaps_in_map_shmem);
6679 VG_(printf)(" secmaps: %'10lu in freelist,"
6680 " total (scanGCed %'lu, ssetGCed %'lu)\n",
6681 SecMap_freelist_length(),
6682 stats__secmaps_scanGCed,
6683 stats__secmaps_ssetGCed);
6684 VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6685 stats__secmaps_search, stats__secmaps_search_slow);
6687 VG_(printf)("%s","\n");
6688 VG_(printf)(" cache: %'lu totrefs (%'lu misses)\n",
6689 stats__cache_totrefs, stats__cache_totmisses );
6690 VG_(printf)(" cache: %'14lu Z-fetch, %'14lu F-fetch\n",
6691 stats__cache_Z_fetches, stats__cache_F_fetches );
6692 VG_(printf)(" cache: %'14lu Z-wback, %'14lu F-wback\n",
6693 stats__cache_Z_wbacks, stats__cache_F_wbacks );
6694 VG_(printf)(" cache: %'14lu flushes_invals\n",
6695 stats__cache_flushes_invals );
6696 VG_(printf)(" cache: %'14llu arange_New %'14llu direct-to-Zreps\n",
6697 stats__cache_make_New_arange,
6698 stats__cache_make_New_inZrep);
6700 VG_(printf)("%s","\n");
6701 VG_(printf)(" cline: %'10lu normalises\n",
6702 stats__cline_normalises );
6703 VG_(printf)(" cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6704 stats__cline_cread64s,
6705 stats__cline_cread32s,
6706 stats__cline_cread16s,
6707 stats__cline_cread08s );
6708 VG_(printf)(" cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6709 stats__cline_cwrite64s,
6710 stats__cline_cwrite32s,
6711 stats__cline_cwrite16s,
6712 stats__cline_cwrite08s );
6713 VG_(printf)(" cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6714 stats__cline_swrite64s,
6715 stats__cline_swrite32s,
6716 stats__cline_swrite16s,
6717 stats__cline_swrite08s );
6718 VG_(printf)(" cline: s rd1s %'lu, s copy1s %'lu\n",
6719 stats__cline_sread08s, stats__cline_scopy08s );
6720 VG_(printf)(" cline: splits: 8to4 %'12lu 4to2 %'12lu"
6721 " 2to1 %'12lu\n",
6722 stats__cline_64to32splits, stats__cline_32to16splits,
6723 stats__cline_16to8splits );
6724 VG_(printf)(" cline: pulldowns: 8to4 %'12lu 4to2 %'12lu"
6725 " 2to1 %'12lu\n",
6726 stats__cline_64to32pulldown, stats__cline_32to16pulldown,
6727 stats__cline_16to8pulldown );
6728 if (0)
6729 VG_(printf)(" cline: sizeof(CacheLineZ) %ld,"
6730 " covers %ld bytes of arange\n",
6731 (Word)sizeof(LineZ),
6732 (Word)N_LINE_ARANGE);
6734 VG_(printf)("%s","\n");
6736 VG_(printf)(" libhb: %'13llu msmcread (%'llu dragovers)\n",
6737 stats__msmcread, stats__msmcread_change);
6738 VG_(printf)(" libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6739 stats__msmcwrite, stats__msmcwrite_change);
6740 VG_(printf)(" libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6741 stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6742 VG_(printf)(" libhb: %'13llu join2 queries (%'llu misses)\n",
6743 stats__join2_queries, stats__join2_misses);
6745 VG_(printf)("%s","\n");
6746 VG_(printf)(" libhb: VTSops: tick %'lu, join %'lu, cmpLEQ %'lu\n",
6747 stats__vts__tick, stats__vts__join, stats__vts__cmpLEQ );
6748 VG_(printf)(" libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6749 stats__vts__cmp_structural, stats__vts__cmp_structural_slow);
6750 VG_(printf)(" libhb: VTSset: find__or__clone_and_add %'lu"
6751 " (%'lu allocd)\n",
6752 stats__vts_set__focaa, stats__vts_set__focaa_a );
6753 VG_(printf)( " libhb: VTSops: indexAt_SLOW %'lu\n",
6754 stats__vts__indexat_slow );
6756 VG_(printf)("%s","\n");
6757 VG_(printf)(
6758 " libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6759 VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6761 VG_(printf)(" libhb: #%lu vts_tab GC #%lu vts pruning\n",
6762 stats__vts_tab_GC, stats__vts_pruning);
6763 VG_(printf)( " libhb: %lu entries in vts_set\n",
6764 VG_(sizeFM)( vts_set ) );
6766 VG_(printf)("%s","\n");
6768 UInt live = 0;
6769 UInt llexit_done = 0;
6770 UInt joinedwith_done = 0;
6771 UInt llexit_and_joinedwith_done = 0;
6773 Thread* hgthread = get_admin_threads();
6774 tl_assert(hgthread);
6775 while (hgthread) {
6776 Thr* hbthr = hgthread->hbthr;
6777 tl_assert(hbthr);
6778 if (hbthr->llexit_done && hbthr->joinedwith_done)
6779 llexit_and_joinedwith_done++;
6780 else if (hbthr->llexit_done)
6781 llexit_done++;
6782 else if (hbthr->joinedwith_done)
6783 joinedwith_done++;
6784 else
6785 live++;
6786 hgthread = hgthread->admin;
6788 VG_(printf)(" libhb: threads live: %u exit_and_joinedwith %u"
6789 " exit %u joinedwith %u\n",
6790 live, llexit_and_joinedwith_done,
6791 llexit_done, joinedwith_done);
6792 VG_(printf)(" libhb: %d verydead_threads, "
6793 "%d verydead_threads_not_pruned\n",
6794 (int) VG_(sizeXA)( verydead_thread_table),
6795 (int) VG_(sizeXA)( verydead_thread_table_not_pruned));
6796 tl_assert (VG_(sizeXA)( verydead_thread_table)
6797 + VG_(sizeXA)( verydead_thread_table_not_pruned)
6798 == llexit_and_joinedwith_done);
6801 VG_(printf)("%s","\n");
6802 VG_(printf)( " libhb: oldrefHTN %lu (%'d bytes)\n",
6803 oldrefHTN, (int)(oldrefHTN * sizeof(OldRef)));
6804 tl_assert (oldrefHTN == VG_(HT_count_nodes) (oldrefHT));
6805 VG_(printf)( " libhb: oldref lookup found=%lu notfound=%lu\n",
6806 stats__evm__lookup_found, stats__evm__lookup_notfound);
6807 if (VG_(clo_verbosity) > 1)
6808 VG_(HT_print_stats) (oldrefHT, cmp_oldref_tsw);
6809 VG_(printf)( " libhb: oldref bind tsw/rcec "
6810 "==/==:%'lu ==/!=:%'lu !=/!=:%'lu\n",
6811 stats__ctxt_eq_tsw_eq_rcec, stats__ctxt_eq_tsw_neq_rcec,
6812 stats__ctxt_neq_tsw_neq_rcec);
6813 VG_(printf)( " libhb: ctxt__rcdec calls %'lu. rcec gc discards %'lu\n",
6814 stats__ctxt_rcdec_calls, stats__ctxt_rcec_gc_discards);
6815 VG_(printf)( " libhb: contextTab: %lu slots,"
6816 " %lu cur ents(ref'd %lu),"
6817 " %lu max ents\n",
6818 (UWord)N_RCEC_TAB,
6819 stats__ctxt_tab_curr, RCEC_referenced,
6820 stats__ctxt_tab_max );
6821 VG_(printf) (" libhb: stats__cached_rcec "
6822 "identical %'lu updated %'lu fresh %'lu\n",
6823 stats__cached_rcec_identical, stats__cached_rcec_updated,
6824 stats__cached_rcec_fresh);
6825 if (stats__cached_rcec_diff > 0)
6826 VG_(printf) (" libhb: stats__cached_rcec diff unknown reason %'lu\n",
6827 stats__cached_rcec_diff);
6828 if (stats__cached_rcec_diff_known_reason > 0)
6829 VG_(printf) (" libhb: stats__cached_rcec diff known reason %'lu\n",
6830 stats__cached_rcec_diff_known_reason);
6833 # define MAXCHAIN 10
6834 UInt chains[MAXCHAIN+1]; // [MAXCHAIN] gets all chains >= MAXCHAIN
6835 UInt non0chain = 0;
6836 UInt n;
6837 UInt i;
6838 RCEC *p;
6840 for (i = 0; i <= MAXCHAIN; i++) chains[i] = 0;
6841 for (i = 0; i < N_RCEC_TAB; i++) {
6842 n = 0;
6843 for (p = contextTab[i]; p; p = p->next)
6844 n++;
6845 if (n < MAXCHAIN)
6846 chains[n]++;
6847 else
6848 chains[MAXCHAIN]++;
6849 if (n > 0)
6850 non0chain++;
6852 VG_(printf)( " libhb: contextTab chain of [length]=nchain."
6853 " Avg chain len %3.1f\n"
6854 " ",
6855 (Double)stats__ctxt_tab_curr
6856 / (Double)(non0chain ? non0chain : 1));
6857 for (i = 0; i <= MAXCHAIN; i++) {
6858 if (chains[i] != 0)
6859 VG_(printf)( "[%u%s]=%u ",
6860 i, i == MAXCHAIN ? "+" : "",
6861 chains[i]);
6863 VG_(printf)( "\n");
6864 # undef MAXCHAIN
6866 VG_(printf)( " libhb: contextTab: %lu queries, %lu cmps\n",
6867 stats__ctxt_tab_qs,
6868 stats__ctxt_tab_cmps );
6869 #if 0
6870 VG_(printf)("sizeof(CacheLine) = %zu\n", sizeof(CacheLine));
6871 VG_(printf)("sizeof(LineZ) = %zu\n", sizeof(LineZ));
6872 VG_(printf)("sizeof(LineF) = %zu\n", sizeof(LineF));
6873 VG_(printf)("sizeof(SecMap) = %zu\n", sizeof(SecMap));
6874 VG_(printf)("sizeof(Cache) = %zu\n", sizeof(Cache));
6875 VG_(printf)("sizeof(SMCacheEnt) = %zu\n", sizeof(SMCacheEnt));
6876 VG_(printf)("sizeof(CountedSVal) = %zu\n", sizeof(CountedSVal));
6877 VG_(printf)("sizeof(VTS) = %zu\n", sizeof(VTS));
6878 VG_(printf)("sizeof(ScalarTS) = %zu\n", sizeof(ScalarTS));
6879 VG_(printf)("sizeof(VtsTE) = %zu\n", sizeof(VtsTE));
6881 VG_(printf)("sizeof(struct _Thr) = %zu\n", sizeof(struct _Thr));
6882 VG_(printf)("sizeof(RCEC) = %zu\n", sizeof(RCEC));
6883 VG_(printf)("sizeof(struct _SO) = %zu\n", sizeof(struct _SO));
6884 #endif
6886 VG_(printf)("%s","<<< END libhb stats >>>\n");
6887 VG_(printf)("%s","\n");
6892 /* Receive notification that a thread has low level exited. The
6893 significance here is that we do not expect to see any more memory
6894 references from it. */
6895 void libhb_async_exit ( Thr* thr )
6897 tl_assert(thr);
6898 tl_assert(!thr->llexit_done);
6899 thr->llexit_done = True;
6901 /* Check nobody messed up with the cached_rcec */
6902 tl_assert (thr->cached_rcec.magic == RCEC_MAGIC);
6903 tl_assert (thr->cached_rcec.rc == 0);
6904 tl_assert (thr->cached_rcec.rcX == 0);
6905 tl_assert (thr->cached_rcec.next == NULL);
6907 /* Just to be sure, declare the cached stack invalid. */
6908 set_cached_rcec_validity(thr, False);
6910 /* free up Filter and local_Kws_n_stacks (well, actually not the
6911 latter ..) */
6912 tl_assert(thr->filter);
6913 HG_(free)(thr->filter);
6914 thr->filter = NULL;
6916 /* Tell the VTS mechanism this thread has exited, so it can
6917 participate in VTS pruning. Note this can only happen if the
6918 thread has both ll_exited and has been joined with. */
6919 if (thr->joinedwith_done)
6920 VTS__declare_thread_very_dead(thr);
6922 /* Another space-accuracy tradeoff. Do we want to be able to show
6923 H1 history for conflicts in threads which have since exited? If
6924 yes, then we better not free up thr->local_Kws_n_stacks. The
6925 downside is a potential per-thread leak of up to
6926 N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6927 XArray average overcommit factor is (1.5 I'd guess). */
6928 // hence:
6929 // VG_(deleteXA)(thr->local_Kws_n_stacks);
6930 // thr->local_Kws_n_stacks = NULL;
6933 /* Receive notification that a thread has been joined with. The
6934 significance here is that we do not expect to see any further
6935 references to its vector clocks (Thr::viR and Thr::viW). */
6936 void libhb_joinedwith_done ( Thr* thr )
6938 tl_assert(thr);
6939 /* Caller must ensure that this is only ever called once per Thr. */
6940 tl_assert(!thr->joinedwith_done);
6941 thr->joinedwith_done = True;
6942 if (thr->llexit_done)
6943 VTS__declare_thread_very_dead(thr);
6947 /* Both Segs and SOs point to VTSs. However, there is no sharing, so
6948 a Seg that points at a VTS is its one-and-only owner, and ditto for
6949 a SO that points at a VTS. */
6951 SO* libhb_so_alloc ( void )
6953 return SO__Alloc();
6956 void libhb_so_dealloc ( SO* so )
6958 tl_assert(so);
6959 tl_assert(so->magic == SO_MAGIC);
6960 SO__Dealloc(so);
6963 /* See comments in libhb.h for details on the meaning of
6964 strong vs weak sends and strong vs weak receives. */
6965 void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6967 /* Copy the VTSs from 'thr' into the sync object, and then move
6968 the thread along one step. */
6970 tl_assert(so);
6971 tl_assert(so->magic == SO_MAGIC);
6973 /* stay sane .. a thread's read-clock must always lead or be the
6974 same as its write-clock */
6975 { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6976 tl_assert(leq);
6979 /* since we're overwriting the VtsIDs in the SO, we need to drop
6980 any references made by the previous contents thereof */
6981 if (so->viR == VtsID_INVALID) {
6982 tl_assert(so->viW == VtsID_INVALID);
6983 so->viR = thr->viR;
6984 so->viW = thr->viW;
6985 VtsID__rcinc(so->viR);
6986 VtsID__rcinc(so->viW);
6987 } else {
6988 /* In a strong send, we dump any previous VC in the SO and
6989 install the sending thread's VC instead. For a weak send we
6990 must join2 with what's already there. */
6991 tl_assert(so->viW != VtsID_INVALID);
6992 VtsID__rcdec(so->viR);
6993 VtsID__rcdec(so->viW);
6994 so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6995 so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6996 VtsID__rcinc(so->viR);
6997 VtsID__rcinc(so->viW);
7000 /* move both parent clocks along */
7001 VtsID__rcdec(thr->viR);
7002 VtsID__rcdec(thr->viW);
7003 thr->viR = VtsID__tick( thr->viR, thr );
7004 thr->viW = VtsID__tick( thr->viW, thr );
7005 if (!thr->llexit_done) {
7006 Filter__clear(thr->filter, "libhb_so_send");
7007 note_local_Kw_n_stack_for(thr);
7009 VtsID__rcinc(thr->viR);
7010 VtsID__rcinc(thr->viW);
7012 if (strong_send)
7013 show_thread_state("s-send", thr);
7014 else
7015 show_thread_state("w-send", thr);
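/* Editor's note -- a worked instance of the send rules above (a sketch,
   not from the original sources).  With thr->viW = {T1:4, T2:7} and the
   SO already holding viW = {T1:2, T2:9}:
      strong send:  so->viW := {T1:4, T2:7}          (overwrite)
      weak send:    so->viW := {T1:4, T2:9}          (join2)
   (and likewise for viR); in both cases the sender then ticks its own
   clocks and clears its filter. */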
7018 void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
7020 tl_assert(so);
7021 tl_assert(so->magic == SO_MAGIC);
7023 if (so->viR != VtsID_INVALID) {
7024 tl_assert(so->viW != VtsID_INVALID);
7026 /* Weak receive (basically, an R-acquisition of a R-W lock).
7027 This advances the read-clock of the receiver, but not the
7028 write-clock. */
7029 VtsID__rcdec(thr->viR);
7030 thr->viR = VtsID__join2( thr->viR, so->viR );
7031 VtsID__rcinc(thr->viR);
7033 /* At one point (r10589) it seemed safest to tick the clocks for
7034 the receiving thread after the join. But on reflection, I
7035 wonder if that might cause it to 'overtake' constraints,
7036 which could lead to missing races. So, back out that part of
7037 r10589. */
7038 //VtsID__rcdec(thr->viR);
7039 //thr->viR = VtsID__tick( thr->viR, thr );
7040 //VtsID__rcinc(thr->viR);
7042 /* For a strong receive, we also advance the receiver's write
7043 clock, which means the receive as a whole is essentially
7044 equivalent to a W-acquisition of a R-W lock. */
7045 if (strong_recv) {
7046 VtsID__rcdec(thr->viW);
7047 thr->viW = VtsID__join2( thr->viW, so->viW );
7048 VtsID__rcinc(thr->viW);
7050 /* See comment just above, re r10589. */
7051 //VtsID__rcdec(thr->viW);
7052 //thr->viW = VtsID__tick( thr->viW, thr );
7053 //VtsID__rcinc(thr->viW);
7056 if (thr->filter)
7057 Filter__clear(thr->filter, "libhb_so_recv");
7058 note_local_Kw_n_stack_for(thr);
7060 if (strong_recv)
7061 show_thread_state("s-recv", thr);
7062 else
7063 show_thread_state("w-recv", thr);
7065 } else {
7066 tl_assert(so->viW == VtsID_INVALID);
7067 /* Deal with degenerate case: 'so' has no vts, so there has been
7068 no message posted to it. Just ignore this case. */
7069 show_thread_state("d-recv", thr);
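/* Editor's note -- a worked instance of the receive rules above (a
   sketch, not from the original sources).  With
      thr = (viR {T1:4}, viW {T1:4})  and  so = (viR {T2:7}, viW {T2:7}),
   a weak receive gives thr = (viR {T1:4, T2:7}, viW {T1:4}) -- the
   R-acquisition case -- while a strong receive also joins viW, giving
   viW = {T1:4, T2:7}, the W-acquisition case described above. */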
7073 Bool libhb_so_everSent ( SO* so )
7075 if (so->viR == VtsID_INVALID) {
7076 tl_assert(so->viW == VtsID_INVALID);
7077 return False;
7078 } else {
7079 tl_assert(so->viW != VtsID_INVALID);
7080 return True;
#define XXX1 0 // 0x67a106c
#define XXX2 0

static inline Bool TRACEME(Addr a, SizeT szB) {
   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
   return False;
}
static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
{
   SVal sv = zsm_sread08(a);
   VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
   show_thread_state("", thr);
   VG_(printf)("%s","\n");
}

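/* Illustrative note: to trace the shadow state of a particular address,
   set XXX1 (and/or XXX2) above to that address and flip the
   "if (0 && TRACEME(...))" guards at the call sites below to
   "if (1 && TRACEME(...))".  Each guarded range operation then prints,
   via trace(), the shadow value at the start of the range together with
   the thread state, both before and after the update. */
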
void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal__mkC(thr->viW, thr->viW);
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
}

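/* Illustrative note: SVal__mkC(thr->viW, thr->viW) builds a constrained
   shadow value whose read- and write-constraints are both the allocating
   thread's current write clock, so the freshly created range behaves as
   if it had just been written by 'thr'; later accesses that do not
   happen-after this point are then candidates for race reports. */
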
void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
{
   /* do nothing */
}


/* Set the lines zix_start till zix_end to NOACCESS. */
static void zsm_secmap_line_range_noaccess (SecMap *sm,
                                            UInt zix_start, UInt zix_end)
{
   for (UInt lz = zix_start; lz <= zix_end; lz++) {
      LineZ* lineZ;
      lineZ = &sm->linesZ[lz];
      if (lineZ->dict[0] != SVal_INVALID) {
         rcdec_LineZ(lineZ);
         lineZ->dict[0] = SVal_NOACCESS;
         lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
      } else {
         clear_LineF_of_Z(lineZ);
      }
      for (UInt i = 0; i < N_LINE_ARANGE/4; i++)
         lineZ->ix2s[i] = 0; /* all refer to dict[0] */
   }
}

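/* Illustrative note: a LineZ stores one line of shadow memory in
   compressed form: up to four distinct shadow values in dict[0..3] plus
   a 2-bit dictionary index per byte, packed four to a UChar in ix2s[]
   (hence the N_LINE_ARANGE/4 loop bound above).  Setting dict[0] to
   SVal_NOACCESS, invalidating dict[1..3] and zeroing every index marks
   the whole line inaccessible without any per-byte work; when dict[0]
   was SVal_INVALID the line's data instead lived in an attached LineF,
   which clear_LineF_of_Z releases. */
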
/* Set the given range to SVal_NOACCESS in-place in the secmap.
   a must be cacheline aligned.  len must be a multiple of a cacheline
   and must be < N_SECMAP_ARANGE. */
static void zsm_sset_range_noaccess_in_secmap(Addr a, SizeT len)
{
   tl_assert (is_valid_scache_tag (a));
   tl_assert (0 == (len & (N_LINE_ARANGE - 1)));
   tl_assert (len < N_SECMAP_ARANGE);

   SecMap *sm1 = shmem__find_SecMap (a);
   SecMap *sm2 = shmem__find_SecMap (a + len - 1);
   UWord zix_start = shmem__get_SecMap_offset(a          ) >> N_LINE_BITS;
   UWord zix_end   = shmem__get_SecMap_offset(a + len - 1) >> N_LINE_BITS;

   if (sm1) {
      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm1));
      zsm_secmap_line_range_noaccess (sm1, zix_start,
                                      sm1 == sm2 ? zix_end : N_SECMAP_ZLINES-1);
   }
   if (sm2 && sm1 != sm2) {
      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm2));
      zsm_secmap_line_range_noaccess (sm2, 0, zix_end);
   }
}

/* Set the given address range to SVal_NOACCESS.
   The SecMaps fully set to SVal_NOACCESS will be pushed in SecMap_freelist. */
static void zsm_sset_range_noaccess (Addr addr, SizeT len)
{
   /*
      BPC = Before, Partial Cacheline, = addr
            (i.e. starting inside a cacheline/inside a SecMap)
      BFC = Before, Full Cacheline(s), but not full SecMap
            (i.e. starting inside a SecMap)
      FSM = Full SecMap(s)
            (i.e. starting a SecMap)
      AFC = After, Full Cacheline(s), but not full SecMap
            (i.e. first address after the full SecMap(s))
      APC = After, Partial Cacheline, i.e. first address after the
            full CacheLines.
      ARE = After Range End = addr+len = first address not part of the range.

      If addr     starts a Cacheline, then BPC == BFC.
      If addr     starts a SecMap,    then BPC == BFC == FSM.
      If addr+len starts a SecMap,    then APC == ARE == AFC.
      If addr+len starts a Cacheline, then APC == ARE.
   */
   Addr ARE = addr + len;
   Addr BPC = addr;
   Addr BFC = ROUNDUP(BPC, N_LINE_ARANGE);
   Addr FSM = ROUNDUP(BPC, N_SECMAP_ARANGE);
   Addr AFC = ROUNDDN(ARE, N_SECMAP_ARANGE);
   Addr APC = ROUNDDN(ARE, N_LINE_ARANGE);
   SizeT Plen = len; // Plen will be split between the following:
   SizeT BPClen;
   SizeT BFClen;
   SizeT FSMlen;
   SizeT AFClen;
   SizeT APClen;

   /* Consumes from Plen the nr of bytes between from and to.
      from and to must be aligned on a multiple of round.
      The length consumed will be a multiple of round, with
      a maximum of Plen. */
#  define PlenCONSUME(from, to, round, consumed) \
   do {                                          \
      if (from < to) {                           \
         if (to - from < Plen)                   \
            consumed = to - from;                \
         else                                    \
            consumed = ROUNDDN(Plen, round);     \
      } else {                                   \
         consumed = 0;                           \
      }                                          \
      Plen -= consumed; } while (0)

   PlenCONSUME(BPC, BFC, 1,               BPClen);
   PlenCONSUME(BFC, FSM, N_LINE_ARANGE,   BFClen);
   PlenCONSUME(FSM, AFC, N_SECMAP_ARANGE, FSMlen);
   PlenCONSUME(AFC, APC, N_LINE_ARANGE,   AFClen);
   PlenCONSUME(APC, ARE, 1,               APClen);
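
   /* Worked example (illustrative; assumes, purely for concreteness,
      N_LINE_ARANGE == 0x40 and N_SECMAP_ARANGE == 0x2000): for
      addr == 0x10030 and len == 0x5000 we get ARE == 0x15030,
      BPC == 0x10030, BFC == 0x10040, FSM == 0x12000, AFC == 0x14000,
      APC == 0x15000, and the consumption steps yield BPClen == 0x10,
      BFClen == 0x1fc0, FSMlen == 0x2000, AFClen == 0x1000 and
      APClen == 0x30, which sum to len == 0x5000 as the assertion below
      requires. */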

   if (0)
      VG_(printf) ("addr %p[%lu] ARE %p"
                   " BPC %p[%lu] BFC %p[%lu] FSM %p[%lu]"
                   " AFC %p[%lu] APC %p[%lu]\n",
                   (void*)addr, len, (void*)ARE,
                   (void*)BPC, BPClen, (void*)BFC, BFClen, (void*)FSM, FSMlen,
                   (void*)AFC, AFClen, (void*)APC, APClen);

   tl_assert (Plen == 0);

   /* Set to NOACCESS the pieces before and after that are not covered by
      entire SecMaps. */

   /* First we set the partial cachelines.  This is done through the cache. */
   if (BPClen > 0)
      zsm_sset_range_SMALL (BPC, BPClen, SVal_NOACCESS);
   if (APClen > 0)
      zsm_sset_range_SMALL (APC, APClen, SVal_NOACCESS);

   /* After this, we will not use the cache anymore.  We will directly work
      in-place on the z shadow memory in SecMap(s).
      So, we invalidate the cachelines for the whole range we are setting
      to NOACCESS below. */
   shmem__invalidate_scache_range (BFC, APC - BFC);

   if (BFClen > 0)
      zsm_sset_range_noaccess_in_secmap (BFC, BFClen);
   if (AFClen > 0)
      zsm_sset_range_noaccess_in_secmap (AFC, AFClen);

   if (FSMlen > 0) {
      /* Set to NOACCESS all the SecMaps, pushing the SecMaps to the
         free list. */
      Addr sm_start = FSM;
      while (sm_start < AFC) {
         SecMap *sm = shmem__find_SecMap (sm_start);
         if (sm) {
            Addr gaKey;
            SecMap *fm_sm;

            if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
            for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
               LineZ *lineZ = &sm->linesZ[lz];
               if (LIKELY(lineZ->dict[0] != SVal_INVALID))
                  rcdec_LineZ(lineZ);
               else
                  clear_LineF_of_Z(lineZ);
            }
            if (!VG_(delFromFM)(map_shmem, &gaKey, (UWord*)&fm_sm, sm_start))
               tl_assert (0);
            stats__secmaps_in_map_shmem--;
            tl_assert (gaKey == sm_start);
            tl_assert (sm == fm_sm);
            stats__secmaps_ssetGCed++;
            push_SecMap_on_freelist (sm);
         }
         sm_start += N_SECMAP_ARANGE;
      }
      tl_assert (sm_start == AFC);

      /* The above loop might have left pointers to freed SecMaps in the
         smCache, so clear any such entries. */
      if (address_in_range(smCache[0].gaKey, FSM, FSMlen)) {
         smCache[0].gaKey = 1;
         smCache[0].sm = NULL;
      }
      if (address_in_range(smCache[1].gaKey, FSM, FSMlen)) {
         smCache[1].gaKey = 1;
         smCache[1].sm = NULL;
      }
      if (address_in_range(smCache[2].gaKey, FSM, FSMlen)) {
         smCache[2].gaKey = 1;
         smCache[2].sm = NULL;
      }
      STATIC_ASSERT (3 == sizeof(smCache)/sizeof(SMCacheEnt));
   }
}

void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
{
   /* This really does put the requested range in NoAccess.  It's
      expensive though. */
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (LIKELY(szB < 2 * N_LINE_ARANGE))
      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
   else
      zsm_sset_range_noaccess (a, szB);
   Filter__clear_range( thr->filter, a, szB );
}

/* Works byte at a time.  Can be optimised if needed. */
UWord libhb_srange_get_abits (Addr a, UChar *abits, SizeT len)
{
   UWord anr = 0; // nr of bytes addressable.

   /* Get the accessibility of each byte.  Take care not to create a
      SecMap or LineZ while checking whether a byte is addressable.

      Note: this is used for client requests, so performance is not deemed
      critical and for simplicity we work byte by byte.  It could be sped
      up by handling full cachelines or full SecMaps whenever a cacheline
      or SecMap boundary is reached. */
   for (SizeT i = 0; i < len; i++) {
      SVal  sv    = SVal_INVALID;
      Addr  b     = a + i;
      Addr  tag   = b & ~(N_LINE_ARANGE - 1);
      UWord wix   = (b >> N_LINE_BITS) & (N_WAY_NENT - 1);
      UWord cloff = get_cacheline_offset(b);

      /* Note: we do not use get_cacheline(b) to avoid creating cachelines
         and/or SecMaps for non-addressable bytes. */
      if (tag == cache_shmem.tags0[wix]) {
         CacheLine copy = cache_shmem.lyns0[wix];
         /* We work on a copy of the cacheline, as we do not want to
            record the client request as a real read.
            The below is somewhat similar to zsm_sapply08__msmcread but
            avoids side effects on the cache. */
         UWord toff = get_tree_offset(b); /* == 0 .. 7 */
         UWord tno = get_treeno(b);
         UShort descr = copy.descrs[tno];
         if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
            SVal* tree = &copy.svals[tno << 3];
            copy.descrs[tno] = pulldown_to_8(tree, toff, descr);
         }
         sv = copy.svals[cloff];
      } else {
         /* Byte not found in the cacheline.  Search for a SecMap. */
         SecMap *sm = shmem__find_SecMap(b);
         LineZ *lineZ;
         if (sm == NULL)
            sv = SVal_NOACCESS;
         else {
            UWord zix = shmem__get_SecMap_offset(b) >> N_LINE_BITS;
            lineZ = &sm->linesZ[zix];
            if (lineZ->dict[0] == SVal_INVALID) {
               LineF *lineF = SVal2Ptr(lineZ->dict[1]);
               sv = lineF->w64s[cloff];
            } else {
               UWord ix = read_twobit_array( lineZ->ix2s, cloff );
               sv = lineZ->dict[ix];
            }
         }
      }

      tl_assert (sv != SVal_INVALID);
      if (sv == SVal_NOACCESS) {
         if (abits)
            abits[i] = 0x00;
      } else {
         if (abits)
            abits[i] = 0xff;
         anr++;
      }
   }

   return anr;
}

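/* Usage sketch (illustrative only): a caller wanting to know how much of
   a buffer is addressable might do
      UChar abits[100];
      UWord n_ok = libhb_srange_get_abits(buf, abits, 100);
   after which abits[i] is 0xff for each addressable byte and 0x00
   otherwise, with n_ok giving the count of addressable bytes.  Passing
   NULL for abits returns just the count. */
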
void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
   if (LIKELY(szB < 2 * N_LINE_ARANGE))
      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
   else
      zsm_sset_range_noaccess (a, szB);
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
}

Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
   tl_assert(thr);
   return thr->hgthread;
}

void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
   tl_assert(thr);
   thr->hgthread = hgthread;
}

void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
{
   zsm_scopy_range(src, dst, len);
   Filter__clear_range( thr->filter, dst, len );
}

void libhb_maybe_GC ( void )
{
   /* GC the unreferenced (zero rc) RCECs when
         (1) a significant nr of RCECs has been reached (to avoid scanning
             a contextTab consisting mostly of NULL ptrs)
     and (2) the max nr of RCECs is being approached (since the pool
             allocator in any case holds at least that many RCECs).
             Note: the margin of 1000 avoids a small but steady increase
             of the max nr of RCECs, which would otherwise occur because
             libhb_maybe_GC is not called at the exact moment the current
             nr of RCECs reaches the max.
     and (3) fewer than 75% of the RCECs are referenced.
     Keeping the nr of RCECs from growing too much keeps memory use low
     and avoids having too many elements in the (fixed-size) contextTab
     hashtable. */
   if (UNLIKELY(stats__ctxt_tab_curr > N_RCEC_TAB/2
                && stats__ctxt_tab_curr + 1000 >= stats__ctxt_tab_max
                && (stats__ctxt_tab_curr * 3)/4 > RCEC_referenced))
      do_RCEC_GC();

   /* If there are still no entries available (all the table entries are
      full), and we hit the threshold point, then do a GC. */
   Bool vts_tab_GC = vts_tab_freelist == VtsID_INVALID
      && VG_(sizeXA)( vts_tab ) >= vts_next_GC_at;
   if (UNLIKELY (vts_tab_GC))
      vts_tab__do_GC( False/*don't show stats*/ );

   /* Scan GC the SecMaps when
         (1) no SecMap is in the freelist
     and (2) the current nr of live SecMaps exceeds the threshold. */
   if (UNLIKELY(SecMap_freelist == NULL
                && stats__secmaps_in_map_shmem >= next_SecMap_GC_at)) {
      // If we did a vts tab GC, then no need to flush the cache again.
      if (!vts_tab_GC)
         zsm_flush_cache();
      shmem__SecMap_do_GC(True);
   }

   /* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts();
}

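/* Illustrative reading of the RCEC trigger above (the concrete table
   size is whatever N_RCEC_TAB is defined to elsewhere in this file):
   a GC only fires once the number of live RCECs exceeds half the number
   of contextTab buckets, the current count is within 1000 of the highest
   count seen so far (so the table really is about to grow), and more
   than a quarter of the live RCECs are unreferenced and can therefore be
   reclaimed by do_RCEC_GC(). */
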
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                               //
// SECTION END main library                                      //
//                                                               //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/*--------------------------------------------------------------------*/
/*--- end                                             libhb_main.c ---*/
/*--------------------------------------------------------------------*/