//-----------------------------------------------------------------------------
//
// Copyright (C) Microsoft Corporation. All Rights Reserved.
//
//-----------------------------------------------------------------------------
using System;
using System.Collections;

namespace Microsoft.Cci.Pdb {
  // The IntHashTable class represents a dictionary of associated keys and
  // values with constant lookup time.
  //
  // Objects used as keys in a hashtable must implement the GetHashCode
  // and Equals methods (or they can rely on the default implementations
  // inherited from Object if key equality is simply reference
  // equality). Furthermore, the GetHashCode and Equals methods of
  // a key object must produce the same results given the same parameters
  // for the entire time the key is present in the hashtable. In practical
  // terms, this means that key objects should be immutable, at least for
  // the time they are used as keys in a hashtable.
  //
  // When entries are added to a hashtable, they are placed into
  // buckets based on the hashcode of their keys. Subsequent lookups of
  // keys will use the hashcode of the keys to only search a particular
  // bucket, thus substantially reducing the number of key comparisons
  // required to find an entry. A hashtable's maximum load factor, which
  // can be specified when the hashtable is instantiated, determines the
  // maximum ratio of hashtable entries to hashtable buckets. Smaller load
  // factors cause faster average lookup times at the cost of increased
  // memory consumption. The default maximum load factor of 1.0 generally
  // provides the best balance between speed and size. As entries are added
  // to a hashtable, the hashtable's actual load factor increases, and when
  // the actual load factor reaches the maximum load factor value, the
  // number of buckets in the hashtable is automatically increased by
  // approximately a factor of two (to be precise, the number of hashtable
  // buckets is increased to the smallest prime number that is larger than
  // twice the current number of hashtable buckets).
  //
  // Each object provides its own hash function, accessed by calling
  // GetHashCode(). However, one can write an object
  // implementing IHashCodeProvider and pass it to a constructor on
  // the IntHashTable. That hash function would be used for all objects in
  // the table.
  //
  // This IntHashTable is implemented to support multiple concurrent readers
  // and one concurrent writer without using any synchronization primitives.
  // All read methods essentially must protect themselves from a resize
  // occurring while they are running. This was done by enforcing an
  // ordering on inserts & removes, as well as removing some member variables
  // and special casing the expand code to work in a temporary array instead
  // of the live bucket array. All inserts must set a bucket's value and
  // key before setting the hash code & collision field.
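  //
  // Typical usage (illustrative sketch; this example is not part of the
  // original source and the key/value pairs are hypothetical):
  //
  //   var table = new IntHashTable();
  //   table.Add(42, "forty-two");      // Add throws ArgumentException on duplicate keys
  //   table[7] = "seven";              // indexer-set overwrites an existing entry
  //   object v = table[42];            // "forty-two"; a missing key returns null
  //   bool found = table.Contains(7);  // true
  //   int n = table.Count;             // 2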
  //
  // By Brian Grunkemeyer, algorithm by Patrick Dussud.
  // Version 1.30 2/20/2000
  //| <include path='docs/doc[@for="IntHashTable"]/*' />
  internal class IntHashTable : IEnumerable {
    /*
      Implementation Notes:

      This IntHashTable uses double hashing. There are hashsize buckets in
      the table, and each bucket can contain 0 or 1 element. We use a bit to
      mark whether there's been a collision when we inserted multiple
      elements (ie, an inserted item was hashed at least a second time and
      we probed this bucket, but it was already in use). Using the
      collision bit, we can terminate lookups & removes for elements that
      aren't in the hash table more quickly. We steal the most
      significant bit from the hash code to store the collision bit.

      Our hash function is of the following form:

      h(key, n) = h1(key) + n*h2(key)

      where n is the number of times we've hit a collided bucket and
      rehashed (on this particular lookup). Here are our hash functions:

      h1(key) = GetHash(key);  // default implementation calls key.GetHashCode();
      h2(key) = 1 + (((h1(key) >> 5) + 1) % (hashsize - 1));

      The h1 can return any number. h2 must return a number between 1 and
      hashsize - 1 that is relatively prime to hashsize (not a problem if
      hashsize is prime). (Knuth's Art of Computer Programming, Vol. 3,
      p. 528-9)

      If this is true, then we are guaranteed to visit every bucket in
      exactly hashsize probes, since the least common multiple of hashsize
      and h2(key) will be hashsize * h2(key). (This is the first number
      where adding h2 to h1 mod hashsize will be 0 and we will search the
      same bucket twice).
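
      Illustrative example (not in the original notes; numbers chosen for
      readability): with hashsize = 11, suppose h1(key) = 7 and h2(key) = 4,
      which is relatively prime to 11. The probe sequence
      (seed % hashsize, adding h2 each time) is

        7, 0, 4, 8, 1, 5, 9, 2, 6, 10, 3

      which visits all 11 buckets exactly once before repeating.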

      We previously used a different h2(key, n) that was not constant.
      That is a horrifically bad idea, unless you can prove that series
      will never produce any identical numbers that overlap when you mod
      them by hashsize, for all subranges from i to i+hashsize, for all i.
      It's not worth investigating, since there was no clear benefit from
      using that hash function, and it was broken.

      For efficiency reasons, we've implemented this by storing h1 and h2
      in a temporary, and setting a variable called seed equal to h1. We
      do a probe, and if we collided, we simply add h2 to seed each time
      through the loop.

      A good test for h2() is to subclass IntHashTable, provide your own
      implementation of GetHash() that returns a constant, then add many
      items to the hash table. Make sure Count equals the number of items
      you inserted.

      -- Brian Grunkemeyer, 10/28/1999
    */

    // A typical resize algorithm would pick the smallest prime number in this array
    // that is larger than twice the previous capacity.
    // Suppose our Hashtable currently has capacity x and enough elements are added
    // such that a resize needs to occur. Resizing first computes 2x then finds the
    // first prime in the table greater than 2x, i.e. if primes are ordered
    // p_1, p_2, …, p_i, …, it finds p_n such that p_n-1 < 2x < p_n.
    // Doubling is important for preserving the asymptotic complexity of the
    // hashtable operations such as add. Having a prime guarantees that double
    // hashing does not lead to infinite loops. IE, your hash function will be
    // h1(key) + i*h2(key), 0 <= i < size. h2 and the size must be relatively prime.
    private static readonly int[] primes = {
      3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
      1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
      17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
      187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
      1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369};
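
    // Worked example (illustrative, not in the original source): a table with 131
    // buckets that fills past its load limit computes 1 + 131*2 = 263 in expand()
    // and grows to the first prime in this array that is >= 263, i.e. 293.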

    private int GetPrime(int minSize) {
      if (minSize < 0) {
        throw new ArgumentException("Arg_HTCapacityOverflow");
      }
      for (int i = 0; i < primes.Length; i++) {
        int size = primes[i];
        if (size >= minSize) {
          return size;
        }
      }
      throw new ArgumentException("Arg_HTCapacityOverflow");
    }

    // Deleted entries have their key set to buckets

    // The hash table data.
    // This cannot be serialised
    private struct bucket {
      internal int key;
      internal int hash_coll;   // Store hash code; sign bit means there was a collision.
      internal Object val;
    }

    private bucket[] buckets;

    // The total number of entries in the hash table.
    private int count;

    // The total number of collision bits set in the hashtable
    private int occupancy;

    private int loadsize;
    private int loadFactorPerc;    // 100 = 1.0

    private int version;

    // Constructs a new hashtable. The hashtable is created with an initial
    // capacity of zero and a load factor of 1.0.
    //| <include path='docs/doc[@for="IntHashTable.IntHashTable"]/*' />
    internal IntHashTable()
      : this(0, 100) {
    }

    // Constructs a new hashtable with the given initial capacity and a load
    // factor of 1.0. The capacity argument serves as an indication of
    // the number of entries the hashtable will contain. When this number (or
    // an approximation) is known, specifying it in the constructor can
    // eliminate a number of resizing operations that would otherwise be
    // performed when elements are added to the hashtable.
    //
    //| <include path='docs/doc[@for="IntHashTable.IntHashTable1"]/*' />
    internal IntHashTable(int capacity)
      : this(capacity, 100) {
    }

    // Constructs a new hashtable with the given initial capacity and load
    // factor. The capacity argument serves as an indication of the
    // number of entries the hashtable will contain. When this number (or an
    // approximation) is known, specifying it in the constructor can eliminate
    // a number of resizing operations that would otherwise be performed when
    // elements are added to the hashtable. The loadFactorPerc argument
    // indicates the maximum ratio of hashtable entries to hashtable buckets.
    // Smaller load factors cause faster average lookup times at the cost of
    // increased memory consumption. A load factor of 1.0 generally provides
    // the best balance between speed and size.
    //
    //| <include path='docs/doc[@for="IntHashTable.IntHashTable3"]/*' />
    internal IntHashTable(int capacity, int loadFactorPerc) {
      if (capacity < 0)
        throw new ArgumentOutOfRangeException("capacity", "ArgumentOutOfRange_NeedNonNegNum");
      if (!(loadFactorPerc >= 10 && loadFactorPerc <= 100))
        throw new ArgumentOutOfRangeException("loadFactorPerc", String.Format("ArgumentOutOfRange_IntHashTableLoadFactor", 10, 100));

      // Based on perf work, .72 is the optimal load factor for this table.
      this.loadFactorPerc = (loadFactorPerc * 72) / 100;

      int hashsize = GetPrime((int)(capacity / this.loadFactorPerc));
      buckets = new bucket[hashsize];

      loadsize = (int)(this.loadFactorPerc * hashsize) / 100;
      if (loadsize >= hashsize)
        loadsize = hashsize - 1;
    }

    // Computes the hash function: H(key, i) = h1(key) + i*h2(key, hashSize).
    // The out parameter seed is h1(key), while the out parameter
    // incr is h2(key, hashSize). Callers of this function should
    // add incr each time through a loop.
    private uint InitHash(int key, int hashsize, out uint seed, out uint incr) {
      // Hashcode must be positive. Also, we must not use the sign bit, since
      // that is used for the collision bit.
      uint hashcode = (uint)key & 0x7FFFFFFF;
      seed = (uint)hashcode;
      // Restriction: incr MUST be between 1 and hashsize - 1, inclusive for
      // the modular arithmetic to work correctly. This guarantees you'll
      // visit every bucket in the table exactly once within hashsize
      // iterations. Violate this and it'll cause obscure bugs forever.
      // If you change this calculation for h2(key), update putEntry too!
      incr = (uint)(1 + (((seed >> 5) + 1) % ((uint)hashsize - 1)));
      return hashcode;
    }

    // Adds an entry with the given key and value to this hashtable. An
    // ArgumentException is thrown if the key is less than zero or if the key
    // is already present in the hashtable.
    //
    //| <include path='docs/doc[@for="IntHashTable.Add"]/*' />
    internal void Add(int key, Object value) {
      Insert(key, value, true);
    }

    // Removes all entries from this hashtable.
    //| <include path='docs/doc[@for="IntHashTable.Clear"]/*' />
    internal void Clear() {
      if (count == 0)
        return;

      for (int i = 0; i < buckets.Length; i++) {
        buckets[i].hash_coll = 0;
        buckets[i].key = -1;
        buckets[i].val = null;
      }

      count = 0;
      occupancy = 0;
    }

    // Checks if this hashtable contains an entry with the given key. This is
    // an O(1) operation.
    //
    //| <include path='docs/doc[@for="IntHashTable.Contains"]/*' />
    internal bool Contains(int key) {
      if (key < 0) {
        throw new ArgumentException("Argument_KeyLessThanZero");
      }

      uint seed;
      uint incr;
      // Take a snapshot of buckets, in case another thread resizes table
      bucket[] lbuckets = buckets;
      uint hashcode = InitHash(key, lbuckets.Length, out seed, out incr);
      int ntry = 0;

      bucket b;
      do {
        int bucketNumber = (int)(seed % (uint)lbuckets.Length);
        b = lbuckets[bucketNumber];
        if (b.val == null) {
          return false;
        }
        if (((b.hash_coll & 0x7FFFFFFF) == hashcode) && b.key == key) {
          return true;
        }
        seed += incr;
      } while (b.hash_coll < 0 && ++ntry < lbuckets.Length);
      return false;
    }

    // Returns the value associated with the given key. If an entry with the
    // given key is not found, the returned value is null.
    //
    //| <include path='docs/doc[@for="IntHashTable.this"]/*' />
    internal Object this[int key] {
      get {
        if (key < 0) {
          throw new ArgumentException("Argument_KeyLessThanZero");
        }
        uint seed;
        uint incr;
        // Take a snapshot of buckets, in case another thread does a resize
        bucket[] lbuckets = buckets;
        uint hashcode = InitHash(key, lbuckets.Length, out seed, out incr);
        int ntry = 0;

        bucket b;
        do {
          int bucketNumber = (int)(seed % (uint)lbuckets.Length);
          b = lbuckets[bucketNumber];
          if (b.val == null) {
            return null;
          }
          if (((b.hash_coll & 0x7FFFFFFF) == hashcode) && key == b.key) {
            return b.val;
          }
          seed += incr;
        } while (b.hash_coll < 0 && ++ntry < lbuckets.Length);
        return null;
      }
      set {
        Insert(key, value, false);
      }
    }

    // Increases the bucket count of this hashtable. This method is called from
    // the Insert method when the actual load factor of the hashtable reaches
    // the upper limit specified when the hashtable was constructed. The number
    // of buckets in the hashtable is increased to the smallest prime number
    // that is larger than twice the current number of buckets, and the entries
    // in the hashtable are redistributed into the new buckets using the cached
    // hashcodes.
    private void expand() {
      rehash(GetPrime(1 + buckets.Length * 2));
    }

    // We occasionally need to rehash the table to clean up the collision bits.
    private void rehash() {
      rehash(buckets.Length);
    }

    private void rehash(int newsize) {

      // reset occupancy
      occupancy = 0;

      // Don't replace any internal state until we've finished adding to the
      // new bucket[]. This serves two purposes:
      //   1) Allow concurrent readers to see valid hashtable contents
      //      at all times
      //   2) Protect against an OutOfMemoryException while allocating this
      //      new bucket[].
      bucket[] newBuckets = new bucket[newsize];

      // rehash table into new buckets
      int nb;
      for (nb = 0; nb < buckets.Length; nb++) {
        bucket oldb = buckets[nb];
        if (oldb.val != null) {
          putEntry(newBuckets, oldb.key, oldb.val, oldb.hash_coll & 0x7FFFFFFF);
        }
      }

      // New bucket[] is good to go - replace buckets and other internal state.
      version++;
      buckets = newBuckets;
      loadsize = (int)(loadFactorPerc * newsize) / 100;

      if (loadsize >= newsize) {
        loadsize = newsize - 1;
      }
      return;
    }

    // Returns an enumerator for this hashtable.
    // If modifications are made to the hashtable while an enumeration is
    // in progress, the MoveNext and Current methods of the
    // enumerator will throw an exception.
    //
    //| <include path='docs/doc[@for="IntHashTable.IEnumerable.GetEnumerator"]/*' />
    IEnumerator IEnumerable.GetEnumerator() {
      return new IntHashTableEnumerator(this);
    }
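
    // Enumeration example (illustrative sketch, not part of the original source):
    // the enumerator yields the stored values via Current; keys are only held in
    // the private enumerator's Key property and are not exposed here.
    //
    //   foreach (object val in table) {   // 'table' is a hypothetical IntHashTable
    //     Console.WriteLine(val);
    //   }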

    // Internal method to compare two keys.
    //
    // Inserts an entry into this hashtable. This method is called from the Set
    // and Add methods. If the add parameter is true and the given key already
    // exists in the hashtable, an exception is thrown.
    private void Insert(int key, Object nvalue, bool add) {
      if (key < 0) {
        throw new ArgumentException("Argument_KeyLessThanZero");
      }
      if (nvalue == null) {
        throw new ArgumentNullException("nvalue", "ArgumentNull_Value");
      }
      if (count >= loadsize) {
        expand();
      } else if (occupancy > loadsize && count > 100) {
        rehash();
      }

      uint seed;
      uint incr;
      // Assume we only have one thread writing concurrently. Modify
      // buckets to contain new data, as long as we insert in the right order.
      uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
      int ntry = 0;
      int emptySlotNumber = -1; // We use the empty slot number to cache the first empty slot. We chose to reuse slots
      // created by remove that have the collision bit set over using up new slots.

      do {
        int bucketNumber = (int)(seed % (uint)buckets.Length);

        // Set emptySlot number to current bucket if it is the first available bucket that we have seen
        // that once contained an entry and also has had a collision.
        // We need to search this entire collision chain because we have to ensure that there are no
        // duplicate entries in the table.

        // Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
        // OR
        // This bucket once contained an entry but there has never been a collision
        if (buckets[bucketNumber].val == null) {
          // If we have found an available bucket that has never had a collision, but we've seen an available
          // bucket in the past that has the collision bit set, use the previous bucket instead
          if (emptySlotNumber != -1) { // Reuse slot
            bucketNumber = emptySlotNumber;
          }

          // We pretty much have to insert in this order. Don't set hash
          // code until the value & key are set appropriately.
          buckets[bucketNumber].val = nvalue;
          buckets[bucketNumber].key = key;
          buckets[bucketNumber].hash_coll |= (int)hashcode;
          count++;
          version++;
          return;
        }

        // The current bucket is in use
        // OR
        // it is available, but has had the collision bit set and we have already found an available bucket
        if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
            key == buckets[bucketNumber].key) {
          if (add) {
            throw new ArgumentException("Argument_AddingDuplicate__" + buckets[bucketNumber].key);
          }
          buckets[bucketNumber].val = nvalue;
          version++;
          return;
        }

        // The current bucket is full, and we have therefore collided. We need to set the collision bit
        // UNLESS
        // we have remembered an available slot previously.
        if (emptySlotNumber == -1) { // We don't need to set the collision bit here since we already have an empty slot
          if (buckets[bucketNumber].hash_coll >= 0) {
            buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
            occupancy++;
          }
        }
        seed += incr;
      } while (++ntry < buckets.Length);

      // This code is here if and only if there were no buckets without a collision bit set in the entire table
      if (emptySlotNumber != -1) {
        // We pretty much have to insert in this order. Don't set hash
        // code until the value & key are set appropriately.
        buckets[emptySlotNumber].val = nvalue;
        buckets[emptySlotNumber].key = key;
        buckets[emptySlotNumber].hash_coll |= (int)hashcode;
        count++;
        version++;
        return;
      }

      // If you see this assert, make sure load factor & count are reasonable.
      // Then verify that our double hash function (h2, described at top of file)
      // meets the requirements described above. You should never see this assert.
      throw new InvalidOperationException("InvalidOperation_HashInsertFailed");
    }

    private void putEntry(bucket[] newBuckets, int key, Object nvalue, int hashcode) {
      uint seed = (uint)hashcode;
      uint incr = (uint)(1 + (((seed >> 5) + 1) % ((uint)newBuckets.Length - 1)));

      do {
        int bucketNumber = (int)(seed % (uint)newBuckets.Length);

        if ((newBuckets[bucketNumber].val == null)) {
          newBuckets[bucketNumber].val = nvalue;
          newBuckets[bucketNumber].key = key;
          newBuckets[bucketNumber].hash_coll |= hashcode;
          return;
        }

        if (newBuckets[bucketNumber].hash_coll >= 0) {
          newBuckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
          occupancy++;
        }
        seed += incr;
      } while (true);
    }

    // Returns the number of associations in this hashtable.
    //
    //| <include path='docs/doc[@for="IntHashTable.Count"]/*' />
    internal int Count {
      get { return count; }
    }

    // Implements an enumerator for a hashtable. The enumerator uses the
    // internal version number of the hashtable to ensure that no modifications
    // are made to the hashtable while an enumeration is in progress.
    private class IntHashTableEnumerator : IEnumerator {
      private IntHashTable hashtable;
      private int bucket;
      private int version;
      private bool current;
      private int currentKey;
      private Object currentValue;

      internal IntHashTableEnumerator(IntHashTable hashtable) {
        this.hashtable = hashtable;
        bucket = hashtable.buckets.Length;
        version = hashtable.version;
        current = false;
      }

      public bool MoveNext() {
        if (version != hashtable.version)
          throw new InvalidOperationException("InvalidOperation_EnumFailedVersion");
        while (bucket > 0) {
          bucket--;
          Object val = hashtable.buckets[bucket].val;
          if (val != null) {
            currentKey = hashtable.buckets[bucket].key;
            currentValue = val;
            current = true;
            return true;
          }
        }
        current = false;
        return false;
      }
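
      // Illustrative note (not part of the original source): MoveNext compares the
      // version captured at construction with hashtable.version, so a write made
      // during enumeration is detected on the next step, e.g.
      //
      //   IEnumerator e = ((IEnumerable)table).GetEnumerator();  // 'table' is hypothetical
      //   e.MoveNext();
      //   table.Add(99, "late");   // Insert bumps table.version
      //   e.MoveNext();            // throws InvalidOperationException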

      internal int Key {
        get {
          if (current == false)
            throw new InvalidOperationException("InvalidOperation_EnumOpCantHappen");
          return currentKey;
        }
      }

      public Object Current {
        get {
          if (current == false)
            throw new InvalidOperationException("InvalidOperation_EnumOpCantHappen");
          return currentValue;
        }
      }

      public Object Value {
        get {
          if (version != hashtable.version)
            throw new InvalidOperationException("InvalidOperation_EnumFailedVersion");
          if (current == false)
            throw new InvalidOperationException("InvalidOperation_EnumOpCantHappen");
          return currentValue;
        }
      }

      public void Reset() {
        if (version != hashtable.version) throw new InvalidOperationException("InvalidOperation_EnumFailedVersion");
        current = false;
        bucket = hashtable.buckets.Length;
        currentKey = -1;
        currentValue = null;
      }
    }
  }
}