src/Utils/QuickHash.h

   1 // TortoiseSVN - a Windows shell extension for easy version control
   2
   3 // Copyright (C) 2007-2007 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <stdexcept>
  20
  21
  22 /**
  23  * A quick linear (array) hash index class. It requires HF to
  24  * provide the following interface:
  25  *
  26  * value_type           data type of the hash
  27  * index_type           type of the index to store the hash
  28  * NO_INDEX                     an index type to mark empty buckets
  29  *
  30  * operator()           value_type -> size_t hash function
  31  * value()                      index_type -> value_type
  32  * equal()                      (value_type, index_type) -> bool
  33  *
  34  * The capacity approximately doubles by each rehash().
  35  * Only insertion and lookup are provided. Collisions are
  36  * resolved using linear probing.
  37  *
  38  * Use statistics() to monitor the cache performance.
  39  */
  40 template<class HF>
  41 class quick_hash
  42 {
  43 public:
  44
  45         typedef typename HF::value_type value_type;
  46         typedef typename HF::index_type index_type;
  47
  48         enum {NO_INDEX = (index_type)(HF::NO_INDEX)};
  49
  50         struct statistics_t
  51         {
  52                 size_t capacity;
  53                 size_t used;
  54                 size_t collisions;
  55                 size_t max_path;
  56                 size_t collision_path_sum;
  57
  58                 statistics_t()
  59                         : capacity (1)
  60                         , used (0)
  61                         , collisions (0)
  62                         , max_path (1)
  63                         , collision_path_sum (0)
  64                 {
  65                 }
  66         };
  67
  68 private:
  69
  70         class prime_grower
  71         {
  72         private:
  73
  74                 static const size_t primes[31];
  75                 statistics_t statistics;
  76                 size_t index;
  77
  78         public:
  79
  80                 prime_grower()
  81                         : index (0)
  82                         , statistics()
  83                 {
  84                         statistics.capacity = primes[index];
  85                 }
  86
  87                 size_t capacity() const
  88                 {
  89                         return statistics.capacity;
  90                 }
  91
  92                 size_t size() const
  93                 {
  94                         return statistics.used;
  95                 }
  96
  97                 size_t collisions() const
  98                 {
  99                         return statistics.collisions;
 100                 }
 101
 102                 void inserted_cleanly()
 103                 {
 104                         ++statistics.used;
 105                 }
 106
 107                 void inserted_collision (size_t path_size)
 108                 {
 109                         ++statistics.used;
 110                         ++statistics.collisions;
 111                         statistics.collision_path_sum += path_size;
 112                         if (statistics.max_path <= path_size)
 113                                 statistics.max_path = path_size + 1;
 114                 }
 115
 116                 void grow()
 117                 {
 118                         statistics.capacity = primes[++index];
 119                         statistics.collisions = 0;
 120                         statistics.used = 0;
 121                         statistics.collision_path_sum = 0;
 122                         statistics.max_path = 1;
 123                 }
 124
 125                 size_t map (size_t hash_value) const
 126                 {
 127                         return hash_value % capacity();
 128                 }
 129
 130                 size_t next (size_t index) const
 131                 {
 132                         return (index + 1381000000) % capacity();
 133                 }
 134
 135                 const statistics_t& get_statistics() const
 136                 {
 137                         return statistics;
 138                 }
 139         };
 140
 141         index_type* data;
 142         prime_grower grower;
 143         HF hf;
 144
 145 private:
 146
 147         /// check if we're allowed to add new entries to the hash
 148         /// without re-hashing.
 149         bool should_grow() const
 150         {
 151                 // grow, if there are many collisions
 152                 // or capacity is almost exceeded
 153                 // There must also be at least one empty entry
 154
 155                 return grower.size() + grower.collisions() + 1 >= grower.capacity();
 156         }
 157
 158         /// initialize the new array before re-hashing
 159         void create_data()
 160         {
 161                 size_t new_capacity = grower.capacity();
 162
 163                 data = new index_type[new_capacity];
 164                 stdext::unchecked_fill_n (data, new_capacity, NO_INDEX);
 165         }
 166
 167         /// add a value to the hash
 168         /// (must not be in it already; hash must not be full)
 169         void internal_insert (const value_type& value, index_type index)
 170         {
 171                 // first try: un-collisioned insertion
 172
 173                 size_t bucket = grower.map (hf (value));
 174                 index_type* target = data + bucket;
 175
 176                 if (*target == NO_INDEX)
 177                 {
 178                         *target = index;
 179                         grower.inserted_cleanly();
 180
 181                         return;
 182                 }
 183
 184                 // collision -> look for an empty bucket
 185
 186                 size_t collision_path_size = 0;
 187                 do
 188                 {
 189                         bucket = grower.next (bucket);
 190                         target = data + bucket;
 191                         ++collision_path_size;
 192                 }
 193                 while (*target != NO_INDEX);
 194
 195                 // insert collisioned item
 196
 197                 *target = index;
 198                 grower.inserted_collision (collision_path_size);
 199         }
 200
 201         void rehash (index_type* old_data, size_t old_data_size)
 202         {
 203                 create_data();
 204
 205                 for (size_t i = 0; i < old_data_size; ++i)
 206                 {
 207                         index_type index = old_data[i];
 208                         if (index != NO_INDEX)
 209                                 internal_insert (hf.value (index), index);
 210                 }
 211
 212                 delete[] old_data;
 213         }
 214
 215 public:
 216
 217         /// construction / destruction
 218         quick_hash (const HF& hash_function)
 219                 : data(NULL)
 220                 , hf (hash_function)
 221                 , grower()
 222         {
 223                 create_data();
 224         }
 225
 226         quick_hash (const quick_hash& rhs)
 227                 : data (NULL)
 228                 , hf (rhs.hf)
 229                 , grower (rhs.grower)
 230         {
 231                 create_data();
 232                 operator= (rhs);
 233         }
 234
 235         ~quick_hash()
 236         {
 237                 delete[] data;
 238         }
 239
 240         /// find the bucket containing the desired value;
 241         /// return NO_INDEX if not contained in hash
 242         index_type find (const value_type& value) const
 243         {
 244                 size_t bucket = grower.map (hf (value));
 245                 index_type index = data[bucket];
 246
 247                 while (index != NO_INDEX)
 248                 {
 249                         // found?
 250
 251                         if (hf.equal (value, index))
 252                                 break;
 253
 254                         // collision -> look at next bucket position
 255
 256                         bucket = grower.next (bucket);
 257                         index = data[bucket];
 258                 }
 259
 260                 // either found or not in hash
 261
 262                 return index;
 263         }
 264
 265         void insert (const value_type& value, index_type index)
 266         {
 267                 assert (find (value) == NO_INDEX);
 268
 269                 if (should_grow())
 270                         reserve (grower.capacity()+1);
 271
 272                 internal_insert (value, index);
 273         }
 274
 275         void reserve (size_t min_bucket_count)
 276         {
 277                 if (size_t(-1) / sizeof (index_type[4]) > min_bucket_count)
 278                         min_bucket_count *= 2;
 279
 280                 index_type* old_data = data;
 281                 size_t old_data_size = grower.capacity();
 282
 283                 while (grower.capacity() < min_bucket_count)
 284                         grower.grow();
 285
 286                 if (grower.capacity() != old_data_size)
 287                         rehash (old_data, old_data_size);
 288         }
 289
 290         /// assignment
 291
 292         quick_hash& operator=(const quick_hash& rhs)
 293         {
 294                 if (grower.capacity() != rhs.grower.capacity())
 295                 {
 296                         delete[] data;
 297                         data = new index_type [rhs.grower.capacity()];
 298                 }
 299
 300                 grower = rhs.grower;
 301
 302                 stdext::unchecked_copy (rhs.data, rhs.data + rhs.grower.capacity(), data);
 303
 304                 return *this;
 305         }
 306
 307         /// get rid of all entries
 308
 309         void clear()
 310         {
 311                 if (grower.size() > 0)
 312                 {
 313                         delete[] data;
 314                         grower = prime_grower();
 315                         create_data();
 316                 }
 317         }
 318
 319         /// efficiently exchange two containers
 320
 321         void swap (quick_hash& rhs)
 322         {
 323                 std::swap (data, rhs.data);
 324
 325                 prime_grower temp = grower;
 326                 grower = rhs.grower;
 327                 rhs.grower = temp;
 328         }
 329
 330         /// read cache performance statistics
 331         const statistics_t& statistics() const
 332         {
 333                 return grower.get_statistics();
 334         }
 335 };
 336
 337 template<class HF>
 338 const size_t quick_hash<HF>::prime_grower::primes[31] =
 339         {1, 3, 7, 17,
 340          31, 67, 127, 257,
 341          509, 1021, 2053, 4099,
 342          8191, 16381, 32771, 65537,
 343          131071, 262147, 524287, 1048573,
 344          2097143, 4194301, 8388617, 16777213,
 345          33554467, 67108859, 134217757, 268435459,
 346          536870909, 1073741827};
 347