src/hashtab.c

   1 /* vi:set ts=8 sts=4 sw=4:
   2  *
   3  * VIM - Vi IMproved    by Bram Moolenaar
   4  *
   5  * Do ":help uganda"  in Vim to read copying and usage conditions.
   6  * Do ":help credits" in Vim to see a list of people who contributed.
   7  * See README.txt for an overview of the Vim source code.
   8  */
   9
  10 /*
  11  * hashtab.c: Handling of a hashtable with Vim-specific properties.
  12  *
  13  * Each item in a hashtable has a NUL terminated string key.  A key can appear
  14  * only once in the table.
  15  *
  16  * A hash number is computed from the key for quick lookup.  When the hashes
  17  * of two different keys point to the same entry an algorithm is used to
  18  * iterate over other entries in the table until the right one is found.
  19  * To make the iteration work removed keys are different from entries where a
  20  * key was never present.
  21  *
  22  * The mechanism has been partly based on how Python Dictionaries are
  23  * implemented.  The algorithm is from Knuth Vol. 3, Sec. 6.4.
  24  *
  25  * The hashtable grows to accommodate more entries when needed.  At least 1/3
  26  * of the entries is empty to keep the lookup efficient (at the cost of extra
  27  * memory).
  28  */
  29
  30 #include "vim.h"
  31
  32 #if defined(FEAT_EVAL) || defined(FEAT_SYN_HL) || defined(PROTO)
  33
  34 #if 0
  35 # define HT_DEBUG       /* extra checks for table consistency  and statistics */
  36
  37 static long hash_count_lookup = 0;      /* count number of hashtab lookups */
  38 static long hash_count_perturb = 0;     /* count number of "misses" */
  39 #endif
  40
  41 /* Magic value for algorithm that walks through the array. */
  42 #define PERTURB_SHIFT 5
  43
  44 static int hash_may_resize __ARGS((hashtab_T *ht, int minitems));
  45
  46 #if 0 /* currently not used */
  47 /*
  48  * Create an empty hash table.
  49  * Returns NULL when out of memory.
  50  */
  51     hashtab_T *
  52 hash_create()
  53 {
  54     hashtab_T *ht;
  55
  56     ht = (hashtab_T *)alloc(sizeof(hashtab_T));
  57     if (ht != NULL)
  58         hash_init(ht);
  59     return ht;
  60 }
  61 #endif
  62
  63 /*
  64  * Initialize an empty hash table.
  65  */
  66     void
  67 hash_init(ht)
  68     hashtab_T *ht;
  69 {
  70     /* This zeroes all "ht_" entries and all the "hi_key" in "ht_smallarray". */
  71     vim_memset(ht, 0, sizeof(hashtab_T));
  72     ht->ht_array = ht->ht_smallarray;
  73     ht->ht_mask = HT_INIT_SIZE - 1;
  74 }
  75
  76 /*
  77  * Free the array of a hash table.  Does not free the items it contains!
  78  * If "ht" is not freed then you should call hash_init() next!
  79  */
  80     void
  81 hash_clear(ht)
  82     hashtab_T *ht;
  83 {
  84     if (ht->ht_array != ht->ht_smallarray)
  85         vim_free(ht->ht_array);
  86 }
  87
  88 /*
  89  * Free the array of a hash table and all the keys it contains.  The keys must
  90  * have been allocated.  "off" is the offset from the start of the allocate
  91  * memory to the location of the key (it's always positive).
  92  */
  93     void
  94 hash_clear_all(ht, off)
  95     hashtab_T   *ht;
  96     int         off;
  97 {
  98     long        todo;
  99     hashitem_T  *hi;
 100
 101     todo = (long)ht->ht_used;
 102     for (hi = ht->ht_array; todo > 0; ++hi)
 103     {
 104         if (!HASHITEM_EMPTY(hi))
 105         {
 106             vim_free(hi->hi_key - off);
 107             --todo;
 108         }
 109     }
 110     hash_clear(ht);
 111 }
 112
 113 /*
 114  * Find "key" in hashtable "ht".  "key" must not be NULL.
 115  * Always returns a pointer to a hashitem.  If the item was not found then
 116  * HASHITEM_EMPTY() is TRUE.  The pointer is then the place where the key
 117  * would be added.
 118  * WARNING: The returned pointer becomes invalid when the hashtable is changed
 119  * (adding, setting or removing an item)!
 120  */
 121     hashitem_T *
 122 hash_find(ht, key)
 123     hashtab_T   *ht;
 124     char_u      *key;
 125 {
 126     return hash_lookup(ht, key, hash_hash(key));
 127 }
 128
 129 /*
 130  * Like hash_find(), but caller computes "hash".
 131  */
 132     hashitem_T *
 133 hash_lookup(ht, key, hash)
 134     hashtab_T   *ht;
 135     char_u      *key;
 136     hash_T      hash;
 137 {
 138     hash_T      perturb;
 139     hashitem_T  *freeitem;
 140     hashitem_T  *hi;
 141     int         idx;
 142
 143 #ifdef HT_DEBUG
 144     ++hash_count_lookup;
 145 #endif
 146
 147     /*
 148      * Quickly handle the most common situations:
 149      * - return if there is no item at all
 150      * - skip over a removed item
 151      * - return if the item matches
 152      */
 153     idx = (int)(hash & ht->ht_mask);
 154     hi = &ht->ht_array[idx];
 155
 156     if (hi->hi_key == NULL)
 157         return hi;
 158     if (hi->hi_key == HI_KEY_REMOVED)
 159         freeitem = hi;
 160     else if (hi->hi_hash == hash && STRCMP(hi->hi_key, key) == 0)
 161         return hi;
 162     else
 163         freeitem = NULL;
 164
 165     /*
 166      * Need to search through the table to find the key.  The algorithm
 167      * to step through the table starts with large steps, gradually becoming
 168      * smaller down to (1/4 table size + 1).  This means it goes through all
 169      * table entries in the end.
 170      * When we run into a NULL key it's clear that the key isn't there.
 171      * Return the first available slot found (can be a slot of a removed
 172      * item).
 173      */
 174     for (perturb = hash; ; perturb >>= PERTURB_SHIFT)
 175     {
 176 #ifdef HT_DEBUG
 177         ++hash_count_perturb;       /* count a "miss" for hashtab lookup */
 178 #endif
 179         idx = (int)((idx << 2) + idx + perturb + 1);
 180         hi = &ht->ht_array[idx & ht->ht_mask];
 181         if (hi->hi_key == NULL)
 182             return freeitem == NULL ? hi : freeitem;
 183         if (hi->hi_hash == hash
 184                 && hi->hi_key != HI_KEY_REMOVED
 185                 && STRCMP(hi->hi_key, key) == 0)
 186             return hi;
 187         if (hi->hi_key == HI_KEY_REMOVED && freeitem == NULL)
 188             freeitem = hi;
 189     }
 190 }
 191
 192 /*
 193  * Print the efficiency of hashtable lookups.
 194  * Useful when trying different hash algorithms.
 195  * Called when exiting.
 196  */
 197     void
 198 hash_debug_results()
 199 {
 200 #ifdef HT_DEBUG
 201     fprintf(stderr, "\r\n\r\n\r\n\r\n");
 202     fprintf(stderr, "Number of hashtable lookups: %ld\r\n", hash_count_lookup);
 203     fprintf(stderr, "Number of perturb loops: %ld\r\n", hash_count_perturb);
 204     fprintf(stderr, "Percentage of perturb loops: %ld%%\r\n",
 205                                 hash_count_perturb * 100 / hash_count_lookup);
 206 #endif
 207 }
 208
 209 /*
 210  * Add item with key "key" to hashtable "ht".
 211  * Returns FAIL when out of memory or the key is already present.
 212  */
 213     int
 214 hash_add(ht, key)
 215     hashtab_T   *ht;
 216     char_u      *key;
 217 {
 218     hash_T      hash = hash_hash(key);
 219     hashitem_T  *hi;
 220
 221     hi = hash_lookup(ht, key, hash);
 222     if (!HASHITEM_EMPTY(hi))
 223     {
 224         EMSG2(_(e_intern2), "hash_add()");
 225         return FAIL;
 226     }
 227     return hash_add_item(ht, hi, key, hash);
 228 }
 229
 230 /*
 231  * Add item "hi" with "key" to hashtable "ht".  "key" must not be NULL and
 232  * "hi" must have been obtained with hash_lookup() and point to an empty item.
 233  * "hi" is invalid after this!
 234  * Returns OK or FAIL (out of memory).
 235  */
 236     int
 237 hash_add_item(ht, hi, key, hash)
 238     hashtab_T   *ht;
 239     hashitem_T  *hi;
 240     char_u      *key;
 241     hash_T      hash;
 242 {
 243     /* If resizing failed before and it fails again we can't add an item. */
 244     if (ht->ht_error && hash_may_resize(ht, 0) == FAIL)
 245         return FAIL;
 246
 247     ++ht->ht_used;
 248     if (hi->hi_key == NULL)
 249         ++ht->ht_filled;
 250     hi->hi_key = key;
 251     hi->hi_hash = hash;
 252
 253     /* When the space gets low may resize the array. */
 254     return hash_may_resize(ht, 0);
 255 }
 256
 257 #if 0  /* not used */
 258 /*
 259  * Overwrite hashtable item "hi" with "key".  "hi" must point to the item that
 260  * is to be overwritten.  Thus the number of items in the hashtable doesn't
 261  * change.
 262  * Although the key must be identical, the pointer may be different, thus it's
 263  * set anyway (the key is part of an item with that key).
 264  * The caller must take care of freeing the old item.
 265  * "hi" is invalid after this!
 266  */
 267     void
 268 hash_set(hi, key)
 269     hashitem_T  *hi;
 270     char_u      *key;
 271 {
 272     hi->hi_key = key;
 273 }
 274 #endif
 275
 276 /*
 277  * Remove item "hi" from  hashtable "ht".  "hi" must have been obtained with
 278  * hash_lookup().
 279  * The caller must take care of freeing the item itself.
 280  */
 281     void
 282 hash_remove(ht, hi)
 283     hashtab_T   *ht;
 284     hashitem_T  *hi;
 285 {
 286     --ht->ht_used;
 287     hi->hi_key = HI_KEY_REMOVED;
 288     hash_may_resize(ht, 0);
 289 }
 290
 291 /*
 292  * Lock a hashtable: prevent that ht_array changes.
 293  * Don't use this when items are to be added!
 294  * Must call hash_unlock() later.
 295  */
 296     void
 297 hash_lock(ht)
 298     hashtab_T   *ht;
 299 {
 300     ++ht->ht_locked;
 301 }
 302
 303 #if 0       /* currently not used */
 304 /*
 305  * Lock a hashtable at the specified number of entries.
 306  * Caller must make sure no more than "size" entries will be added.
 307  * Must call hash_unlock() later.
 308  */
 309     void
 310 hash_lock_size(ht, size)
 311     hashtab_T   *ht;
 312     int         size;
 313 {
 314     (void)hash_may_resize(ht, size);
 315     ++ht->ht_locked;
 316 }
 317 #endif
 318
 319 /*
 320  * Unlock a hashtable: allow ht_array changes again.
 321  * Table will be resized (shrink) when necessary.
 322  * This must balance a call to hash_lock().
 323  */
 324     void
 325 hash_unlock(ht)
 326     hashtab_T   *ht;
 327 {
 328     --ht->ht_locked;
 329     (void)hash_may_resize(ht, 0);
 330 }
 331
 332 /*
 333  * Shrink a hashtable when there is too much empty space.
 334  * Grow a hashtable when there is not enough empty space.
 335  * Returns OK or FAIL (out of memory).
 336  */
 337     static int
 338 hash_may_resize(ht, minitems)
 339     hashtab_T   *ht;
 340     int         minitems;               /* minimal number of items */
 341 {
 342     hashitem_T  temparray[HT_INIT_SIZE];
 343     hashitem_T  *oldarray, *newarray;
 344     hashitem_T  *olditem, *newitem;
 345     int         newi;
 346     int         todo;
 347     long_u      oldsize, newsize;
 348     long_u      minsize;
 349     long_u      newmask;
 350     hash_T      perturb;
 351
 352     /* Don't resize a locked table. */
 353     if (ht->ht_locked > 0)
 354         return OK;
 355
 356 #ifdef HT_DEBUG
 357     if (ht->ht_used > ht->ht_filled)
 358         EMSG("hash_may_resize(): more used than filled");
 359     if (ht->ht_filled >= ht->ht_mask + 1)
 360         EMSG("hash_may_resize(): table completely filled");
 361 #endif
 362
 363     if (minitems == 0)
 364     {
 365         /* Return quickly for small tables with at least two NULL items.  NULL
 366          * items are required for the lookup to decide a key isn't there. */
 367         if (ht->ht_filled < HT_INIT_SIZE - 1
 368                                          && ht->ht_array == ht->ht_smallarray)
 369             return OK;
 370
 371         /*
 372          * Grow or refill the array when it's more than 2/3 full (including
 373          * removed items, so that they get cleaned up).
 374          * Shrink the array when it's less than 1/5 full.  When growing it is
 375          * at least 1/4 full (avoids repeated grow-shrink operations)
 376          */
 377         oldsize = ht->ht_mask + 1;
 378         if (ht->ht_filled * 3 < oldsize * 2 && ht->ht_used > oldsize / 5)
 379             return OK;
 380
 381         if (ht->ht_used > 1000)
 382             minsize = ht->ht_used * 2;  /* it's big, don't make too much room */
 383         else
 384             minsize = ht->ht_used * 4;  /* make plenty of room */
 385     }
 386     else
 387     {
 388         /* Use specified size. */
 389         if ((long_u)minitems < ht->ht_used)     /* just in case... */
 390             minitems = (int)ht->ht_used;
 391         minsize = minitems * 3 / 2;     /* array is up to 2/3 full */
 392     }
 393
 394     newsize = HT_INIT_SIZE;
 395     while (newsize < minsize)
 396     {
 397         newsize <<= 1;          /* make sure it's always a power of 2 */
 398         if (newsize == 0)
 399             return FAIL;        /* overflow */
 400     }
 401
 402     if (newsize == HT_INIT_SIZE)
 403     {
 404         /* Use the small array inside the hashdict structure. */
 405         newarray = ht->ht_smallarray;
 406         if (ht->ht_array == newarray)
 407         {
 408             /* Moving from ht_smallarray to ht_smallarray!  Happens when there
 409              * are many removed items.  Copy the items to be able to clean up
 410              * removed items. */
 411             mch_memmove(temparray, newarray, sizeof(temparray));
 412             oldarray = temparray;
 413         }
 414         else
 415             oldarray = ht->ht_array;
 416     }
 417     else
 418     {
 419         /* Allocate an array. */
 420         newarray = (hashitem_T *)alloc((unsigned)
 421                                               (sizeof(hashitem_T) * newsize));
 422         if (newarray == NULL)
 423         {
 424             /* Out of memory.  When there are NULL items still return OK.
 425              * Otherwise set ht_error, because lookup may result in a hang if
 426              * we add another item. */
 427             if (ht->ht_filled < ht->ht_mask)
 428                 return OK;
 429             ht->ht_error = TRUE;
 430             return FAIL;
 431         }
 432         oldarray = ht->ht_array;
 433     }
 434     vim_memset(newarray, 0, (size_t)(sizeof(hashitem_T) * newsize));
 435
 436     /*
 437      * Move all the items from the old array to the new one, placing them in
 438      * the right spot.  The new array won't have any removed items, thus this
 439      * is also a cleanup action.
 440      */
 441     newmask = newsize - 1;
 442     todo = (int)ht->ht_used;
 443     for (olditem = oldarray; todo > 0; ++olditem)
 444         if (!HASHITEM_EMPTY(olditem))
 445         {
 446             /*
 447              * The algorithm to find the spot to add the item is identical to
 448              * the algorithm to find an item in hash_lookup().  But we only
 449              * need to search for a NULL key, thus it's simpler.
 450              */
 451             newi = (int)(olditem->hi_hash & newmask);
 452             newitem = &newarray[newi];
 453
 454             if (newitem->hi_key != NULL)
 455                 for (perturb = olditem->hi_hash; ; perturb >>= PERTURB_SHIFT)
 456                 {
 457                     newi = (int)((newi << 2) + newi + perturb + 1);
 458                     newitem = &newarray[newi & newmask];
 459                     if (newitem->hi_key == NULL)
 460                         break;
 461                 }
 462             *newitem = *olditem;
 463             --todo;
 464         }
 465
 466     if (ht->ht_array != ht->ht_smallarray)
 467         vim_free(ht->ht_array);
 468     ht->ht_array = newarray;
 469     ht->ht_mask = newmask;
 470     ht->ht_filled = ht->ht_used;
 471     ht->ht_error = FALSE;
 472
 473     return OK;
 474 }
 475
 476 /*
 477  * Get the hash number for a key.
 478  * If you think you know a better hash function: Compile with HT_DEBUG set and
 479  * run a script that uses hashtables a lot.  Vim will then print statistics
 480  * when exiting.  Try that with the current hash algorithm and yours.  The
 481  * lower the percentage the better.
 482  */
 483     hash_T
 484 hash_hash(key)
 485     char_u      *key;
 486 {
 487     hash_T      hash;
 488     char_u      *p;
 489
 490     if ((hash = *key) == 0)
 491         return (hash_T)0;       /* Empty keys are not allowed, but we don't
 492                                    want to crash if we get one. */
 493     p = key + 1;
 494
 495 #if 0
 496     /* ElfHash algorithm, which is supposed to have an even distribution.
 497      * Suggested by Charles Campbell. */
 498     hash_T      g;
 499
 500     while (*p != NUL)
 501     {
 502         hash = (hash << 4) + *p++;      /* clear low 4 bits of hash, add char */
 503         g = hash & 0xf0000000L;         /* g has high 4 bits of hash only */
 504         if (g != 0)
 505             hash ^= g >> 24;            /* xor g's high 4 bits into hash */
 506     }
 507 #else
 508
 509     /* A simplistic algorithm that appears to do very well.
 510      * Suggested by George Reilly. */
 511     while (*p != NUL)
 512         hash = hash * 101 + *p++;
 513 #endif
 514
 515     return hash;
 516 }
 517
 518 #endif