/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic chained hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure.  For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure.  The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp().  This is selected by specifying the
 * HASH_STRINGS flag to hash_create.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying the HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions.  At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Compared to simplehash, dynahash has the following benefits:
 *
 * - It supports partitioning, which is useful for shared memory access using
 *   locks.
 * - Shared memory hashes are allocated in a fixed size area at startup and
 *   are discoverable by name from other processes.
 * - Because entries don't need to be moved in the case of hash conflicts,
 *   dynahash has better performance for large entries.
 * - Guarantees stable pointers to entries.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */
#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"

/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	long		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one.  Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long		dsize;			/* directory size */
	long		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	long		num_partitions; /* # partitions (must be power of 2), or 0 */
	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long		accesses;
	long		collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */
	bool		isfixed;		/* if true, don't enlarge */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)			   ((x) & ((y)-1))

#ifdef HASH_STATISTICS
static long hash_accesses,
			hash_collisions,
			hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int	choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int	next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
	return MemoryContextAlloc(CurrentDynaHashCxt, size);
}

/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
	return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
 * optional, but the default keysize and entrysize values were useless.)
 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
 * binary blobs, or custom, respectively).  Callers specifying a custom
 * hash function will likely also want to use HASH_COMPARE, and perhaps
 * also HASH_KEYCOPY, to control key comparison and copying.
 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
 * under info->hcxt rather than under TopMemoryContext; the default
 * behavior is only suitable for session-lifespan hash tables.
 * Other flags bits are special-purpose and seldom used, except for those
 * associated with shared-memory hash tables, for which see ShmemInitHash().
 *
 * Fields in *info are read only when the associated flags bit is set.
 * It is not necessary to initialize other fields of *info.
 * Neither tabname nor *info need persist after the hash_create() call.
 *
 * Note: It is deprecated for callers of hash_create() to explicitly specify
 * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
 * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
 * one of these.
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */
	Assert(flags & HASH_ELEM);
	Assert(info->keysize > 0);
	Assert(info->entrysize >= info->keysize);

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
	}

	/* Initialize the hash header, plus a copy of the table name */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
	{
		Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
		hashp->hash = info->hash;
	}
	else if (flags & HASH_BLOBS)
	{
		Assert(!(flags & HASH_STRINGS));
		/* We can optimize hashing for common key sizes */
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
	{
		/*
		 * string_hash used to be considered the default hash method, and in a
		 * non-assert build it effectively still is.  But we now consider it
		 * an assertion error to not say HASH_STRINGS explicitly.  To help
		 * catch mistaken usage of HASH_STRINGS, we also insist on a
		 * reasonably long string length: if the keysize is only 4 or 8 bytes,
		 * it's almost certainly an integer or pointer not a string.
		 */
		Assert(flags & HASH_STRINGS);
		Assert(info->keysize > 8);

		hashp->hash = string_hash;
	}

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due to
	 * referencing a trampoline instead of the string_hash function proper.
	 * Specify HASH_STRINGS instead.
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
	{
		/*
		 * The signature of keycopy is meant for memcpy(), which returns
		 * void*, but strlcpy() returns size_t.  Since we never use the return
		 * value of keycopy, and size_t is pretty much always the same size as
		 * void *, this should be safe.  The extra cast in the middle is to
		 * avoid warnings from -Wcast-function-type.
		 */
		hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
	}
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
		 * well.
		 */
		hashp->hctl = info->hctl;
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2. Also, it must
		 * be less than INT_MAX (see init_htab()), so call the int version of
		 * next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/* remember the entry sizes, too */
	hctl->keysize = info->keysize;
	hctl->entrysize = info->entrysize;

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	if (flags & HASH_FIXED_SIZE)
		hashp->isfixed = true;
	return hashp;
}

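/*
 * Illustrative sketch only (not part of dynahash itself): how a backend-local
 * table keyed by a uint32 blob is typically created and probed.  The entry
 * struct, table name, and sizes below are hypothetical, and the whole block
 * is compiled out.
 */
#ifdef NOT_USED
typedef struct ExampleEntry
{
	uint32		key;			/* key must be the first keysize bytes */
	int			count;
} ExampleEntry;

static void
example_hash_usage(void)
{
	HASHCTL		ctl;
	HTAB	   *htab;
	ExampleEntry *entry;
	bool		found;
	uint32		key = 42;

	/* only the fields named by the flags need to be set in ctl */
	ctl.keysize = sizeof(uint32);
	ctl.entrysize = sizeof(ExampleEntry);
	htab = hash_create("example table", 128, &ctl, HASH_ELEM | HASH_BLOBS);

	/* HASH_ENTER creates the entry if absent; the key is copied for us */
	entry = (ExampleEntry *) hash_search(htab, &key, HASH_ENTER, &found);
	if (!found)
		entry->count = 0;
	entry->count++;

	hash_destroy(htab);
}
#endif							/* NOT_USED */
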
/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;

	MemSet(hctl, 0, sizeof(HASHHDR));

	hctl->dsize = DEF_DIRSIZE;
	hctl->nsegs = 0;

	hctl->num_partitions = 0;	/* not partitioned */

	/* table has no fixed maximum size */
	hctl->max_dsize = NO_MAX_DSIZE;

	hctl->ssize = DEF_SEGSIZE;
	hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
	hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
	int			nelem_alloc;
	Size		elementSize;
	Size		allocSize;

	/* Each element has a HASHELEMENT header plus user data. */
	/* NB: this had better match element_alloc() */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

	/*
	 * The idea here is to choose nelem_alloc at least 32, but round up so
	 * that the allocation request will be a power of 2 or just less. This
	 * makes little difference for hash tables in shared memory, but for hash
	 * tables managed by palloc, the allocation request will be rounded up to
	 * a power of 2 anyway.  If we fail to take this into account, we'll waste
	 * as much as half the allocated space.
	 */
	allocSize = 32 * 4;			/* assume elementSize at least 8 */
	do
	{
		allocSize <<= 1;
		nelem_alloc = allocSize / elementSize;
	} while (nelem_alloc < 32);

	return nelem_alloc;
}

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough. If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
			"TABLE POINTER   ", hashp,
			"DIRECTORY SIZE  ", hctl->dsize,
			"SEGMENT SIZE    ", hctl->ssize,
			"SEGMENT SHIFT   ", hctl->sshift,
			"MAX BUCKET      ", hctl->max_bucket,
			"HIGH MASK       ", hctl->high_mask,
			"LOW  MASK       ", hctl->low_mask,
			"NSEGS           ", hctl->nsegs);
#endif
	return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
	Size		size;
	long		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
	long		nBuckets,
				nSegments,
				nDirEntries;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters.  We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}

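/*
 * Illustrative sketch only (not part of this file): the usual pattern for a
 * partitioned shared-memory table is to reserve space with
 * hash_estimate_size() while sizing shared memory and then create or attach
 * with ShmemInitHash(), which supplies the HASH_SHARED_MEM, HASH_ALLOC and
 * HASH_DIRSIZE bits itself.  The table name, entry layout, sizes and
 * partition count below are hypothetical, and the block is compiled out.
 */
#ifdef NOT_USED
static HTAB *
example_shared_table_init(void)
{
	HASHCTL		info;
	long		init_size = 1024;
	long		max_size = 4096;

	/* shared-memory sizing would use hash_estimate_size(max_size, info.entrysize) */

	info.keysize = sizeof(uint32);
	info.entrysize = sizeof(uint32) * 4;	/* hypothetical entry layout */
	info.num_partitions = 16;	/* must be a power of 2 */

	return ShmemInitHash("example shared table",
						 init_size, max_size,
						 &info,
						 HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
}
#endif							/* NOT_USED */
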
/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
	if (hashp != NULL)
	{
		/* allocation method must be one we know how to free, too */
		Assert(hashp->alloc == DynaHashAlloc);
		/* so this hashtable must have its own context */
		Assert(hashp->hcxt != NULL);

		hash_stats("destroy", hashp);

		/*
		 * Free everything by destroying the hash table's memory context.
		 */
		MemoryContextDelete(hashp->hcxt);
	}
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
			where, hashp->hctl->accesses, hashp->hctl->collisions);

	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
			hashp->hctl->max_bucket, hashp->hctl->nsegs);
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
			where, hash_accesses, hash_collisions);
	fprintf(stderr, "hash_stats: total expansions %ld\n",
			hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
	return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
	uint32		bucket;

	bucket = hash_val & hctl->high_mask;
	if (bucket > hctl->max_bucket)
		bucket = bucket & hctl->low_mask;

	return bucket;
}

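/*
 * Illustrative sketch only: a caller of a partitioned table typically derives
 * its partition number (and hence which lock to take) from the low-order bits
 * of the hash value, e.g. with 16 partitions:
 *
 *		hashcode = get_hash_value(htab, &key);
 *		partition = hashcode % 16;		(equivalently, hashcode & 15)
 *
 * This agrees with calc_bucket() because both masks it applies are at least
 * num_partitions - 1, so every key in a given bucket shares the same
 * low-order bits and therefore the same partition.
 */
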
/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory.  Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp,
			const void *keyPtr,
			HASHACTION action,
			bool *foundPtr)
{
	return hash_search_with_hash_value(hashp,
									   keyPtr,
									   hashp->hash(keyPtr, hashp->keysize),
									   action,
									   foundPtr);
}

void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	uint32		bucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */
		if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	bucket = calc_bucket(hctl, hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER_NULL:
			/* ENTER_NULL does not work with palloc-based allocator */
			Assert(hashp->alloc != DynaHashAlloc);
			/* FALL THRU */

		case HASH_ENTER:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present.  Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation.  This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	HASHHDR    *hctl = hashp->hctl;
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = calc_bucket(hctl, existingElement->hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

	newbucket = calc_bucket(hctl, newhashvalue);

	segment_num = newbucket >> hashp->sshift;
	segment_ndx = MOD(newbucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		if (newElement != NULL)
			break;

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist.  In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator.  If that fails, though, we *MUST* root through all
		 * the other freelists before giving up.  There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
	int			i;
	long		sum = hashp->hctl->freeList[0].nentries;

	/*
	 * We currently don't bother with acquiring the mutexes; it's only
	 * sensible to call this function if you've got lock on all partitions of
	 * the table.
	 */
	if (IS_PARTITIONED(hashp->hctl))
	{
		for (i = 1; i < NUM_FREELISTS; i++)
			sum += hashp->hctl->freeList[i].nentries;
	}

	return sum;
}

/*
 * hash_seq_init/_search/_term
 *			Sequentially search through hash table and return
 *			all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
	status->hashp = hashp;
	status->curBucket = 0;
	status->curEntry = NULL;
	if (!hashp->frozen)
		register_seq_scan(hashp);
}

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
	if (!status->hashp->frozen)
		deregister_seq_scan(status->hashp);
}

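/*
 * Illustrative sketch only: the canonical scan loop.  Running the scan until
 * hash_seq_search returns NULL does the end-of-scan cleanup itself, so
 * hash_seq_term is only needed when bailing out early.  "ExampleEntry" and
 * the early-exit condition are hypothetical; the block is compiled out.
 */
#ifdef NOT_USED
static void
example_seq_scan(HTAB *htab)
{
	HASH_SEQ_STATUS status;
	ExampleEntry *entry;

	hash_seq_init(&status, htab);
	while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
	{
		if (entry->count < 0)
		{
			/* abandoning the scan early requires explicit termination */
			hash_seq_term(&status);
			break;
		}
	}
}
#endif							/* NOT_USED */
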
/*
 * hash_freeze
 *			Freeze a hashtable against future insertions (deletions are
 *			still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
	if (hashp->isshared)
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
	if (!hashp->frozen && has_seq_scans(hashp))
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
			 hashp->tabname);
	hashp->frozen = true;
}

/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	long		old_bucket,
				new_bucket;
	long		new_segnum,
				new_segndx;
	long		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hash_expansions++;
#endif

	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> hashp->sshift;
	new_segndx = MOD(new_bucket, hashp->ssize);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;
	old_segndx = MOD(old_bucket, hashp->ssize);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}

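/*
 * Worked example (illustrative): with max_bucket = 3, low_mask = 3 and
 * high_mask = 7, adding bucket 4 gives old_bucket = 4 & 3 = 0; entries in
 * bucket 0 whose hashvalue & 7 equals 4 are relocated to the new bucket and
 * the rest stay put.  The masks only advance when max_bucket crosses the next
 * power of 2 (here, when bucket 8 is added: low_mask becomes 7, high_mask 15).
 */
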
static bool
dir_realloc(HTAB *hashp)
{
	HASHSEGMENT *p;
	HASHSEGMENT *old_p;
	long		new_dsize;
	long		old_dirsize;
	long		new_dirsize;

	if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
		return false;

	/* Reallocate directory */
	new_dsize = hashp->hctl->dsize << 1;
	old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
	new_dirsize = new_dsize * sizeof(HASHSEGMENT);

	old_p = hashp->dir;
	CurrentDynaHashCxt = hashp->hcxt;
	p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

	if (p != NULL)
	{
		memcpy(p, old_p, old_dirsize);
		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
		hashp->dir = p;
		hashp->hctl->dsize = new_dsize;

		/* XXX assume the allocator is palloc, so we know how to free */
		Assert(hashp->alloc == DynaHashAlloc);
		pfree(old_p);

		return true;
	}

	return false;
}

static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT segp;

	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
	/*
	 * If the corruption is in a shared hashtable, we'd better force a
	 * systemwide restart.  Otherwise, just shut down this one backend.
	 */
	if (hashp->isshared)
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
	else
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
	/*
	 * guard against too-large input, which would be invalid for
	 * pg_ceil_log2_*()
	 */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

#if SIZEOF_LONG < 8
	return pg_ceil_log2_32(num);
#else
	return pg_ceil_log2_64(num);
#endif
}

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
	/* my_log2's internal range check is sufficient */
	return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return 1 << my_log2(num);
}

/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here.  The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits).  Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into.  This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow.  Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both.  It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us.  The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address.  We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];	/* subtransaction nest level */
static int	num_seq_scans = 0;

/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
	if (num_seq_scans >= MAX_SEQ_SCANS)
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
			 hashp->tabname);
	seq_scan_tables[num_seq_scans] = hashp;
	seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
	num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
	int			i;

	/* Search backward since it's most likely at the stack top */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_tables[i] == hashp)
		{
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
			return;
		}
	}
	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
		 hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
	int			i;

	for (i = 0; i < num_seq_scans; i++)
	{
		if (seq_scan_tables[i] == hashp)
			return true;
	}
	return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
	/*
	 * During abort cleanup, open scans are expected; just silently clean 'em
	 * out.  An open scan at commit means someone forgot a hash_seq_term()
	 * call, so complain.
	 *
	 * Note: it's tempting to try to print the tabname here, but refrain for
	 * fear of touching deallocated memory.  This isn't a user-facing message
	 * anyway, so it needn't be pretty.
	 */
	if (isCommit)
	{
		int			i;

		for (i = 0; i < num_seq_scans; i++)
		{
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
				 seq_scan_tables[i]);
		}
	}
	num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
	int			i;

	/*
	 * Search backward to make cleanup easy.  Note we must check all entries,
	 * not only those at the end of the array, because deletion technique
	 * doesn't keep them in order.
	 */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_level[i] >= nestDepth)
		{
			if (isCommit)
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
					 seq_scan_tables[i]);
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
		}
	}
}