1 /*-------------------------------------------------------------------------
4 * dynamic chained hash tables
6 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 * shared memory. For shared hash tables, it is the caller's responsibility
8 * to provide appropriate access interlocking. The simplest convention is
9 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 * hash_seq_search) need only shared lock, but any update requires exclusive
11 * lock. For heavily-used shared tables, the single-lock approach creates a
12 * concurrency bottleneck, so we also support "partitioned" locking wherein
13 * there are multiple LWLocks guarding distinct subsets of the table. To use
14 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 * Therefore, each hash bucket chain operates independently, and no fields
17 * of the hash header change after init except nentries and freeList.
18 * (A partitioned table uses multiple copies of those fields, guarded by
19 * spinlocks, for additional concurrency.)
20 * This lets any subset of the hash buckets be treated as a separately
21 * lockable partition. We expect callers to use the low-order bits of a
22 * lookup key's hash value as a partition number --- this will work because
23 * of the way calc_bucket() maps hash values to bucket numbers.
25 * For hash tables in shared memory, the memory allocator function should
26 * match malloc's semantics of returning NULL on failure. For hash tables
27 * in local memory, we typically use palloc() which will throw error on
28 * failure. The code in this file has to cope with both cases.
30 * dynahash.c provides support for these types of lookup keys:
32 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
33 * compared as though by strcmp(). This is selected by specifying the
34 * HASH_STRINGS flag to hash_create.
36 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
37 * (Caller must ensure there are no undefined padding bits in the keys!)
38 * This is selected by specifying the HASH_BLOBS flag to hash_create.
40 * 3. More complex key behavior can be selected by specifying user-supplied
41 * hashing, comparison, and/or key-copying functions. At least a hashing
42 * function must be supplied; comparison defaults to memcmp() and key copying
43 * to memcpy() when a user-defined hashing function is selected.
45 * Compared to simplehash, dynahash has the following benefits:
47 * - It supports partitioning, which is useful for shared memory access using
49 * - Shared memory hashes are allocated in a fixed size area at startup and
50 * are discoverable by name from other processes.
51 * - Because entries don't need to be moved in the case of hash conflicts,
52 * dynahash has better performance for large entries.
53 * - Guarantees stable pointers to entries.
55 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
56 * Portions Copyright (c) 1994, Regents of the University of California
60 * src/backend/utils/hash/dynahash.c
62 *-------------------------------------------------------------------------
68 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
69 * Coded into C, with minor code improvements, and with hsearch(3) interface,
70 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
71 * also, hcreate/hdestroy routines added to simulate hsearch(3).
73 * These routines simulate hsearch(3) and family, with the important
74 * difference that the hash table is dynamic - can grow indefinitely
75 * beyond its original size (as supplied to hcreate()).
77 * Performance appears to be comparable to that of hsearch(3).
78 * The 'source-code' options referred to in hsearch(3)'s 'man' page
79 * are not implemented; otherwise functionality is identical.
81 * Compilation controls:
82 * HASH_DEBUG controls some informative traces, mainly for debugging.
83 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
84 * when combined with HASH_DEBUG, these are displayed by hdestroy().
86 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 * concatenation property, in probably unnecessary code 'optimization'.
89 * Modified margo@postgres.berkeley.edu February 1990
90 * added multiple table interface
91 * Modified by sullivan@postgres.berkeley.edu April 1990
92 * changed ctl structure for shared memory
#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"
/*
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32
130 /* A hash bucket is a linked list of HASHELEMENTs */
131 typedef HASHELEMENT
*HASHBUCKET
;
133 /* A hash segment is an array of bucket headers */
134 typedef HASHBUCKET
*HASHSEGMENT
;
139 * In a partitioned hash table, each freelist is associated with a specific
140 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
141 * nentries tracks the number of live hashtable entries having those hashcodes
142 * (NOT the number of entries in the freelist, as you might expect).
144 * The coverage of a freelist might be more or less than one partition, so it
145 * needs its own lock rather than relying on caller locking. Relying on that
146 * wouldn't work even if the coverage was the same, because of the occasional
147 * need to "borrow" entries from another freelist; see get_hash_entry().
149 * Using an array of FreeListData instead of separate arrays of mutexes,
150 * nentries and freeLists helps to reduce sharing of cache lines between
155 slock_t mutex
; /* spinlock for this freelist */
156 long nentries
; /* number of entries in associated buckets */
157 HASHELEMENT
*freeList
; /* chain of free elements */
161 * Header structure for a hash table --- contains all changeable info
163 * In a shared-memory hash table, the HASHHDR is in shared memory, while
164 * each backend has a local HTAB struct. For a non-shared table, there isn't
165 * any functional difference between HASHHDR and HTAB, but we separate them
166 * anyway to share code between shared and non-shared tables.
171 * The freelist can become a point of contention in high-concurrency hash
172 * tables, so we use an array of freelists, each with its own mutex and
173 * nentries count, instead of just a single one. Although the freelists
174 * normally operate independently, we will scavenge entries from freelists
175 * other than a hashcode's default freelist when necessary.
177 * If the hash table is not partitioned, only freeList[0] is used and its
178 * spinlock is not used at all; callers' locking is assumed sufficient.
180 FreeListData freeList
[NUM_FREELISTS
];
182 /* These fields can change, but not in a partitioned table */
183 /* Also, dsize can't change in a shared table, even if unpartitioned */
184 long dsize
; /* directory size */
185 long nsegs
; /* number of allocated segments (<= dsize) */
186 uint32 max_bucket
; /* ID of maximum bucket in use */
187 uint32 high_mask
; /* mask to modulo into entire table */
188 uint32 low_mask
; /* mask to modulo into lower half of table */
190 /* These fields are fixed at hashtable creation */
191 Size keysize
; /* hash key length in bytes */
192 Size entrysize
; /* total user element size in bytes */
193 long num_partitions
; /* # partitions (must be power of 2), or 0 */
194 long max_dsize
; /* 'dsize' limit if directory is fixed size */
195 long ssize
; /* segment size --- must be power of 2 */
196 int sshift
; /* segment shift = log2(ssize) */
197 int nelem_alloc
; /* number of entries to allocate at once */
199 #ifdef HASH_STATISTICS
202 * Count statistics here. NB: stats code doesn't bother with mutex, so
203 * counts could be corrupted a bit in a partitioned table.
210 #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
212 #define FREELIST_IDX(hctl, hashcode) \
213 (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
216 * Top control structure for a hashtable --- in a shared table, each backend
217 * has its own copy (OK since no fields change at runtime)
221 HASHHDR
*hctl
; /* => shared control information */
222 HASHSEGMENT
*dir
; /* directory of segment starts */
223 HashValueFunc hash
; /* hash function */
224 HashCompareFunc match
; /* key comparison function */
225 HashCopyFunc keycopy
; /* key copying function */
226 HashAllocFunc alloc
; /* memory allocator */
227 MemoryContext hcxt
; /* memory context if default allocator used */
228 char *tabname
; /* table name (for error messages) */
229 bool isshared
; /* true if table is in shared memory */
230 bool isfixed
; /* if true, don't enlarge */
232 /* freezing a shared table isn't allowed, so we can keep state here */
233 bool frozen
; /* true = no more inserts allowed */
235 /* We keep local copies of these fixed values to reduce contention */
236 Size keysize
; /* hash key length in bytes */
237 long ssize
; /* segment size --- must be power of 2 */
238 int sshift
; /* segment shift = log2(ssize) */
/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)			   ((x) & ((y)-1))
#ifdef HASH_STATISTICS
/* process-wide counters, summed across all tables; see hash_stats() */
static long hash_accesses,
			hash_collisions,
			hash_expansions;
#endif
264 * Private function prototypes
266 static void *DynaHashAlloc(Size size
);
267 static HASHSEGMENT
seg_alloc(HTAB
*hashp
);
268 static bool element_alloc(HTAB
*hashp
, int nelem
, int freelist_idx
);
269 static bool dir_realloc(HTAB
*hashp
);
270 static bool expand_table(HTAB
*hashp
);
271 static HASHBUCKET
get_hash_entry(HTAB
*hashp
, int freelist_idx
);
272 static void hdefault(HTAB
*hashp
);
273 static int choose_nelem_alloc(Size entrysize
);
274 static bool init_htab(HTAB
*hashp
, long nelem
);
275 static void hash_corrupted(HTAB
*hashp
);
276 static long next_pow2_long(long num
);
277 static int next_pow2_int(long num
);
278 static void register_seq_scan(HTAB
*hashp
);
279 static void deregister_seq_scan(HTAB
*hashp
);
280 static bool has_seq_scans(HTAB
*hashp
);
284 * memory allocation support
286 static MemoryContext CurrentDynaHashCxt
= NULL
;
289 DynaHashAlloc(Size size
)
291 Assert(MemoryContextIsValid(CurrentDynaHashCxt
));
292 return MemoryContextAlloc(CurrentDynaHashCxt
, size
);
297 * HashCompareFunc for string keys
299 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
300 * bytes, so we can only compare that many ... hence strncmp is almost but
301 * not quite the right thing.
304 string_compare(const char *key1
, const char *key2
, Size keysize
)
306 return strncmp(key1
, key2
, keysize
- 1);
310 /************************** CREATE ROUTINES **********************/
313 * hash_create -- create a new dynamic hash table
315 * tabname: a name for the table (for debugging purposes)
316 * nelem: maximum number of elements expected
317 * *info: additional table parameters, as indicated by flags
318 * flags: bitmask indicating which parameters to take from *info
320 * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
321 * optional, but the default keysize and entrysize values were useless.)
322 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
323 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
324 * binary blobs, or custom, respectively). Callers specifying a custom
325 * hash function will likely also want to use HASH_COMPARE, and perhaps
326 * also HASH_KEYCOPY, to control key comparison and copying.
327 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
328 * under info->hcxt rather than under TopMemoryContext; the default
329 * behavior is only suitable for session-lifespan hash tables.
330 * Other flags bits are special-purpose and seldom used, except for those
331 * associated with shared-memory hash tables, for which see ShmemInitHash().
333 * Fields in *info are read only when the associated flags bit is set.
334 * It is not necessary to initialize other fields of *info.
335 * Neither tabname nor *info need persist after the hash_create() call.
337 * Note: It is deprecated for callers of hash_create() to explicitly specify
338 * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
339 * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
342 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
343 * estimate, since we can't expand the table on the fly. But an unshared
344 * hashtable can be expanded on-the-fly, so it's better for nelem to be
345 * on the small side and let the table grow if it's exceeded. An overly
346 * large nelem will penalize hash_seq_search speed without buying much.
349 hash_create(const char *tabname
, long nelem
, const HASHCTL
*info
, int flags
)
355 * Hash tables now allocate space for key and data, but you have to say
356 * how much space to allocate.
358 Assert(flags
& HASH_ELEM
);
359 Assert(info
->keysize
> 0);
360 Assert(info
->entrysize
>= info
->keysize
);
363 * For shared hash tables, we have a local hash header (HTAB struct) that
364 * we allocate in TopMemoryContext; all else is in shared memory.
366 * For non-shared hash tables, everything including the hash header is in
367 * a memory context created specially for the hash table --- this makes
368 * hash_destroy very simple. The memory context is made a child of either
369 * a context specified by the caller, or TopMemoryContext if nothing is
372 if (flags
& HASH_SHARED_MEM
)
374 /* Set up to allocate the hash header */
375 CurrentDynaHashCxt
= TopMemoryContext
;
379 /* Create the hash table's private memory context */
380 if (flags
& HASH_CONTEXT
)
381 CurrentDynaHashCxt
= info
->hcxt
;
383 CurrentDynaHashCxt
= TopMemoryContext
;
384 CurrentDynaHashCxt
= AllocSetContextCreate(CurrentDynaHashCxt
,
386 ALLOCSET_DEFAULT_SIZES
);
389 /* Initialize the hash header, plus a copy of the table name */
390 hashp
= (HTAB
*) DynaHashAlloc(sizeof(HTAB
) + strlen(tabname
) + 1);
391 MemSet(hashp
, 0, sizeof(HTAB
));
393 hashp
->tabname
= (char *) (hashp
+ 1);
394 strcpy(hashp
->tabname
, tabname
);
396 /* If we have a private context, label it with hashtable's name */
397 if (!(flags
& HASH_SHARED_MEM
))
398 MemoryContextSetIdentifier(CurrentDynaHashCxt
, hashp
->tabname
);
401 * Select the appropriate hash function (see comments at head of file).
403 if (flags
& HASH_FUNCTION
)
405 Assert(!(flags
& (HASH_BLOBS
| HASH_STRINGS
)));
406 hashp
->hash
= info
->hash
;
408 else if (flags
& HASH_BLOBS
)
410 Assert(!(flags
& HASH_STRINGS
));
411 /* We can optimize hashing for common key sizes */
412 if (info
->keysize
== sizeof(uint32
))
413 hashp
->hash
= uint32_hash
;
415 hashp
->hash
= tag_hash
;
420 * string_hash used to be considered the default hash method, and in a
421 * non-assert build it effectively still is. But we now consider it
422 * an assertion error to not say HASH_STRINGS explicitly. To help
423 * catch mistaken usage of HASH_STRINGS, we also insist on a
424 * reasonably long string length: if the keysize is only 4 or 8 bytes,
425 * it's almost certainly an integer or pointer not a string.
427 Assert(flags
& HASH_STRINGS
);
428 Assert(info
->keysize
> 8);
430 hashp
->hash
= string_hash
;
434 * If you don't specify a match function, it defaults to string_compare if
435 * you used string_hash, and to memcmp otherwise.
437 * Note: explicitly specifying string_hash is deprecated, because this
438 * might not work for callers in loadable modules on some platforms due to
439 * referencing a trampoline instead of the string_hash function proper.
440 * Specify HASH_STRINGS instead.
442 if (flags
& HASH_COMPARE
)
443 hashp
->match
= info
->match
;
444 else if (hashp
->hash
== string_hash
)
445 hashp
->match
= (HashCompareFunc
) string_compare
;
447 hashp
->match
= memcmp
;
450 * Similarly, the key-copying function defaults to strlcpy or memcpy.
452 if (flags
& HASH_KEYCOPY
)
453 hashp
->keycopy
= info
->keycopy
;
454 else if (hashp
->hash
== string_hash
)
457 * The signature of keycopy is meant for memcpy(), which returns
458 * void*, but strlcpy() returns size_t. Since we never use the return
459 * value of keycopy, and size_t is pretty much always the same size as
460 * void *, this should be safe. The extra cast in the middle is to
461 * avoid warnings from -Wcast-function-type.
463 hashp
->keycopy
= (HashCopyFunc
) (pg_funcptr_t
) strlcpy
;
466 hashp
->keycopy
= memcpy
;
468 /* And select the entry allocation function, too. */
469 if (flags
& HASH_ALLOC
)
470 hashp
->alloc
= info
->alloc
;
472 hashp
->alloc
= DynaHashAlloc
;
474 if (flags
& HASH_SHARED_MEM
)
477 * ctl structure and directory are preallocated for shared memory
478 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
481 hashp
->hctl
= info
->hctl
;
482 hashp
->dir
= (HASHSEGMENT
*) (((char *) info
->hctl
) + sizeof(HASHHDR
));
484 hashp
->isshared
= true;
486 /* hash table already exists, we're just attaching to it */
487 if (flags
& HASH_ATTACH
)
489 /* make local copies of some heavily-used values */
491 hashp
->keysize
= hctl
->keysize
;
492 hashp
->ssize
= hctl
->ssize
;
493 hashp
->sshift
= hctl
->sshift
;
500 /* setup hash table defaults */
503 hashp
->hcxt
= CurrentDynaHashCxt
;
504 hashp
->isshared
= false;
509 hashp
->hctl
= (HASHHDR
*) hashp
->alloc(sizeof(HASHHDR
));
512 (errcode(ERRCODE_OUT_OF_MEMORY
),
513 errmsg("out of memory")));
516 hashp
->frozen
= false;
522 if (flags
& HASH_PARTITION
)
524 /* Doesn't make sense to partition a local hash table */
525 Assert(flags
& HASH_SHARED_MEM
);
528 * The number of partitions had better be a power of 2. Also, it must
529 * be less than INT_MAX (see init_htab()), so call the int version of
532 Assert(info
->num_partitions
== next_pow2_int(info
->num_partitions
));
534 hctl
->num_partitions
= info
->num_partitions
;
537 if (flags
& HASH_SEGMENT
)
539 hctl
->ssize
= info
->ssize
;
540 hctl
->sshift
= my_log2(info
->ssize
);
541 /* ssize had better be a power of 2 */
542 Assert(hctl
->ssize
== (1L << hctl
->sshift
));
546 * SHM hash tables have fixed directory size passed by the caller.
548 if (flags
& HASH_DIRSIZE
)
550 hctl
->max_dsize
= info
->max_dsize
;
551 hctl
->dsize
= info
->dsize
;
554 /* remember the entry sizes, too */
555 hctl
->keysize
= info
->keysize
;
556 hctl
->entrysize
= info
->entrysize
;
558 /* make local copies of heavily-used constant fields */
559 hashp
->keysize
= hctl
->keysize
;
560 hashp
->ssize
= hctl
->ssize
;
561 hashp
->sshift
= hctl
->sshift
;
563 /* Build the hash directory structure */
564 if (!init_htab(hashp
, nelem
))
565 elog(ERROR
, "failed to initialize hash table \"%s\"", hashp
->tabname
);
568 * For a shared hash table, preallocate the requested number of elements.
569 * This reduces problems with run-time out-of-shared-memory conditions.
571 * For a non-shared hash table, preallocate the requested number of
572 * elements if it's less than our chosen nelem_alloc. This avoids wasting
573 * space if the caller correctly estimates a small table size.
575 if ((flags
& HASH_SHARED_MEM
) ||
576 nelem
< hctl
->nelem_alloc
)
584 * If hash table is partitioned, give each freelist an equal share of
585 * the initial allocation. Otherwise only freeList[0] is used.
587 if (IS_PARTITIONED(hashp
->hctl
))
588 freelist_partitions
= NUM_FREELISTS
;
590 freelist_partitions
= 1;
592 nelem_alloc
= nelem
/ freelist_partitions
;
593 if (nelem_alloc
<= 0)
597 * Make sure we'll allocate all the requested elements; freeList[0]
598 * gets the excess if the request isn't divisible by NUM_FREELISTS.
600 if (nelem_alloc
* freelist_partitions
< nelem
)
602 nelem
- nelem_alloc
* (freelist_partitions
- 1);
604 nelem_alloc_first
= nelem_alloc
;
606 for (i
= 0; i
< freelist_partitions
; i
++)
608 int temp
= (i
== 0) ? nelem_alloc_first
: nelem_alloc
;
610 if (!element_alloc(hashp
, temp
, i
))
612 (errcode(ERRCODE_OUT_OF_MEMORY
),
613 errmsg("out of memory")));
617 if (flags
& HASH_FIXED_SIZE
)
618 hashp
->isfixed
= true;
623 * Set default HASHHDR parameters.
626 hdefault(HTAB
*hashp
)
628 HASHHDR
*hctl
= hashp
->hctl
;
630 MemSet(hctl
, 0, sizeof(HASHHDR
));
632 hctl
->dsize
= DEF_DIRSIZE
;
635 hctl
->num_partitions
= 0; /* not partitioned */
637 /* table has no fixed maximum size */
638 hctl
->max_dsize
= NO_MAX_DSIZE
;
640 hctl
->ssize
= DEF_SEGSIZE
;
641 hctl
->sshift
= DEF_SEGSIZE_SHIFT
;
643 #ifdef HASH_STATISTICS
644 hctl
->accesses
= hctl
->collisions
= 0;
649 * Given the user-specified entry size, choose nelem_alloc, ie, how many
650 * elements to add to the hash table when we need more.
653 choose_nelem_alloc(Size entrysize
)
659 /* Each element has a HASHELEMENT header plus user data. */
660 /* NB: this had better match element_alloc() */
661 elementSize
= MAXALIGN(sizeof(HASHELEMENT
)) + MAXALIGN(entrysize
);
664 * The idea here is to choose nelem_alloc at least 32, but round up so
665 * that the allocation request will be a power of 2 or just less. This
666 * makes little difference for hash tables in shared memory, but for hash
667 * tables managed by palloc, the allocation request will be rounded up to
668 * a power of 2 anyway. If we fail to take this into account, we'll waste
669 * as much as half the allocated space.
671 allocSize
= 32 * 4; /* assume elementSize at least 8 */
675 nelem_alloc
= allocSize
/ elementSize
;
676 } while (nelem_alloc
< 32);
682 * Compute derived fields of hctl and build the initial directory/segment
686 init_htab(HTAB
*hashp
, long nelem
)
688 HASHHDR
*hctl
= hashp
->hctl
;
695 * initialize mutexes if it's a partitioned table
697 if (IS_PARTITIONED(hctl
))
698 for (i
= 0; i
< NUM_FREELISTS
; i
++)
699 SpinLockInit(&(hctl
->freeList
[i
].mutex
));
702 * Allocate space for the next greater power of two number of buckets,
703 * assuming a desired maximum load factor of 1.
705 nbuckets
= next_pow2_int(nelem
);
708 * In a partitioned table, nbuckets must be at least equal to
709 * num_partitions; were it less, keys with apparently different partition
710 * numbers would map to the same bucket, breaking partition independence.
711 * (Normally nbuckets will be much bigger; this is just a safety check.)
713 while (nbuckets
< hctl
->num_partitions
)
716 hctl
->max_bucket
= hctl
->low_mask
= nbuckets
- 1;
717 hctl
->high_mask
= (nbuckets
<< 1) - 1;
720 * Figure number of directory segments needed, round up to a power of 2
722 nsegs
= (nbuckets
- 1) / hctl
->ssize
+ 1;
723 nsegs
= next_pow2_int(nsegs
);
726 * Make sure directory is big enough. If pre-allocated directory is too
727 * small, choke (caller screwed up).
729 if (nsegs
> hctl
->dsize
)
737 /* Allocate a directory */
740 CurrentDynaHashCxt
= hashp
->hcxt
;
741 hashp
->dir
= (HASHSEGMENT
*)
742 hashp
->alloc(hctl
->dsize
* sizeof(HASHSEGMENT
));
747 /* Allocate initial segments */
748 for (segp
= hashp
->dir
; hctl
->nsegs
< nsegs
; hctl
->nsegs
++, segp
++)
750 *segp
= seg_alloc(hashp
);
755 /* Choose number of entries to allocate at a time */
756 hctl
->nelem_alloc
= choose_nelem_alloc(hctl
->entrysize
);
759 fprintf(stderr
, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
760 "TABLE POINTER ", hashp
,
761 "DIRECTORY SIZE ", hctl
->dsize
,
762 "SEGMENT SIZE ", hctl
->ssize
,
763 "SEGMENT SHIFT ", hctl
->sshift
,
764 "MAX BUCKET ", hctl
->max_bucket
,
765 "HIGH MASK ", hctl
->high_mask
,
766 "LOW MASK ", hctl
->low_mask
,
767 "NSEGS ", hctl
->nsegs
);
773 * Estimate the space needed for a hashtable containing the given number
774 * of entries of given size.
775 * NOTE: this is used to estimate the footprint of hashtables in shared
776 * memory; therefore it does not count HTAB which is in local memory.
777 * NB: assumes that all hash structure parameters have default values!
780 hash_estimate_size(long num_entries
, Size entrysize
)
790 /* estimate number of buckets wanted */
791 nBuckets
= next_pow2_long(num_entries
);
792 /* # of segments needed for nBuckets */
793 nSegments
= next_pow2_long((nBuckets
- 1) / DEF_SEGSIZE
+ 1);
794 /* directory entries */
795 nDirEntries
= DEF_DIRSIZE
;
796 while (nDirEntries
< nSegments
)
797 nDirEntries
<<= 1; /* dir_alloc doubles dsize at each call */
799 /* fixed control info */
800 size
= MAXALIGN(sizeof(HASHHDR
)); /* but not HTAB, per above */
802 size
= add_size(size
, mul_size(nDirEntries
, sizeof(HASHSEGMENT
)));
804 size
= add_size(size
, mul_size(nSegments
,
805 MAXALIGN(DEF_SEGSIZE
* sizeof(HASHBUCKET
))));
806 /* elements --- allocated in groups of choose_nelem_alloc() entries */
807 elementAllocCnt
= choose_nelem_alloc(entrysize
);
808 nElementAllocs
= (num_entries
- 1) / elementAllocCnt
+ 1;
809 elementSize
= MAXALIGN(sizeof(HASHELEMENT
)) + MAXALIGN(entrysize
);
810 size
= add_size(size
,
811 mul_size(nElementAllocs
,
812 mul_size(elementAllocCnt
, elementSize
)));
818 * Select an appropriate directory size for a hashtable with the given
819 * maximum number of entries.
820 * This is only needed for hashtables in shared memory, whose directories
821 * cannot be expanded dynamically.
822 * NB: assumes that all hash structure parameters have default values!
824 * XXX this had better agree with the behavior of init_htab()...
827 hash_select_dirsize(long num_entries
)
833 /* estimate number of buckets wanted */
834 nBuckets
= next_pow2_long(num_entries
);
835 /* # of segments needed for nBuckets */
836 nSegments
= next_pow2_long((nBuckets
- 1) / DEF_SEGSIZE
+ 1);
837 /* directory entries */
838 nDirEntries
= DEF_DIRSIZE
;
839 while (nDirEntries
< nSegments
)
840 nDirEntries
<<= 1; /* dir_alloc doubles dsize at each call */
846 * Compute the required initial memory allocation for a shared-memory
847 * hashtable with the given parameters. We need space for the HASHHDR
848 * and for the (non expansible) directory.
851 hash_get_shared_size(HASHCTL
*info
, int flags
)
853 Assert(flags
& HASH_DIRSIZE
);
854 Assert(info
->dsize
== info
->max_dsize
);
855 return sizeof(HASHHDR
) + info
->dsize
* sizeof(HASHSEGMENT
);
859 /********************** DESTROY ROUTINES ************************/
862 hash_destroy(HTAB
*hashp
)
866 /* allocation method must be one we know how to free, too */
867 Assert(hashp
->alloc
== DynaHashAlloc
);
868 /* so this hashtable must have its own context */
869 Assert(hashp
->hcxt
!= NULL
);
871 hash_stats("destroy", hashp
);
874 * Free everything by destroying the hash table's memory context.
876 MemoryContextDelete(hashp
->hcxt
);
881 hash_stats(const char *where
, HTAB
*hashp
)
883 #ifdef HASH_STATISTICS
884 fprintf(stderr
, "%s: this HTAB -- accesses %ld collisions %ld\n",
885 where
, hashp
->hctl
->accesses
, hashp
->hctl
->collisions
);
887 fprintf(stderr
, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
888 hash_get_num_entries(hashp
), (long) hashp
->hctl
->keysize
,
889 hashp
->hctl
->max_bucket
, hashp
->hctl
->nsegs
);
890 fprintf(stderr
, "%s: total accesses %ld total collisions %ld\n",
891 where
, hash_accesses
, hash_collisions
);
892 fprintf(stderr
, "hash_stats: total expansions %ld\n",
897 /*******************************SEARCH ROUTINES *****************************/
901 * get_hash_value -- exported routine to calculate a key's hash value
903 * We export this because for partitioned tables, callers need to compute
904 * the partition number (from the low-order bits of the hash value) before
908 get_hash_value(HTAB
*hashp
, const void *keyPtr
)
910 return hashp
->hash(keyPtr
, hashp
->keysize
);
913 /* Convert a hash value to a bucket number */
915 calc_bucket(HASHHDR
*hctl
, uint32 hash_val
)
919 bucket
= hash_val
& hctl
->high_mask
;
920 if (bucket
> hctl
->max_bucket
)
921 bucket
= bucket
& hctl
->low_mask
;
927 * hash_search -- look up key in table and perform action
928 * hash_search_with_hash_value -- same, with key's hash value already computed
931 * HASH_FIND: look up key in table
932 * HASH_ENTER: look up key in table, creating entry if not present
933 * HASH_ENTER_NULL: same, but return NULL if out of memory
934 * HASH_REMOVE: look up key in table, remove entry if present
936 * Return value is a pointer to the element found/entered/removed if any,
937 * or NULL if no match was found. (NB: in the case of the REMOVE action,
938 * the result is a dangling pointer that shouldn't be dereferenced!)
940 * HASH_ENTER will normally ereport a generic "out of memory" error if
941 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
942 * the same except it will return NULL if out of memory. Note that
943 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
944 * since palloc internally ereports on out-of-memory.
946 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
947 * existing entry in the table, false otherwise. This is needed in the
948 * HASH_ENTER case, but is redundant with the return value otherwise.
950 * For hash_search_with_hash_value, the hashvalue parameter must have been
951 * calculated with get_hash_value().
954 hash_search(HTAB
*hashp
,
959 return hash_search_with_hash_value(hashp
,
961 hashp
->hash(keyPtr
, hashp
->keysize
),
967 hash_search_with_hash_value(HTAB
*hashp
,
973 HASHHDR
*hctl
= hashp
->hctl
;
974 int freelist_idx
= FREELIST_IDX(hctl
, hashvalue
);
980 HASHBUCKET currBucket
;
981 HASHBUCKET
*prevBucketPtr
;
982 HashCompareFunc match
;
984 #ifdef HASH_STATISTICS
990 * If inserting, check if it is time to split a bucket.
992 * NOTE: failure to expand table is not a fatal error, it just means we
993 * have to run at higher fill factor than we wanted. However, if we're
994 * using the palloc allocator then it will throw error anyway on
995 * out-of-memory, so we must do this before modifying the table.
997 if (action
== HASH_ENTER
|| action
== HASH_ENTER_NULL
)
1000 * Can't split if running in partitioned mode, nor if frozen, nor if
1001 * table is the subject of any active hash_seq_search scans.
1003 if (hctl
->freeList
[0].nentries
> (long) hctl
->max_bucket
&&
1004 !IS_PARTITIONED(hctl
) && !hashp
->frozen
&&
1005 !has_seq_scans(hashp
))
1006 (void) expand_table(hashp
);
1010 * Do the initial lookup
1012 bucket
= calc_bucket(hctl
, hashvalue
);
1014 segment_num
= bucket
>> hashp
->sshift
;
1015 segment_ndx
= MOD(bucket
, hashp
->ssize
);
1017 segp
= hashp
->dir
[segment_num
];
1020 hash_corrupted(hashp
);
1022 prevBucketPtr
= &segp
[segment_ndx
];
1023 currBucket
= *prevBucketPtr
;
1026 * Follow collision chain looking for matching key
1028 match
= hashp
->match
; /* save one fetch in inner loop */
1029 keysize
= hashp
->keysize
; /* ditto */
1031 while (currBucket
!= NULL
)
1033 if (currBucket
->hashvalue
== hashvalue
&&
1034 match(ELEMENTKEY(currBucket
), keyPtr
, keysize
) == 0)
1036 prevBucketPtr
= &(currBucket
->link
);
1037 currBucket
= *prevBucketPtr
;
1038 #ifdef HASH_STATISTICS
1045 *foundPtr
= (bool) (currBucket
!= NULL
);
1053 if (currBucket
!= NULL
)
1054 return (void *) ELEMENTKEY(currBucket
);
1058 if (currBucket
!= NULL
)
1060 /* if partitioned, must lock to touch nentries and freeList */
1061 if (IS_PARTITIONED(hctl
))
1062 SpinLockAcquire(&(hctl
->freeList
[freelist_idx
].mutex
));
1064 /* delete the record from the appropriate nentries counter. */
1065 Assert(hctl
->freeList
[freelist_idx
].nentries
> 0);
1066 hctl
->freeList
[freelist_idx
].nentries
--;
1068 /* remove record from hash bucket's chain. */
1069 *prevBucketPtr
= currBucket
->link
;
1071 /* add the record to the appropriate freelist. */
1072 currBucket
->link
= hctl
->freeList
[freelist_idx
].freeList
;
1073 hctl
->freeList
[freelist_idx
].freeList
= currBucket
;
1075 if (IS_PARTITIONED(hctl
))
1076 SpinLockRelease(&hctl
->freeList
[freelist_idx
].mutex
);
1079 * better hope the caller is synchronizing access to this
1080 * element, because someone else is going to reuse it the next
1081 * time something is added to the table
1083 return (void *) ELEMENTKEY(currBucket
);
1087 case HASH_ENTER_NULL
:
1088 /* ENTER_NULL does not work with palloc-based allocator */
1089 Assert(hashp
->alloc
!= DynaHashAlloc
);
1093 /* Return existing element if found, else create one */
1094 if (currBucket
!= NULL
)
1095 return (void *) ELEMENTKEY(currBucket
);
1097 /* disallow inserts if frozen */
1099 elog(ERROR
, "cannot insert into frozen hashtable \"%s\"",
1102 currBucket
= get_hash_entry(hashp
, freelist_idx
);
1103 if (currBucket
== NULL
)
1106 if (action
== HASH_ENTER_NULL
)
1108 /* report a generic message */
1109 if (hashp
->isshared
)
1111 (errcode(ERRCODE_OUT_OF_MEMORY
),
1112 errmsg("out of shared memory")));
1115 (errcode(ERRCODE_OUT_OF_MEMORY
),
1116 errmsg("out of memory")));
1119 /* link into hashbucket chain */
1120 *prevBucketPtr
= currBucket
;
1121 currBucket
->link
= NULL
;
1123 /* copy key into record */
1124 currBucket
->hashvalue
= hashvalue
;
1125 hashp
->keycopy(ELEMENTKEY(currBucket
), keyPtr
, keysize
);
1128 * Caller is expected to fill the data field on return. DO NOT
1129 * insert any code that could possibly throw error here, as doing
1130 * so would leave the table entry incomplete and hence corrupt the
1131 * caller's data structure.
1134 return (void *) ELEMENTKEY(currBucket
);
1137 elog(ERROR
, "unrecognized hash action code: %d", (int) action
);
1139 return NULL
; /* keep compiler quiet */
1143 * hash_update_hash_key -- change the hash key of an existing table entry
1145 * This is equivalent to removing the entry, making a new entry, and copying
1146 * over its data, except that the entry never goes to the table's freelist.
1147 * Therefore this cannot suffer an out-of-memory failure, even if there are
1148 * other processes operating in other partitions of the hashtable.
1150 * Returns true if successful, false if the requested new hash key is already
1151 * present. Throws error if the specified entry pointer isn't actually a
1154 * NB: currently, there is no special case for old and new hash keys being
1155 * identical, which means we'll report false for that situation. This is
1156 * preferable for existing uses.
1158 * NB: for a partitioned hashtable, caller must hold lock on both relevant
1159 * partitions, if the new hash key would belong to a different partition.
1162 hash_update_hash_key(HTAB
*hashp
,
1163 void *existingEntry
,
1164 const void *newKeyPtr
)
1166 HASHELEMENT
*existingElement
= ELEMENT_FROM_KEY(existingEntry
);
1167 HASHHDR
*hctl
= hashp
->hctl
;
1168 uint32 newhashvalue
;
1175 HASHBUCKET currBucket
;
1176 HASHBUCKET
*prevBucketPtr
;
1177 HASHBUCKET
*oldPrevPtr
;
1178 HashCompareFunc match
;
1180 #ifdef HASH_STATISTICS
1185 /* disallow updates if frozen */
1187 elog(ERROR
, "cannot update in frozen hashtable \"%s\"",
1191 * Lookup the existing element using its saved hash value. We need to do
1192 * this to be able to unlink it from its hash chain, but as a side benefit
1193 * we can verify the validity of the passed existingEntry pointer.
1195 bucket
= calc_bucket(hctl
, existingElement
->hashvalue
);
1197 segment_num
= bucket
>> hashp
->sshift
;
1198 segment_ndx
= MOD(bucket
, hashp
->ssize
);
1200 segp
= hashp
->dir
[segment_num
];
1203 hash_corrupted(hashp
);
1205 prevBucketPtr
= &segp
[segment_ndx
];
1206 currBucket
= *prevBucketPtr
;
1208 while (currBucket
!= NULL
)
1210 if (currBucket
== existingElement
)
1212 prevBucketPtr
= &(currBucket
->link
);
1213 currBucket
= *prevBucketPtr
;
1216 if (currBucket
== NULL
)
1217 elog(ERROR
, "hash_update_hash_key argument is not in hashtable \"%s\"",
1220 oldPrevPtr
= prevBucketPtr
;
1223 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1224 * chain we want to put the entry into.
1226 newhashvalue
= hashp
->hash(newKeyPtr
, hashp
->keysize
);
1228 newbucket
= calc_bucket(hctl
, newhashvalue
);
1230 segment_num
= newbucket
>> hashp
->sshift
;
1231 segment_ndx
= MOD(newbucket
, hashp
->ssize
);
1233 segp
= hashp
->dir
[segment_num
];
1236 hash_corrupted(hashp
);
1238 prevBucketPtr
= &segp
[segment_ndx
];
1239 currBucket
= *prevBucketPtr
;
1242 * Follow collision chain looking for matching key
1244 match
= hashp
->match
; /* save one fetch in inner loop */
1245 keysize
= hashp
->keysize
; /* ditto */
1247 while (currBucket
!= NULL
)
1249 if (currBucket
->hashvalue
== newhashvalue
&&
1250 match(ELEMENTKEY(currBucket
), newKeyPtr
, keysize
) == 0)
1252 prevBucketPtr
= &(currBucket
->link
);
1253 currBucket
= *prevBucketPtr
;
1254 #ifdef HASH_STATISTICS
1260 if (currBucket
!= NULL
)
1261 return false; /* collision with an existing entry */
1263 currBucket
= existingElement
;
1266 * If old and new hash values belong to the same bucket, we need not
1267 * change any chain links, and indeed should not since this simplistic
1268 * update will corrupt the list if currBucket is the last element. (We
1269 * cannot fall out earlier, however, since we need to scan the bucket to
1270 * check for duplicate keys.)
1272 if (bucket
!= newbucket
)
1274 /* OK to remove record from old hash bucket's chain. */
1275 *oldPrevPtr
= currBucket
->link
;
1277 /* link into new hashbucket chain */
1278 *prevBucketPtr
= currBucket
;
1279 currBucket
->link
= NULL
;
1282 /* copy new key into record */
1283 currBucket
->hashvalue
= newhashvalue
;
1284 hashp
->keycopy(ELEMENTKEY(currBucket
), newKeyPtr
, keysize
);
1286 /* rest of record is untouched */
1292 * Allocate a new hashtable entry if possible; return NULL if out of memory.
1293 * (Or, if the underlying space allocator throws error for out-of-memory,
1294 * we won't return at all.)
1297 get_hash_entry(HTAB
*hashp
, int freelist_idx
)
1299 HASHHDR
*hctl
= hashp
->hctl
;
1300 HASHBUCKET newElement
;
1304 /* if partitioned, must lock to touch nentries and freeList */
1305 if (IS_PARTITIONED(hctl
))
1306 SpinLockAcquire(&hctl
->freeList
[freelist_idx
].mutex
);
1308 /* try to get an entry from the freelist */
1309 newElement
= hctl
->freeList
[freelist_idx
].freeList
;
1311 if (newElement
!= NULL
)
1314 if (IS_PARTITIONED(hctl
))
1315 SpinLockRelease(&hctl
->freeList
[freelist_idx
].mutex
);
1318 * No free elements in this freelist. In a partitioned table, there
1319 * might be entries in other freelists, but to reduce contention we
1320 * prefer to first try to get another chunk of buckets from the main
1321 * shmem allocator. If that fails, though, we *MUST* root through all
1322 * the other freelists before giving up. There are multiple callers
1323 * that assume that they can allocate every element in the initially
1324 * requested table size, or that deleting an element guarantees they
1325 * can insert a new element, even if shared memory is entirely full.
1326 * Failing because the needed element is in a different freelist is
1329 if (!element_alloc(hashp
, hctl
->nelem_alloc
, freelist_idx
))
1331 int borrow_from_idx
;
1333 if (!IS_PARTITIONED(hctl
))
1334 return NULL
; /* out of memory */
1336 /* try to borrow element from another freelist */
1337 borrow_from_idx
= freelist_idx
;
1340 borrow_from_idx
= (borrow_from_idx
+ 1) % NUM_FREELISTS
;
1341 if (borrow_from_idx
== freelist_idx
)
1342 break; /* examined all freelists, fail */
1344 SpinLockAcquire(&(hctl
->freeList
[borrow_from_idx
].mutex
));
1345 newElement
= hctl
->freeList
[borrow_from_idx
].freeList
;
1347 if (newElement
!= NULL
)
1349 hctl
->freeList
[borrow_from_idx
].freeList
= newElement
->link
;
1350 SpinLockRelease(&(hctl
->freeList
[borrow_from_idx
].mutex
));
1352 /* careful: count the new element in its proper freelist */
1353 SpinLockAcquire(&hctl
->freeList
[freelist_idx
].mutex
);
1354 hctl
->freeList
[freelist_idx
].nentries
++;
1355 SpinLockRelease(&hctl
->freeList
[freelist_idx
].mutex
);
1360 SpinLockRelease(&(hctl
->freeList
[borrow_from_idx
].mutex
));
1363 /* no elements available to borrow either, so out of memory */
1368 /* remove entry from freelist, bump nentries */
1369 hctl
->freeList
[freelist_idx
].freeList
= newElement
->link
;
1370 hctl
->freeList
[freelist_idx
].nentries
++;
1372 if (IS_PARTITIONED(hctl
))
1373 SpinLockRelease(&hctl
->freeList
[freelist_idx
].mutex
);
1379 * hash_get_num_entries -- get the number of entries in a hashtable
1382 hash_get_num_entries(HTAB
*hashp
)
1385 long sum
= hashp
->hctl
->freeList
[0].nentries
;
1388 * We currently don't bother with acquiring the mutexes; it's only
1389 * sensible to call this function if you've got lock on all partitions of
1392 if (IS_PARTITIONED(hashp
->hctl
))
1394 for (i
= 1; i
< NUM_FREELISTS
; i
++)
1395 sum
+= hashp
->hctl
->freeList
[i
].nentries
;
1402 * hash_seq_init/_search/_term
1403 * Sequentially search through hash table and return
1404 * all the elements one by one, return NULL when no more.
1406 * hash_seq_term should be called if and only if the scan is abandoned before
1407 * completion; if hash_seq_search returns NULL then it has already done the
1408 * end-of-scan cleanup.
1410 * NOTE: caller may delete the returned element before continuing the scan.
1411 * However, deleting any other element while the scan is in progress is
1412 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1413 * if elements are added to the table while the scan is in progress, it is
1414 * unspecified whether they will be visited by the scan or not.
1416 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1417 * worry about hash_seq_term cleanup, if the hashtable is first locked against
1418 * further insertions by calling hash_freeze.
1420 * NOTE: to use this with a partitioned hashtable, caller had better hold
1421 * at least shared lock on all partitions of the table throughout the scan!
1422 * We can cope with insertions or deletions by our own backend, but *not*
1423 * with concurrent insertions or deletions by another.
1426 hash_seq_init(HASH_SEQ_STATUS
*status
, HTAB
*hashp
)
1428 status
->hashp
= hashp
;
1429 status
->curBucket
= 0;
1430 status
->curEntry
= NULL
;
1432 register_seq_scan(hashp
);
1436 hash_seq_search(HASH_SEQ_STATUS
*status
)
1446 HASHELEMENT
*curElem
;
1448 if ((curElem
= status
->curEntry
) != NULL
)
1450 /* Continuing scan of curBucket... */
1451 status
->curEntry
= curElem
->link
;
1452 if (status
->curEntry
== NULL
) /* end of this bucket */
1453 ++status
->curBucket
;
1454 return (void *) ELEMENTKEY(curElem
);
1458 * Search for next nonempty bucket starting at curBucket.
1460 curBucket
= status
->curBucket
;
1461 hashp
= status
->hashp
;
1463 ssize
= hashp
->ssize
;
1464 max_bucket
= hctl
->max_bucket
;
1466 if (curBucket
> max_bucket
)
1468 hash_seq_term(status
);
1469 return NULL
; /* search is done */
1473 * first find the right segment in the table directory.
1475 segment_num
= curBucket
>> hashp
->sshift
;
1476 segment_ndx
= MOD(curBucket
, ssize
);
1478 segp
= hashp
->dir
[segment_num
];
1481 * Pick up the first item in this bucket's chain. If chain is not empty
1482 * we can begin searching it. Otherwise we have to advance to find the
1483 * next nonempty bucket. We try to optimize that case since searching a
1484 * near-empty hashtable has to iterate this loop a lot.
1486 while ((curElem
= segp
[segment_ndx
]) == NULL
)
1488 /* empty bucket, advance to next */
1489 if (++curBucket
> max_bucket
)
1491 status
->curBucket
= curBucket
;
1492 hash_seq_term(status
);
1493 return NULL
; /* search is done */
1495 if (++segment_ndx
>= ssize
)
1499 segp
= hashp
->dir
[segment_num
];
1503 /* Begin scan of curBucket... */
1504 status
->curEntry
= curElem
->link
;
1505 if (status
->curEntry
== NULL
) /* end of this bucket */
1507 status
->curBucket
= curBucket
;
1508 return (void *) ELEMENTKEY(curElem
);
1512 hash_seq_term(HASH_SEQ_STATUS
*status
)
1514 if (!status
->hashp
->frozen
)
1515 deregister_seq_scan(status
->hashp
);
1520 * Freeze a hashtable against future insertions (deletions are
1523 * The reason for doing this is that by preventing any more bucket splits,
1524 * we no longer need to worry about registering hash_seq_search scans,
1525 * and thus caller need not be careful about ensuring hash_seq_term gets
1526 * called at the right times.
1528 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1529 * with active scans (since hash_seq_term would then do the wrong thing).
1532 hash_freeze(HTAB
*hashp
)
1534 if (hashp
->isshared
)
1535 elog(ERROR
, "cannot freeze shared hashtable \"%s\"", hashp
->tabname
);
1536 if (!hashp
->frozen
&& has_seq_scans(hashp
))
1537 elog(ERROR
, "cannot freeze hashtable \"%s\" because it has active scans",
1539 hashp
->frozen
= true;
1543 /********************************* UTILITIES ************************/
1546 * Expand the table by adding one more hash bucket.
1549 expand_table(HTAB
*hashp
)
1551 HASHHDR
*hctl
= hashp
->hctl
;
1552 HASHSEGMENT old_seg
,
1560 HASHBUCKET
*oldlink
,
1562 HASHBUCKET currElement
,
1565 Assert(!IS_PARTITIONED(hctl
));
1567 #ifdef HASH_STATISTICS
1571 new_bucket
= hctl
->max_bucket
+ 1;
1572 new_segnum
= new_bucket
>> hashp
->sshift
;
1573 new_segndx
= MOD(new_bucket
, hashp
->ssize
);
1575 if (new_segnum
>= hctl
->nsegs
)
1577 /* Allocate new segment if necessary -- could fail if dir full */
1578 if (new_segnum
>= hctl
->dsize
)
1579 if (!dir_realloc(hashp
))
1581 if (!(hashp
->dir
[new_segnum
] = seg_alloc(hashp
)))
1586 /* OK, we created a new bucket */
1590 * *Before* changing masks, find old bucket corresponding to same hash
1591 * values; values in that bucket may need to be relocated to new bucket.
1592 * Note that new_bucket is certainly larger than low_mask at this point,
1593 * so we can skip the first step of the regular hash mask calc.
1595 old_bucket
= (new_bucket
& hctl
->low_mask
);
1598 * If we crossed a power of 2, readjust masks.
1600 if ((uint32
) new_bucket
> hctl
->high_mask
)
1602 hctl
->low_mask
= hctl
->high_mask
;
1603 hctl
->high_mask
= (uint32
) new_bucket
| hctl
->low_mask
;
1607 * Relocate records to the new bucket. NOTE: because of the way the hash
1608 * masking is done in calc_bucket, only one old bucket can need to be
1609 * split at this point. With a different way of reducing the hash value,
1610 * that might not be true!
1612 old_segnum
= old_bucket
>> hashp
->sshift
;
1613 old_segndx
= MOD(old_bucket
, hashp
->ssize
);
1615 old_seg
= hashp
->dir
[old_segnum
];
1616 new_seg
= hashp
->dir
[new_segnum
];
1618 oldlink
= &old_seg
[old_segndx
];
1619 newlink
= &new_seg
[new_segndx
];
1621 for (currElement
= *oldlink
;
1622 currElement
!= NULL
;
1623 currElement
= nextElement
)
1625 nextElement
= currElement
->link
;
1626 if ((long) calc_bucket(hctl
, currElement
->hashvalue
) == old_bucket
)
1628 *oldlink
= currElement
;
1629 oldlink
= &currElement
->link
;
1633 *newlink
= currElement
;
1634 newlink
= &currElement
->link
;
1637 /* don't forget to terminate the rebuilt hash chains... */
1646 dir_realloc(HTAB
*hashp
)
1654 if (hashp
->hctl
->max_dsize
!= NO_MAX_DSIZE
)
1657 /* Reallocate directory */
1658 new_dsize
= hashp
->hctl
->dsize
<< 1;
1659 old_dirsize
= hashp
->hctl
->dsize
* sizeof(HASHSEGMENT
);
1660 new_dirsize
= new_dsize
* sizeof(HASHSEGMENT
);
1663 CurrentDynaHashCxt
= hashp
->hcxt
;
1664 p
= (HASHSEGMENT
*) hashp
->alloc((Size
) new_dirsize
);
1668 memcpy(p
, old_p
, old_dirsize
);
1669 MemSet(((char *) p
) + old_dirsize
, 0, new_dirsize
- old_dirsize
);
1671 hashp
->hctl
->dsize
= new_dsize
;
1673 /* XXX assume the allocator is palloc, so we know how to free */
1674 Assert(hashp
->alloc
== DynaHashAlloc
);
1685 seg_alloc(HTAB
*hashp
)
1689 CurrentDynaHashCxt
= hashp
->hcxt
;
1690 segp
= (HASHSEGMENT
) hashp
->alloc(sizeof(HASHBUCKET
) * hashp
->ssize
);
1695 MemSet(segp
, 0, sizeof(HASHBUCKET
) * hashp
->ssize
);
1701 * allocate some new elements and link them into the indicated free list
1704 element_alloc(HTAB
*hashp
, int nelem
, int freelist_idx
)
1706 HASHHDR
*hctl
= hashp
->hctl
;
1708 HASHELEMENT
*firstElement
;
1709 HASHELEMENT
*tmpElement
;
1710 HASHELEMENT
*prevElement
;
1716 /* Each element has a HASHELEMENT header plus user data. */
1717 elementSize
= MAXALIGN(sizeof(HASHELEMENT
)) + MAXALIGN(hctl
->entrysize
);
1719 CurrentDynaHashCxt
= hashp
->hcxt
;
1720 firstElement
= (HASHELEMENT
*) hashp
->alloc(nelem
* elementSize
);
1725 /* prepare to link all the new entries into the freelist */
1727 tmpElement
= firstElement
;
1728 for (i
= 0; i
< nelem
; i
++)
1730 tmpElement
->link
= prevElement
;
1731 prevElement
= tmpElement
;
1732 tmpElement
= (HASHELEMENT
*) (((char *) tmpElement
) + elementSize
);
1735 /* if partitioned, must lock to touch freeList */
1736 if (IS_PARTITIONED(hctl
))
1737 SpinLockAcquire(&hctl
->freeList
[freelist_idx
].mutex
);
1739 /* freelist could be nonempty if two backends did this concurrently */
1740 firstElement
->link
= hctl
->freeList
[freelist_idx
].freeList
;
1741 hctl
->freeList
[freelist_idx
].freeList
= prevElement
;
1743 if (IS_PARTITIONED(hctl
))
1744 SpinLockRelease(&hctl
->freeList
[freelist_idx
].mutex
);
1749 /* complain when we have detected a corrupted hashtable */
1751 hash_corrupted(HTAB
*hashp
)
1754 * If the corruption is in a shared hashtable, we'd better force a
1755 * systemwide restart. Otherwise, just shut down this one backend.
1757 if (hashp
->isshared
)
1758 elog(PANIC
, "hash table \"%s\" corrupted", hashp
->tabname
);
1760 elog(FATAL
, "hash table \"%s\" corrupted", hashp
->tabname
);
/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
	/*
	 * guard against too-large input, which would be invalid for
	 * pg_ceil_log2_*()
	 */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

#if SIZEOF_LONG < 8
	return pg_ceil_log2_32(num);
#else
	return pg_ceil_log2_64(num);
#endif
}
/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
	/* my_log2's internal range check is sufficient */
	return 1L << my_log2(num);
}
/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
	/* clamp so the shift below cannot exceed int range */
	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return 1 << my_log2(num);
}
1799 /************************* SEQ SCAN TRACKING ************************/
1802 * We track active hash_seq_search scans here. The need for this mechanism
1803 * comes from the fact that a scan will get confused if a bucket split occurs
1804 * while it's in progress: it might visit entries twice, or even miss some
1805 * entirely (if it's partway through the same bucket that splits). Hence
1806 * we want to inhibit bucket splits if there are any active scans on the
1807 * table being inserted into. This is a fairly rare case in current usage,
1808 * so just postponing the split until the next insertion seems sufficient.
1810 * Given present usages of the function, only a few scans are likely to be
1811 * open concurrently; so a finite-size stack of open scans seems sufficient,
1812 * and we don't worry that linear search is too slow. Note that we do
1813 * allow multiple scans of the same hashtable to be open concurrently.
1815 * This mechanism can support concurrent scan and insertion in a shared
1816 * hashtable if it's the same backend doing both. It would fail otherwise,
1817 * but locking reasons seem to preclude any such scenario anyway, so we don't
1820 * This arrangement is reasonably robust if a transient hashtable is deleted
1821 * without notifying us. The absolute worst case is we might inhibit splits
1822 * in another table created later at exactly the same address. We will give
1823 * a warning at transaction end for reference leaks, so any bugs leading to
1824 * lack of notification should be easy to catch.
1827 #define MAX_SEQ_SCANS 100
1829 static HTAB
*seq_scan_tables
[MAX_SEQ_SCANS
]; /* tables being scanned */
1830 static int seq_scan_level
[MAX_SEQ_SCANS
]; /* subtransaction nest level */
1831 static int num_seq_scans
= 0;
1834 /* Register a table as having an active hash_seq_search scan */
1836 register_seq_scan(HTAB
*hashp
)
1838 if (num_seq_scans
>= MAX_SEQ_SCANS
)
1839 elog(ERROR
, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1841 seq_scan_tables
[num_seq_scans
] = hashp
;
1842 seq_scan_level
[num_seq_scans
] = GetCurrentTransactionNestLevel();
1846 /* Deregister an active scan */
1848 deregister_seq_scan(HTAB
*hashp
)
1852 /* Search backward since it's most likely at the stack top */
1853 for (i
= num_seq_scans
- 1; i
>= 0; i
--)
1855 if (seq_scan_tables
[i
] == hashp
)
1857 seq_scan_tables
[i
] = seq_scan_tables
[num_seq_scans
- 1];
1858 seq_scan_level
[i
] = seq_scan_level
[num_seq_scans
- 1];
1863 elog(ERROR
, "no hash_seq_search scan for hash table \"%s\"",
1867 /* Check if a table has any active scan */
1869 has_seq_scans(HTAB
*hashp
)
1873 for (i
= 0; i
< num_seq_scans
; i
++)
1875 if (seq_scan_tables
[i
] == hashp
)
1881 /* Clean up any open scans at end of transaction */
1883 AtEOXact_HashTables(bool isCommit
)
1886 * During abort cleanup, open scans are expected; just silently clean 'em
1887 * out. An open scan at commit means someone forgot a hash_seq_term()
1888 * call, so complain.
1890 * Note: it's tempting to try to print the tabname here, but refrain for
1891 * fear of touching deallocated memory. This isn't a user-facing message
1892 * anyway, so it needn't be pretty.
1898 for (i
= 0; i
< num_seq_scans
; i
++)
1900 elog(WARNING
, "leaked hash_seq_search scan for hash table %p",
1901 seq_scan_tables
[i
]);
1907 /* Clean up any open scans at end of subtransaction */
1909 AtEOSubXact_HashTables(bool isCommit
, int nestDepth
)
1914 * Search backward to make cleanup easy. Note we must check all entries,
1915 * not only those at the end of the array, because deletion technique
1916 * doesn't keep them in order.
1918 for (i
= num_seq_scans
- 1; i
>= 0; i
--)
1920 if (seq_scan_level
[i
] >= nestDepth
)
1923 elog(WARNING
, "leaked hash_seq_search scan for hash table %p",
1924 seq_scan_tables
[i
]);
1925 seq_scan_tables
[i
] = seq_scan_tables
[num_seq_scans
- 1];
1926 seq_scan_level
[i
] = seq_scan_level
[num_seq_scans
- 1];