/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic chained hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure.  For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure.  The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp().  This is selected by specifying the
 * HASH_STRINGS flag to hash_create.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying the HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions.  At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Compared to simplehash, dynahash has the following benefits:
 *
 * - It supports partitioning, which is useful for shared memory access using
 *   locks.
 * - Shared memory hashes are allocated in a fixed size area at startup and
 *   are discoverable by name from other processes.
 * - Because entries don't need to be moved in the case of hash conflicts,
 *   dynahash has better performance for large entries.
 * - Guarantees stable pointers to entries.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */
#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"

/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	long		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one.  Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long		dsize;			/* directory size */
	long		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	long		num_partitions; /* # partitions (must be power of 2), or 0 */
	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long		accesses;
	long		collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */
	bool		isfixed;		/* if true, don't enlarge */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)			   ((x) & ((y)-1))

#ifdef HASH_STATISTICS
static long hash_accesses,
			hash_collisions,
			hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int	choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int	next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
	return MemoryContextAlloc(CurrentDynaHashCxt, size);
}

/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
	return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
 * optional, but the default keysize and entrysize values were useless.)
 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
 * binary blobs, or custom, respectively).  Callers specifying a custom
 * hash function will likely also want to use HASH_COMPARE, and perhaps
 * also HASH_KEYCOPY, to control key comparison and copying.
 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
 * under info->hcxt rather than under TopMemoryContext; the default
 * behavior is only suitable for session-lifespan hash tables.
 * Other flags bits are special-purpose and seldom used, except for those
 * associated with shared-memory hash tables, for which see ShmemInitHash().
 *
 * Fields in *info are read only when the associated flags bit is set.
 * It is not necessary to initialize other fields of *info.
 * Neither tabname nor *info need persist after the hash_create() call.
 *
 * Note: It is deprecated for callers of hash_create() to explicitly specify
 * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
 * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
 * one of these.
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */
	Assert(flags & HASH_ELEM);
	Assert(info->keysize > 0);
	Assert(info->entrysize >= info->keysize);

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
	}

	/* Initialize the hash header, plus a copy of the table name */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
	{
		Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
		hashp->hash = info->hash;
	}
	else if (flags & HASH_BLOBS)
	{
		Assert(!(flags & HASH_STRINGS));
		/* We can optimize hashing for common key sizes */
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
	{
		/*
		 * string_hash used to be considered the default hash method, and in a
		 * non-assert build it effectively still is.  But we now consider it
		 * an assertion error to not say HASH_STRINGS explicitly.  To help
		 * catch mistaken usage of HASH_STRINGS, we also insist on a
		 * reasonably long string length: if the keysize is only 4 or 8 bytes,
		 * it's almost certainly an integer or pointer not a string.
		 */
		Assert(flags & HASH_STRINGS);
		Assert(info->keysize > 8);

		hashp->hash = string_hash;
	}

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due to
	 * referencing a trampoline instead of the string_hash function proper.
	 * Specify HASH_STRINGS instead.
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
	{
		/*
		 * The signature of keycopy is meant for memcpy(), which returns
		 * void*, but strlcpy() returns size_t.  Since we never use the return
		 * value of keycopy, and size_t is pretty much always the same size as
		 * void *, this should be safe.  The extra cast in the middle is to
		 * avoid warnings from -Wcast-function-type.
		 */
		hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
	}
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
		 * well.
		 */
		hashp->hctl = info->hctl;
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2. Also, it must
		 * be less than INT_MAX (see init_htab()), so call the int version of
		 * next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/* remember the entry sizes, too */
	hctl->keysize = info->keysize;
	hctl->entrysize = info->entrysize;

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	if (flags & HASH_FIXED_SIZE)
		hashp->isfixed = true;
	return hashp;
}

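/*
 * Illustrative sketch only (not part of dynahash itself): how a backend-local
 * table keyed by a uint32 blob is typically created and probed.  The entry
 * struct, table name, and sizes below are hypothetical, and the whole block
 * is compiled out.
 */
#ifdef NOT_USED
typedef struct ExampleEntry
{
	uint32		key;			/* key must be the first keysize bytes */
	int			count;
} ExampleEntry;

static void
example_hash_usage(void)
{
	HASHCTL		ctl;
	HTAB	   *htab;
	ExampleEntry *entry;
	bool		found;
	uint32		key = 42;

	/* only the fields named by the flags need to be set in ctl */
	ctl.keysize = sizeof(uint32);
	ctl.entrysize = sizeof(ExampleEntry);
	htab = hash_create("example table", 128, &ctl, HASH_ELEM | HASH_BLOBS);

	/* HASH_ENTER creates the entry if absent; the key is copied for us */
	entry = (ExampleEntry *) hash_search(htab, &key, HASH_ENTER, &found);
	if (!found)
		entry->count = 0;
	entry->count++;

	hash_destroy(htab);
}
#endif							/* NOT_USED */
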
/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;

	MemSet(hctl, 0, sizeof(HASHHDR));

	hctl->dsize = DEF_DIRSIZE;
	hctl->nsegs = 0;

	hctl->num_partitions = 0;	/* not partitioned */

	/* table has no fixed maximum size */
	hctl->max_dsize = NO_MAX_DSIZE;

	hctl->ssize = DEF_SEGSIZE;
	hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
	hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
	int			nelem_alloc;
	Size		elementSize;
	Size		allocSize;

	/* Each element has a HASHELEMENT header plus user data. */
	/* NB: this had better match element_alloc() */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

	/*
	 * The idea here is to choose nelem_alloc at least 32, but round up so
	 * that the allocation request will be a power of 2 or just less. This
	 * makes little difference for hash tables in shared memory, but for hash
	 * tables managed by palloc, the allocation request will be rounded up to
	 * a power of 2 anyway.  If we fail to take this into account, we'll waste
	 * as much as half the allocated space.
	 */
	allocSize = 32 * 4;			/* assume elementSize at least 8 */
	do
	{
		allocSize <<= 1;
		nelem_alloc = allocSize / elementSize;
	} while (nelem_alloc < 32);

	return nelem_alloc;
}

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough. If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
			"TABLE POINTER   ", hashp,
			"DIRECTORY SIZE  ", hctl->dsize,
			"SEGMENT SIZE    ", hctl->ssize,
			"SEGMENT SHIFT   ", hctl->sshift,
			"MAX BUCKET      ", hctl->max_bucket,
			"HIGH MASK       ", hctl->high_mask,
			"LOW  MASK       ", hctl->low_mask,
			"NSEGS           ", hctl->nsegs);
#endif
	return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
	Size		size;
	long		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
	long		nBuckets,
				nSegments,
				nDirEntries;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters.  We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}

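/*
 * Illustrative sketch only (not part of this file): the usual pattern for a
 * partitioned shared-memory table is to reserve space with
 * hash_estimate_size() while sizing shared memory and then create or attach
 * with ShmemInitHash(), which supplies the HASH_SHARED_MEM, HASH_ALLOC and
 * HASH_DIRSIZE bits itself.  The table name, entry layout, sizes and
 * partition count below are hypothetical, and the block is compiled out.
 */
#ifdef NOT_USED
static HTAB *
example_shared_table_init(void)
{
	HASHCTL		info;
	long		init_size = 1024;
	long		max_size = 4096;

	/* shared-memory sizing would use hash_estimate_size(max_size, info.entrysize) */

	info.keysize = sizeof(uint32);
	info.entrysize = sizeof(uint32) * 4;	/* hypothetical entry layout */
	info.num_partitions = 16;	/* must be a power of 2 */

	return ShmemInitHash("example shared table",
						 init_size, max_size,
						 &info,
						 HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
}
#endif							/* NOT_USED */
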
/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
	if (hashp != NULL)
	{
		/* allocation method must be one we know how to free, too */
		Assert(hashp->alloc == DynaHashAlloc);
		/* so this hashtable must have its own context */
		Assert(hashp->hcxt != NULL);

		hash_stats("destroy", hashp);

		/*
		 * Free everything by destroying the hash table's memory context.
		 */
		MemoryContextDelete(hashp->hcxt);
	}
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
			where, hashp->hctl->accesses, hashp->hctl->collisions);

	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
			hashp->hctl->max_bucket, hashp->hctl->nsegs);
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
			where, hash_accesses, hash_collisions);
	fprintf(stderr, "hash_stats: total expansions %ld\n",
			hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
	return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
	uint32		bucket;

	bucket = hash_val & hctl->high_mask;
	if (bucket > hctl->max_bucket)
		bucket = bucket & hctl->low_mask;

	return bucket;
}

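/*
 * Illustrative sketch only: a caller of a partitioned table typically derives
 * its partition number (and hence which lock to take) from the low-order bits
 * of the hash value, e.g. with 16 partitions:
 *
 *		hashcode = get_hash_value(htab, &key);
 *		partition = hashcode % 16;		(equivalently, hashcode & 15)
 *
 * This agrees with calc_bucket() because both masks it applies are at least
 * num_partitions - 1, so every key in a given bucket shares the same
 * low-order bits and therefore the same partition.
 */
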
/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory.  Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp,
			const void *keyPtr,
			HASHACTION action,
			bool *foundPtr)
{
	return hash_search_with_hash_value(hashp,
									   keyPtr,
									   hashp->hash(keyPtr, hashp->keysize),
									   action,
									   foundPtr);
}

void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	uint32		bucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */
		if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	bucket = calc_bucket(hctl, hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER_NULL:
			/* ENTER_NULL does not work with palloc-based allocator */
			Assert(hashp->alloc != DynaHashAlloc);
			/* FALL THRU */

		case HASH_ENTER:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present.  Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation.  This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	HASHHDR    *hctl = hashp->hctl;
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = calc_bucket(hctl, existingElement->hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

	newbucket = calc_bucket(hctl, newhashvalue);

	segment_num = newbucket >> hashp->sshift;
	segment_ndx = MOD(newbucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		if (newElement != NULL)
			break;

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist.  In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator.  If that fails, though, we *MUST* root through all
		 * the other freelists before giving up.  There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
	int			i;
	long		sum = hashp->hctl->freeList[0].nentries;

	/*
	 * We currently don't bother with acquiring the mutexes; it's only
	 * sensible to call this function if you've got lock on all partitions of
	 * the table.
	 */
	if (IS_PARTITIONED(hashp->hctl))
	{
		for (i = 1; i < NUM_FREELISTS; i++)
			sum += hashp->hctl->freeList[i].nentries;
	}

	return sum;
}

/*
 * hash_seq_init/_search/_term
 *			Sequentially search through hash table and return
 *			all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
	status->hashp = hashp;
	status->curBucket = 0;
	status->curEntry = NULL;
	if (!hashp->frozen)
		register_seq_scan(hashp);
}

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
	if (!status->hashp->frozen)
		deregister_seq_scan(status->hashp);
}

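/*
 * Illustrative sketch only: the canonical scan loop.  Running the scan until
 * hash_seq_search returns NULL does the end-of-scan cleanup itself, so
 * hash_seq_term is only needed when bailing out early.  "ExampleEntry" and
 * the early-exit condition are hypothetical; the block is compiled out.
 */
#ifdef NOT_USED
static void
example_seq_scan(HTAB *htab)
{
	HASH_SEQ_STATUS status;
	ExampleEntry *entry;

	hash_seq_init(&status, htab);
	while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
	{
		if (entry->count < 0)
		{
			/* abandoning the scan early requires explicit termination */
			hash_seq_term(&status);
			break;
		}
	}
}
#endif							/* NOT_USED */
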
/*
 * hash_freeze
 *			Freeze a hashtable against future insertions (deletions are
 *			still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
	if (hashp->isshared)
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
	if (!hashp->frozen && has_seq_scans(hashp))
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
			 hashp->tabname);
	hashp->frozen = true;
}

/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	long		old_bucket,
				new_bucket;
	long		new_segnum,
				new_segndx;
	long		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hash_expansions++;
#endif

	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> hashp->sshift;
	new_segndx = MOD(new_bucket, hashp->ssize);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;
	old_segndx = MOD(old_bucket, hashp->ssize);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}

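/*
 * Worked example (illustrative): with max_bucket = 3, low_mask = 3 and
 * high_mask = 7, adding bucket 4 gives old_bucket = 4 & 3 = 0; entries in
 * bucket 0 whose hashvalue & 7 equals 4 are relocated to the new bucket and
 * the rest stay put.  The masks only advance when max_bucket crosses the next
 * power of 2 (here, when bucket 8 is added: low_mask becomes 7, high_mask 15).
 */
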
static bool
dir_realloc(HTAB *hashp)
{
	HASHSEGMENT *p;
	HASHSEGMENT *old_p;
	long		new_dsize;
	long		old_dirsize;
	long		new_dirsize;

	if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
		return false;

	/* Reallocate directory */
	new_dsize = hashp->hctl->dsize << 1;
	old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
	new_dirsize = new_dsize * sizeof(HASHSEGMENT);

	old_p = hashp->dir;
	CurrentDynaHashCxt = hashp->hcxt;
	p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

	if (p != NULL)
	{
		memcpy(p, old_p, old_dirsize);
		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
		hashp->dir = p;
		hashp->hctl->dsize = new_dsize;

		/* XXX assume the allocator is palloc, so we know how to free */
		Assert(hashp->alloc == DynaHashAlloc);
		pfree(old_p);

		return true;
	}

	return false;
}

static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT segp;

	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
	/*
	 * If the corruption is in a shared hashtable, we'd better force a
	 * systemwide restart.  Otherwise, just shut down this one backend.
	 */
	if (hashp->isshared)
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
	else
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
	/*
	 * guard against too-large input, which would be invalid for
	 * pg_ceil_log2_*()
	 */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

#if SIZEOF_LONG < 8
	return pg_ceil_log2_32(num);
#else
	return pg_ceil_log2_64(num);
#endif
}

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
	/* my_log2's internal range check is sufficient */
	return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return 1 << my_log2(num);
}

/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here.  The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits).  Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into.  This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow.  Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both.  It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us.  The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address.  We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];	/* subtransaction nest level */
static int	num_seq_scans = 0;

/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
	if (num_seq_scans >= MAX_SEQ_SCANS)
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
			 hashp->tabname);
	seq_scan_tables[num_seq_scans] = hashp;
	seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
	num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
	int			i;

	/* Search backward since it's most likely at the stack top */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_tables[i] == hashp)
		{
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
			return;
		}
	}
	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
		 hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
	int			i;

	for (i = 0; i < num_seq_scans; i++)
	{
		if (seq_scan_tables[i] == hashp)
			return true;
	}
	return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
	/*
	 * During abort cleanup, open scans are expected; just silently clean 'em
	 * out.  An open scan at commit means someone forgot a hash_seq_term()
	 * call, so complain.
	 *
	 * Note: it's tempting to try to print the tabname here, but refrain for
	 * fear of touching deallocated memory.  This isn't a user-facing message
	 * anyway, so it needn't be pretty.
	 */
	if (isCommit)
	{
		int			i;

		for (i = 0; i < num_seq_scans; i++)
		{
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
				 seq_scan_tables[i]);
		}
	}
	num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
	int			i;

	/*
	 * Search backward to make cleanup easy.  Note we must check all entries,
	 * not only those at the end of the array, because deletion technique
	 * doesn't keep them in order.
	 */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_level[i] >= nestDepth)
		{
			if (isCommit)
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
					 seq_scan_tables[i]);
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
		}
	}
}