/*-------------------------------------------------------------------------
 *
 * hashsearch.c
 *	  search code for postgres hash tables
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashsearch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
						   ScanDirection dir);
static int	_hash_load_qualified_items(IndexScanDesc scan, Page page,
									   OffsetNumber offnum, ScanDirection dir);
static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
								  OffsetNumber offnum, IndexTuple itup);
static void _hash_readnext(IndexScanDesc scan, Buffer *bufp,
						   Page *pagep, HashPageOpaque *opaquep);
/*
 *	_hash_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may
 *		be pinned but not locked, and so->currPos.itemIndex identifies
 *		which item was previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the
 *		next heap tuple, and so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we return false with pin
 *		held on bucket page but no pins or locks held on overflow
 *		page.
 */
bool
_hash_next(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	HashScanPosItem *currItem;
	BlockNumber blkno;
	Buffer		buf;
	bool		end_of_scan = false;

	/*
	 * Advance to the next tuple on the current page; or if done, try to read
	 * data from the next or previous page based on the scan direction.
	 * Before moving to the next or previous page, make sure that we deal
	 * with all the killed items.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			blkno = so->currPos.nextPage;
			if (BlockNumberIsValid(blkno))
			{
				buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
				TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));
				if (!_hash_readpage(scan, &buf, dir))
					end_of_scan = true;
			}
			else
				end_of_scan = true;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			blkno = so->currPos.prevPage;
			if (BlockNumberIsValid(blkno))
			{
				buf = _hash_getbuf(rel, blkno, HASH_READ,
								   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
				TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));

				/*
				 * We always maintain the pin on the bucket page for the
				 * whole scan operation, so release the additional pin we
				 * have acquired here.
				 */
				if (buf == so->hashso_bucket_buf ||
					buf == so->hashso_split_bucket_buf)
					_hash_dropbuf(rel, buf);

				if (!_hash_readpage(scan, &buf, dir))
					end_of_scan = true;
			}
			else
				end_of_scan = true;
		}
	}

	if (end_of_scan)
	{
		_hash_dropscanbuf(rel, so);
		HashScanPosInvalidate(so->currPos);
		return false;
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;

	return true;
}
/*
 * Advance to the next page in a bucket, if any.  If we are scanning the
 * bucket being populated during a split operation, then this function
 * advances to the bucket being split after the last bucket page of the
 * bucket being populated.
 */
static void
_hash_readnext(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		block_found = false;

	blkno = (*opaquep)->hasho_nextblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of scan.  Refer
	 * to the comments in _hash_first to know the reason for retaining the
	 * pin.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, *bufp);

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();
	if (BlockNumberIsValid(blkno))
	{
		*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
		block_found = true;
	}
	else if (so->hashso_buc_populated && !so->hashso_buc_split)
	{
		/*
		 * end of bucket, scan bucket being split if there was a split in
		 * progress at the start of scan.
		 */
		*bufp = so->hashso_split_bucket_buf;

		/*
		 * buffer for bucket being split must be valid as we acquire the pin
		 * on it before the start of scan and retain it till end of scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot);

		/*
		 * setting hashso_buc_split to true indicates that we are scanning
		 * bucket being split.
		 */
		so->hashso_buc_split = true;

		block_found = true;
	}

	if (block_found)
	{
		*pagep = BufferGetPage(*bufp);
		TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
	}
}
/*
 * Advance to the previous page in a bucket, if any.  If the current scan
 * started during a split operation, then this function advances to the
 * bucket being populated after the first bucket page of the bucket being
 * split.
 */
static void
_hash_readprev(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		haveprevblk;

	blkno = (*opaquep)->hasho_prevblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of scan.  Refer
	 * to the comments in _hash_first to know the reason for retaining the
	 * pin.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
	{
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
		haveprevblk = false;
	}
	else
	{
		_hash_relbuf(rel, *bufp);
		haveprevblk = true;
	}

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();

	if (haveprevblk)
	{
		Assert(BlockNumberIsValid(blkno));
		*bufp = _hash_getbuf(rel, blkno, HASH_READ,
							 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
		*pagep = BufferGetPage(*bufp);
		TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/*
		 * We always maintain the pin on the bucket page for the whole scan
		 * operation, so release the additional pin we have acquired here.
		 */
		if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
			_hash_dropbuf(rel, *bufp);
	}
	else if (so->hashso_buc_populated && so->hashso_buc_split)
	{
		/*
		 * end of bucket, scan bucket being populated if there was a split in
		 * progress at the start of scan.
		 */
		*bufp = so->hashso_bucket_buf;

		/*
		 * buffer for bucket being populated must be valid as we acquire the
		 * pin on it before the start of scan and retain it till end of scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/* move to the end of bucket chain */
		while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
			_hash_readnext(scan, bufp, pagep, opaquep);

		/*
		 * setting hashso_buc_split to false indicates that we are scanning
		 * bucket being populated.
		 */
		so->hashso_buc_split = false;
	}
}
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		We find the first item (or, if backward scan, the last item) in the
 *		index that satisfies the qualification associated with the scan
 *		descriptor.
 *
 *		On successful exit, if the page containing the current index tuple is
 *		an overflow page, both pin and lock are released, whereas if it is a
 *		bucket page then it is pinned but not locked.  Data about the
 *		matching tuple(s) on the page has been loaded into so->currPos, and
 *		scan->xs_heaptid is set to the heap TID of the current tuple.
 *
 *		On failure exit (no more tuples), we return false, with pin held on
 *		bucket page but no pins or locks held on overflow page.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	HashScanPosItem *currItem;

	pgstat_count_index_scan(rel);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	so->hashso_sk_hash = hashkey;

	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	TestForOldSnapshot(scan->xs_snapshot, rel, page);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = opaque->hasho_bucket;

	so->hashso_bucket_buf = buf;

	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from bucket being
	 * split.  We also need to maintain a pin on the bucket being split to
	 * ensure that split-cleanup work done by vacuum doesn't remove tuples
	 * from it till this scan is done.  We need to maintain a pin on the
	 * bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on bucket
	 * being split; then, release the lock on it but not the pin; then,
	 * acquire a lock on bucket being populated and again re-verify whether
	 * the bucket split is still in progress.  Acquiring the lock on bucket
	 * being split first ensures that the vacuum waits for this scan to
	 * finish.
	 */
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/*
		 * release the lock on new bucket and re-acquire it after acquiring
		 * the lock on old bucket.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
		TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf));

		/*
		 * remember the split bucket buffer so as to use it later for
		 * scanning.
		 */
		so->hashso_split_bucket_buf = old_buf;
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->hasho_bucket == bucket);

		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		/*
		 * Backward scans that start during a split need to start from the
		 * end of the bucket being split.
		 */
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			_hash_readnext(scan, &buf, &page, &opaque);
	}

	/* remember which buffer we have pinned, if any */
	Assert(BufferIsInvalid(so->currPos.buf));
	so->currPos.buf = buf;

	/* Now find all the tuples satisfying the qualification from a page */
	if (!_hash_readpage(scan, &buf, dir))
		return false;
	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;

	/* if we're here, _hash_readpage found a valid tuple */
	return true;
}
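
/*
 * Illustrative sketch only, not code from this file: the AM-level gettuple
 * callback (hashgettuple() in hash.c) is expected to drive the two entry
 * points above roughly like this -- _hash_first() on the first call of a
 * scan, _hash_next() on later calls -- with scan->xs_heaptid holding the
 * result whenever true is returned.  Details such as kill_prior_tuple
 * handling are omitted here.
 *
 *		HashScanOpaque so = (HashScanOpaque) scan->opaque;
 *		bool		res;
 *
 *		if (!HashScanPosIsValid(so->currPos))
 *			res = _hash_first(scan, dir);
 *		else
 *			res = _hash_next(scan, dir);
 *
 *		return res;
 */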
/*
 *	_hash_readpage() -- Load data from current index page into so->currPos
 *
 *	We scan all the items in the current index page and save them into
 *	so->currPos if they satisfy the qualification. If no matching items
 *	are found in the current page, we move to the next or previous page
 *	in a bucket chain as indicated by the direction.
 *
 *	Return true if any matching items are found else return false.
 */
static bool
_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum;
	uint16		itemIndex;

	buf = *bufP;
	Assert(BufferIsValid(buf));
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	so->currPos.buf = buf;
	so->currPos.currPage = BufferGetBlockNumber(buf);

	if (ScanDirectionIsForward(dir))
	{
		BlockNumber prev_blkno = InvalidBlockNumber;

		for (;;)
		{
			/* new page, locate starting position by binary search */
			offnum = _hash_binsearch(page, so->hashso_sk_hash);

			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);

			if (itemIndex != 0)
				break;

			/*
			 * Could not find any matching tuples in the current page, move to
			 * the next page. Before leaving the current page, deal with any
			 * killed items.
			 */
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			/*
			 * If this is a primary bucket page, hasho_prevblkno is not a real
			 * block number.
			 */
			if (so->currPos.buf == so->hashso_bucket_buf ||
				so->currPos.buf == so->hashso_split_bucket_buf)
				prev_blkno = InvalidBlockNumber;
			else
				prev_blkno = opaque->hasho_prevblkno;

			_hash_readnext(scan, &buf, &page, &opaque);
			if (BufferIsValid(buf))
			{
				so->currPos.buf = buf;
				so->currPos.currPage = BufferGetBlockNumber(buf);
			}
			else
			{
				/*
				 * Remember next and previous block numbers for scrollable
				 * cursors to know the start position and return false
				 * indicating that no more matching tuples were found. Also,
				 * don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
				so->currPos.prevPage = prev_blkno;
				so->currPos.nextPage = InvalidBlockNumber;
				so->currPos.buf = buf;
				return false;
			}
		}

		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		BlockNumber next_blkno = InvalidBlockNumber;

		for (;;)
		{
			/* new page, locate starting position by binary search */
			offnum = _hash_binsearch_last(page, so->hashso_sk_hash);

			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);

			if (itemIndex != MaxIndexTuplesPerPage)
				break;

			/*
			 * Could not find any matching tuples in the current page, move to
			 * the previous page. Before leaving the current page, deal with
			 * any killed items.
			 */
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			if (so->currPos.buf == so->hashso_bucket_buf ||
				so->currPos.buf == so->hashso_split_bucket_buf)
				next_blkno = opaque->hasho_nextblkno;

			_hash_readprev(scan, &buf, &page, &opaque);
			if (BufferIsValid(buf))
			{
				so->currPos.buf = buf;
				so->currPos.currPage = BufferGetBlockNumber(buf);
			}
			else
			{
				/*
				 * Remember next and previous block numbers for scrollable
				 * cursors to know the start position and return false
				 * indicating that no more matching tuples were found. Also,
				 * don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
				so->currPos.prevPage = InvalidBlockNumber;
				so->currPos.nextPage = next_blkno;
				so->currPos.buf = buf;
				return false;
			}
		}

		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}

	if (so->currPos.buf == so->hashso_bucket_buf ||
		so->currPos.buf == so->hashso_split_bucket_buf)
	{
		so->currPos.prevPage = InvalidBlockNumber;
		so->currPos.nextPage = opaque->hasho_nextblkno;
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
		so->currPos.prevPage = opaque->hasho_prevblkno;
		so->currPos.nextPage = opaque->hasho_nextblkno;
		_hash_relbuf(rel, so->currPos.buf);
		so->currPos.buf = InvalidBuffer;
	}

	Assert(so->currPos.firstItem <= so->currPos.lastItem);
	return true;
}
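
/*
 * Note on the item-range convention above: a forward scan stores its matches
 * in so->currPos.items[0 .. lastItem] with itemIndex starting at 0, while a
 * backward scan stores them in items[firstItem .. MaxIndexTuplesPerPage - 1]
 * with itemIndex starting at the top of that range, so _hash_next() only has
 * to step itemIndex in the direction of the scan.
 */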
/*
 * Load all the qualified items from the current index page
 * into so->currPos. Helper function for _hash_readpage.
 */
static int
_hash_load_qualified_items(IndexScanDesc scan, Page page,
						   OffsetNumber offnum, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	IndexTuple	itup;
	int			itemIndex;
	OffsetNumber maxoff;

	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		while (offnum <= maxoff)
		{
			Assert(offnum >= FirstOffsetNumber);
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

			/*
			 * Skip tuples that were moved by a split operation, for a scan
			 * that started while the split was in progress. Also, skip the
			 * tuples that are marked as dead.
			 */
			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
				(scan->ignore_killed_tuples &&
				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
			{
				offnum = OffsetNumberNext(offnum);	/* move forward */
				continue;
			}

			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
				_hash_checkqual(scan, itup))
			{
				/* tuple is qualified, so remember it */
				_hash_saveitem(so, itemIndex, offnum, itup);
				itemIndex++;
			}
			else
			{
				/*
				 * No more matching tuples exist in this page, so exit the
				 * while loop.
				 */
				break;
			}

			offnum = OffsetNumberNext(offnum);
		}

		Assert(itemIndex <= MaxIndexTuplesPerPage);
		return itemIndex;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxIndexTuplesPerPage;

		while (offnum >= FirstOffsetNumber)
		{
			Assert(offnum <= maxoff);
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

			/*
			 * Skip tuples that were moved by a split operation, for a scan
			 * that started while the split was in progress. Also, skip the
			 * tuples that are marked as dead.
			 */
			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
				(scan->ignore_killed_tuples &&
				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
			{
				offnum = OffsetNumberPrev(offnum);	/* move back */
				continue;
			}

			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
				_hash_checkqual(scan, itup))
			{
				itemIndex--;
				/* tuple is qualified, so remember it */
				_hash_saveitem(so, itemIndex, offnum, itup);
			}
			else
			{
				/*
				 * No more matching tuples exist in this page, so exit the
				 * while loop.
				 */
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		Assert(itemIndex >= 0);
		return itemIndex;
	}
}
/* Save an index item into so->currPos.items[itemIndex] */
static inline void
_hash_saveitem(HashScanOpaque so, int itemIndex,
			   OffsetNumber offnum, IndexTuple itup)
{
	HashScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
}
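
/*
 * For reference, each saved entry is a HashScanPosItem; its declaration in
 * access/hash.h is roughly
 *
 *		typedef struct HashScanPosItem
 *		{
 *			ItemPointerData heapTid;		-- TID of referenced heap item
 *			OffsetNumber	indexOffset;	-- index item's location within page
 *		} HashScanPosItem;
 *
 * so _hash_saveitem() records just enough to return the heap TID later and
 * to find the index tuple again when killing dead items.
 */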