/*-------------------------------------------------------------------------
 *
 * hashsearch.c
 *	  search code for postgres hash tables
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashsearch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
						   ScanDirection dir);
static int	_hash_load_qualified_items(IndexScanDesc scan, Page page,
									   OffsetNumber offnum, ScanDirection dir);
static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
								  OffsetNumber offnum, IndexTuple itup);
static void _hash_readnext(IndexScanDesc scan, Buffer *bufp,
						   Page *pagep, HashPageOpaque *opaquep);

/*
 *	_hash_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may
 *		be pinned but not locked, and so->currPos.itemIndex identifies
 *		which item was previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the
 *		next heap tuple, and so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we return false with pin
 *		held on bucket page but no pins or locks held on overflow
 *		page.
 */
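
/*
 * A caller typically drives these entry points in a loop along these lines
 * (a sketch only; process_tid() stands in for whatever the caller does with
 * each TID):
 *
 *		if (_hash_first(scan, dir))
 *		{
 *			do
 *			{
 *				process_tid(scan->xs_heaptid);
 *			} while (_hash_next(scan, dir));
 *		}
 */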
bool
_hash_next(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	HashScanPosItem *currItem;
	BlockNumber blkno;
	Buffer		buf;
	bool		end_of_scan = false;

	/*
	 * Advance to the next tuple on the current page; or if done, try to read
	 * data from the next or previous page based on the scan direction. Before
	 * moving to the next or previous page make sure that we deal with all the
	 * killed items.
	 */
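
	/*
	 * so->killedItems[] records items whose heap tuples the executor found
	 * to be dead; _hash_kill_items() marks them on the index page so later
	 * scans can skip them, and it must run before currPos moves off the
	 * page it describes.
	 */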
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			blkno = so->currPos.nextPage;
			if (BlockNumberIsValid(blkno))
			{
				buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
				TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));
				if (!_hash_readpage(scan, &buf, dir))
					end_of_scan = true;
			}
			else
				end_of_scan = true;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			blkno = so->currPos.prevPage;
			if (BlockNumberIsValid(blkno))
			{
				buf = _hash_getbuf(rel, blkno, HASH_READ,
								   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
				TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));

				/*
				 * We always maintain the pin on the bucket page for the
				 * whole scan operation, so release the additional pin we
				 * have acquired here.
				 */
				if (buf == so->hashso_bucket_buf ||
					buf == so->hashso_split_bucket_buf)
					_hash_dropbuf(rel, buf);

				if (!_hash_readpage(scan, &buf, dir))
					end_of_scan = true;
			}
			else
				end_of_scan = true;
		}
	}

	if (end_of_scan)
	{
		_hash_dropscanbuf(rel, so);
		HashScanPosInvalidate(so->currPos);
		return false;
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;

	return true;
}

/*
 * Advance to next page in a bucket, if any. If we are scanning the bucket
 * being populated during a split operation, then this function advances to
 * the bucket being split after the last bucket page of the bucket being
 * populated.
 */
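
/*
 * On return, *bufp, *pagep and *opaquep reference the new page if one was
 * found; otherwise *bufp is left as InvalidBuffer, which callers use to
 * detect the end of the bucket chain.
 */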
static void
_hash_readnext(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		block_found = false;

	blkno = (*opaquep)->hasho_nextblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * See the comments in _hash_first for why the pin is retained.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, *bufp);

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();
	if (BlockNumberIsValid(blkno))
	{
		*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
		block_found = true;
	}
	else if (so->hashso_buc_populated && !so->hashso_buc_split)
	{
		/*
		 * end of bucket, scan bucket being split if there was a split in
		 * progress at the start of scan.
		 */
		*bufp = so->hashso_split_bucket_buf;

		/*
		 * buffer for bucket being split must be valid as we acquire the pin
		 * on it before the start of scan and retain it till end of scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot);

		/*
		 * setting hashso_buc_split to true indicates that we are scanning
		 * the bucket being split.
		 */
		so->hashso_buc_split = true;

		block_found = true;
	}

	if (block_found)
	{
		*pagep = BufferGetPage(*bufp);
		TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
	}
}

/*
 * Advance to previous page in a bucket, if any. If the current scan has
 * started during a split operation, then this function advances to the
 * bucket being populated after the first bucket page of the bucket being
 * split.
 */
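
/*
 * Note that on a primary bucket page hasho_prevblkno is not a real block
 * number (see _hash_readpage), so we never follow it when the page we are
 * leaving is a pinned primary bucket page; in that case the only remaining
 * place to go is the bucket being populated, handled below.
 */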
static void
_hash_readprev(IndexScanDesc scan,
			   Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
	BlockNumber blkno;
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		haveprevblk;

	blkno = (*opaquep)->hasho_prevblkno;

	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * See the comments in _hash_first for why the pin is retained.
	 */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
	{
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
		haveprevblk = false;
	}
	else
	{
		_hash_relbuf(rel, *bufp);
		haveprevblk = true;
	}

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();

	if (haveprevblk)
	{
		Assert(BlockNumberIsValid(blkno));
		*bufp = _hash_getbuf(rel, blkno, HASH_READ,
							 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
		*pagep = BufferGetPage(*bufp);
		TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/*
		 * We always maintain the pin on the bucket page for the whole scan
		 * operation, so release the additional pin we have acquired here.
		 */
		if (*bufp == so->hashso_bucket_buf ||
			*bufp == so->hashso_split_bucket_buf)
			_hash_dropbuf(rel, *bufp);
	}
	else if (so->hashso_buc_populated && so->hashso_buc_split)
	{
		/*
		 * end of bucket, scan bucket being populated if there was a split in
		 * progress at the start of scan.
		 */
		*bufp = so->hashso_bucket_buf;

		/*
		 * buffer for bucket being populated must be valid as we acquire the
		 * pin on it before the start of scan and retain it till end of scan.
		 */
		Assert(BufferIsValid(*bufp));

		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

		/* move to the end of bucket chain */
		while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
			_hash_readnext(scan, bufp, pagep, opaquep);

		/*
		 * setting hashso_buc_split to false indicates that we are scanning
		 * the bucket being populated.
		 */
		so->hashso_buc_split = false;
	}
}

/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		We find the first item (or, if backward scan, the last item) in the
 *		index that satisfies the qualification associated with the scan
 *		descriptor.
 *
 *		On successful exit, if the page containing the current index tuple
 *		is an overflow page, both pin and lock are released, whereas if it
 *		is a bucket page then it is pinned but not locked.  Data about the
 *		matching tuple(s) on the page has been loaded into so->currPos, and
 *		scan->xs_heaptid is set to the heap TID of the current tuple.
 *
 *		On failure exit (no more tuples), we return false, with pin held on
 *		bucket page but no pins or locks held on overflow page.
 */
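
/*
 * The single equality qual examined below is an ordinary ScanKey. As a
 * sketch only (with a made-up constant), a caller probing an int4 hash
 * index might build it as:
 *
 *		ScanKeyInit(&scan->keyData[0], 1, HTEqualStrategyNumber,
 *					F_INT4EQ, Int32GetDatum(42));
 */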
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	HashScanPosItem *currItem;

	pgstat_count_index_scan(rel);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	so->hashso_sk_hash = hashkey;

	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	TestForOldSnapshot(scan->xs_snapshot, rel, page);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = opaque->hasho_bucket;

	so->hashso_bucket_buf = buf;

	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from bucket being
	 * split.  We also need to maintain a pin on the bucket being split to
	 * ensure that split-cleanup work done by vacuum doesn't remove tuples
	 * from it till this scan is done.  We need to maintain a pin on the
	 * bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on bucket
	 * being split; then, release the lock on it but not the pin; then,
	 * acquire a lock on bucket being populated and again re-verify whether
	 * the bucket split is still in progress.  Acquiring the lock on bucket
	 * being split first ensures that the vacuum waits for this scan to
	 * finish.
	 */
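
	/*
	 * To summarize the invariant this establishes: for the duration of the
	 * scan we hold a pin (though not necessarily a lock) on the primary
	 * page of the bucket being scanned and, if a split was in progress when
	 * the scan started, on the primary page of the old bucket as well.
	 */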
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/*
		 * release the lock on new bucket and re-acquire it after acquiring
		 * the lock on old bucket.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
		TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf));

		/*
		 * remember the split bucket buffer so as to use it later for
		 * scanning.
		 */
		so->hashso_split_bucket_buf = old_buf;
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->hasho_bucket == bucket);

		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		/*
		 * Backward scans that start during split need to start from the end
		 * of the bucket being split.
		 */
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			_hash_readnext(scan, &buf, &page, &opaque);
	}

	/* remember which buffer we have pinned, if any */
	Assert(BufferIsInvalid(so->currPos.buf));
	so->currPos.buf = buf;

	/* Now find all the tuples satisfying the qualification from a page */
	if (!_hash_readpage(scan, &buf, dir))
		return false;

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;

	/* if we're here, _hash_readpage found a valid tuple */
	return true;
}

/*
 *	_hash_readpage() -- Load data from current index page into so->currPos
 *
 *	We scan all the items in the current index page and save those that
 *	satisfy the qualification into so->currPos. If no matching items are
 *	found in the current page, we move to the next or previous page in the
 *	bucket chain, as indicated by the scan direction.
 *
 *	Return true if any matching items are found, else return false.
 */
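
/*
 * so->currPos.items[] is filled from the front of the array for a forward
 * scan and from the back for a backward scan, so firstItem/lastItem always
 * bracket the saved tuples and itemIndex starts at whichever end the scan
 * direction consumes first.
 */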
static bool
_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum;
	uint16		itemIndex;

	buf = *bufP;
	Assert(BufferIsValid(buf));
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	so->currPos.buf = buf;
	so->currPos.currPage = BufferGetBlockNumber(buf);

	if (ScanDirectionIsForward(dir))
	{
		BlockNumber prev_blkno = InvalidBlockNumber;

		for (;;)
		{
			/* new page, locate starting position by binary search */
			offnum = _hash_binsearch(page, so->hashso_sk_hash);

			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);

			if (itemIndex != 0)
				break;

			/*
			 * Could not find any matching tuples in the current page, move
			 * to the next page. Before leaving the current page, deal with
			 * any killed items.
			 */
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			/*
			 * If this is a primary bucket page, hasho_prevblkno is not a
			 * real block number.
			 */
			if (so->currPos.buf == so->hashso_bucket_buf ||
				so->currPos.buf == so->hashso_split_bucket_buf)
				prev_blkno = InvalidBlockNumber;
			else
				prev_blkno = opaque->hasho_prevblkno;

			_hash_readnext(scan, &buf, &page, &opaque);
			if (BufferIsValid(buf))
			{
				so->currPos.buf = buf;
				so->currPos.currPage = BufferGetBlockNumber(buf);
			}
			else
			{
				/*
				 * Remember next and previous block numbers for scrollable
				 * cursors to know the start position and return false
				 * indicating that no more matching tuples were found. Also,
				 * don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
				so->currPos.prevPage = prev_blkno;
				so->currPos.nextPage = InvalidBlockNumber;
				so->currPos.buf = buf;
				return false;
			}
		}

		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		BlockNumber next_blkno = InvalidBlockNumber;

		for (;;)
		{
			/* new page, locate starting position by binary search */
			offnum = _hash_binsearch_last(page, so->hashso_sk_hash);

			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);

			if (itemIndex != MaxIndexTuplesPerPage)
				break;

			/*
			 * Could not find any matching tuples in the current page, move
			 * to the previous page. Before leaving the current page, deal
			 * with any killed items.
			 */
			if (so->numKilled > 0)
				_hash_kill_items(scan);

			if (so->currPos.buf == so->hashso_bucket_buf ||
				so->currPos.buf == so->hashso_split_bucket_buf)
				next_blkno = opaque->hasho_nextblkno;

			_hash_readprev(scan, &buf, &page, &opaque);
			if (BufferIsValid(buf))
			{
				so->currPos.buf = buf;
				so->currPos.currPage = BufferGetBlockNumber(buf);
			}
			else
			{
				/*
				 * Remember next and previous block numbers for scrollable
				 * cursors to know the start position and return false
				 * indicating that no more matching tuples were found. Also,
				 * don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
				so->currPos.prevPage = InvalidBlockNumber;
				so->currPos.nextPage = next_blkno;
				so->currPos.buf = buf;
				return false;
			}
		}

		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}

	if (so->currPos.buf == so->hashso_bucket_buf ||
		so->currPos.buf == so->hashso_split_bucket_buf)
	{
		so->currPos.prevPage = InvalidBlockNumber;
		so->currPos.nextPage = opaque->hasho_nextblkno;
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
		so->currPos.prevPage = opaque->hasho_prevblkno;
		so->currPos.nextPage = opaque->hasho_nextblkno;
		_hash_relbuf(rel, so->currPos.buf);
		so->currPos.buf = InvalidBuffer;
	}

	Assert(so->currPos.firstItem <= so->currPos.lastItem);
	return true;
}

/*
 *	Load all the qualified items from the current index page
 *	into so->currPos. Helper function for _hash_readpage.
 */
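
/*
 * The return value encodes where the matching tuples were stored: for a
 * forward scan it is the number of items loaded (items[0 .. result - 1]);
 * for a backward scan it is the index of the first saved item
 * (items[result .. MaxIndexTuplesPerPage - 1]). Hence "no matches" reads
 * as 0 and MaxIndexTuplesPerPage respectively, which is what
 * _hash_readpage tests for.
 */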
static int
_hash_load_qualified_items(IndexScanDesc scan, Page page,
						   OffsetNumber offnum, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	IndexTuple	itup;
	int			itemIndex;
	OffsetNumber maxoff;

	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		while (offnum <= maxoff)
		{
			Assert(offnum >= FirstOffsetNumber);
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

			/*
			 * Skip tuples that were moved by a split operation if the scan
			 * started while that split was in progress. Also, skip tuples
			 * that are marked as dead.
			 */
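			/*
			 * INDEX_MOVED_BY_SPLIT_MASK is set on tuples that a split copied
			 * into this bucket; a scan that saw the split begin reads those
			 * tuples from the old bucket instead, so it must not return the
			 * copies here.
			 */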
			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
				(scan->ignore_killed_tuples &&
				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
			{
				offnum = OffsetNumberNext(offnum);	/* move forward */
				continue;
			}

			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
				_hash_checkqual(scan, itup))
			{
				/* tuple is qualified, so remember it */
				_hash_saveitem(so, itemIndex, offnum, itup);
				itemIndex++;
			}
			else
			{
				/*
				 * No more matching tuples exist in this page, so exit the
				 * while loop.
				 */
				break;
			}

			offnum = OffsetNumberNext(offnum);
		}

		Assert(itemIndex <= MaxIndexTuplesPerPage);
		return itemIndex;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxIndexTuplesPerPage;

		while (offnum >= FirstOffsetNumber)
		{
			Assert(offnum <= maxoff);
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

			/*
			 * Skip tuples that were moved by a split operation if the scan
			 * started while that split was in progress. Also, skip tuples
			 * that are marked as dead.
			 */
			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
				(scan->ignore_killed_tuples &&
				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
			{
				offnum = OffsetNumberPrev(offnum);	/* move back */
				continue;
			}

			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
				_hash_checkqual(scan, itup))
			{
				itemIndex--;
				/* tuple is qualified, so remember it */
				_hash_saveitem(so, itemIndex, offnum, itup);
			}
			else
			{
				/*
				 * No more matching tuples exist in this page, so exit the
				 * while loop.
				 */
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		Assert(itemIndex >= 0);
		return itemIndex;
	}
}

/* Save an index item into so->currPos.items[itemIndex] */
static inline void
_hash_saveitem(HashScanOpaque so, int itemIndex,
			   OffsetNumber offnum, IndexTuple itup)
{
	HashScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
}