1 /*-------------------------------------------------------------------------
4 * page utilities routines for the postgres inverted index access method.
7 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginbtree.c
12 *-------------------------------------------------------------------------
17 #include "access/gin_private.h"
18 #include "access/ginxlog.h"
19 #include "access/xloginsert.h"
20 #include "miscadmin.h"
21 #include "storage/predicate.h"
22 #include "utils/memutils.h"
23 #include "utils/rel.h"
25 static void ginFindParents(GinBtree btree
, GinBtreeStack
*stack
);
26 static bool ginPlaceToPage(GinBtree btree
, GinBtreeStack
*stack
,
27 void *insertdata
, BlockNumber updateblkno
,
28 Buffer childbuf
, GinStatsData
*buildStats
);
29 static void ginFinishSplit(GinBtree btree
, GinBtreeStack
*stack
,
30 bool freestack
, GinStatsData
*buildStats
);
33 * Lock buffer by needed method for search.
36 ginTraverseLock(Buffer buffer
, bool searchMode
)
39 int access
= GIN_SHARE
;
41 LockBuffer(buffer
, GIN_SHARE
);
42 page
= BufferGetPage(buffer
);
43 if (GinPageIsLeaf(page
))
45 if (searchMode
== false)
47 /* we should relock our page */
48 LockBuffer(buffer
, GIN_UNLOCK
);
49 LockBuffer(buffer
, GIN_EXCLUSIVE
);
51 /* But root can become non-leaf during relock */
52 if (!GinPageIsLeaf(page
))
54 /* restore old lock type (very rare) */
55 LockBuffer(buffer
, GIN_UNLOCK
);
56 LockBuffer(buffer
, GIN_SHARE
);
59 access
= GIN_EXCLUSIVE
;
67 * Descend the tree to the leaf page that contains or would contain the key
68 * we're searching for. The key should already be filled in 'btree', in
69 * tree-type specific manner. If btree->fullScan is true, descends to the
72 * If 'searchmode' is false, on return stack->buffer is exclusively locked,
73 * and the stack represents the full path to the root. Otherwise stack->buffer
74 * is share-locked, and stack->parent is NULL.
76 * If 'rootConflictCheck' is true, tree root is checked for serialization
80 ginFindLeafPage(GinBtree btree
, bool searchMode
,
81 bool rootConflictCheck
, Snapshot snapshot
)
85 stack
= (GinBtreeStack
*) palloc(sizeof(GinBtreeStack
));
86 stack
->blkno
= btree
->rootBlkno
;
87 stack
->buffer
= ReadBuffer(btree
->index
, btree
->rootBlkno
);
89 stack
->predictNumber
= 1;
91 if (rootConflictCheck
)
92 CheckForSerializableConflictIn(btree
->index
, NULL
, btree
->rootBlkno
);
100 stack
->off
= InvalidOffsetNumber
;
102 page
= BufferGetPage(stack
->buffer
);
103 TestForOldSnapshot(snapshot
, btree
->index
, page
);
105 access
= ginTraverseLock(stack
->buffer
, searchMode
);
108 * If we're going to modify the tree, finish any incomplete splits we
109 * encounter on the way.
111 if (!searchMode
&& GinPageIsIncompleteSplit(page
))
112 ginFinishSplit(btree
, stack
, false, NULL
);
115 * ok, page is correctly locked, we should check to move right ..,
116 * root never has a right link, so small optimization
118 while (btree
->fullScan
== false && stack
->blkno
!= btree
->rootBlkno
&&
119 btree
->isMoveRight(btree
, page
))
121 BlockNumber rightlink
= GinPageGetOpaque(page
)->rightlink
;
123 if (rightlink
== InvalidBlockNumber
)
127 stack
->buffer
= ginStepRight(stack
->buffer
, btree
->index
, access
);
128 stack
->blkno
= rightlink
;
129 page
= BufferGetPage(stack
->buffer
);
130 TestForOldSnapshot(snapshot
, btree
->index
, page
);
132 if (!searchMode
&& GinPageIsIncompleteSplit(page
))
133 ginFinishSplit(btree
, stack
, false, NULL
);
136 if (GinPageIsLeaf(page
)) /* we found, return locked page */
139 /* now we have correct buffer, try to find child */
140 child
= btree
->findChildPage(btree
, stack
);
142 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
143 Assert(child
!= InvalidBlockNumber
);
144 Assert(stack
->blkno
!= child
);
148 /* in search mode we may forget path to leaf */
149 stack
->blkno
= child
;
150 stack
->buffer
= ReleaseAndReadBuffer(stack
->buffer
, btree
->index
, stack
->blkno
);
154 GinBtreeStack
*ptr
= (GinBtreeStack
*) palloc(sizeof(GinBtreeStack
));
158 stack
->blkno
= child
;
159 stack
->buffer
= ReadBuffer(btree
->index
, stack
->blkno
);
160 stack
->predictNumber
= 1;
166 * Step right from current page.
168 * The next page is locked first, before releasing the current page. This is
169 * crucial to protect from concurrent page deletion (see comment in
173 ginStepRight(Buffer buffer
, Relation index
, int lockmode
)
176 Page page
= BufferGetPage(buffer
);
177 bool isLeaf
= GinPageIsLeaf(page
);
178 bool isData
= GinPageIsData(page
);
179 BlockNumber blkno
= GinPageGetOpaque(page
)->rightlink
;
181 nextbuffer
= ReadBuffer(index
, blkno
);
182 LockBuffer(nextbuffer
, lockmode
);
183 UnlockReleaseBuffer(buffer
);
185 /* Sanity check that the page we stepped to is of similar kind. */
186 page
= BufferGetPage(nextbuffer
);
187 if (isLeaf
!= GinPageIsLeaf(page
) || isData
!= GinPageIsData(page
))
188 elog(ERROR
, "right sibling of GIN page is of different type");
194 freeGinBtreeStack(GinBtreeStack
*stack
)
198 GinBtreeStack
*tmp
= stack
->parent
;
200 if (stack
->buffer
!= InvalidBuffer
)
201 ReleaseBuffer(stack
->buffer
);
209 * Try to find parent for current stack position. Returns correct parent and
210 * child's offset in stack->parent. The root page is never released, to
211 * prevent conflict with vacuum process.
214 ginFindParents(GinBtree btree
, GinBtreeStack
*stack
)
225 * Unwind the stack all the way up to the root, leaving only the root
228 * Be careful not to release the pin on the root page! The pin on root
229 * page is required to lock out concurrent vacuums on the tree.
231 root
= stack
->parent
;
234 ReleaseBuffer(root
->buffer
);
238 Assert(root
->blkno
== btree
->rootBlkno
);
239 Assert(BufferGetBlockNumber(root
->buffer
) == btree
->rootBlkno
);
240 root
->off
= InvalidOffsetNumber
;
243 buffer
= root
->buffer
;
245 ptr
= (GinBtreeStack
*) palloc(sizeof(GinBtreeStack
));
249 LockBuffer(buffer
, GIN_EXCLUSIVE
);
250 page
= BufferGetPage(buffer
);
251 if (GinPageIsLeaf(page
))
252 elog(ERROR
, "Lost path");
254 if (GinPageIsIncompleteSplit(page
))
256 Assert(blkno
!= btree
->rootBlkno
);
258 ptr
->buffer
= buffer
;
261 * parent may be wrong, but if so, the ginFinishSplit call will
262 * recurse to call ginFindParents again to fix it.
265 ptr
->off
= InvalidOffsetNumber
;
267 ginFinishSplit(btree
, ptr
, false, NULL
);
270 leftmostBlkno
= btree
->getLeftMostChild(btree
, page
);
272 while ((offset
= btree
->findChildPtr(btree
, page
, stack
->blkno
, InvalidOffsetNumber
)) == InvalidOffsetNumber
)
274 blkno
= GinPageGetOpaque(page
)->rightlink
;
275 if (blkno
== InvalidBlockNumber
)
277 UnlockReleaseBuffer(buffer
);
280 buffer
= ginStepRight(buffer
, btree
->index
, GIN_EXCLUSIVE
);
281 page
= BufferGetPage(buffer
);
283 /* finish any incomplete splits, as above */
284 if (GinPageIsIncompleteSplit(page
))
286 Assert(blkno
!= btree
->rootBlkno
);
288 ptr
->buffer
= buffer
;
290 ptr
->off
= InvalidOffsetNumber
;
292 ginFinishSplit(btree
, ptr
, false, NULL
);
296 if (blkno
!= InvalidBlockNumber
)
299 ptr
->buffer
= buffer
;
300 ptr
->parent
= root
; /* it may be wrong, but in next call we will
307 /* Descend down to next level */
308 blkno
= leftmostBlkno
;
309 buffer
= ReadBuffer(btree
->index
, blkno
);
314 * Insert a new item to a page.
316 * Returns true if the insertion was finished. On false, the page was split and
317 * the parent needs to be updated. (A root split returns true as it doesn't
318 * need any further action by the caller to complete.)
320 * When inserting a downlink to an internal page, 'childbuf' contains the
321 * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
322 * atomically with the insert. Also, the existing item at offset stack->off
323 * in the target page is updated to point to updateblkno.
325 * stack->buffer is locked on entry, and is kept locked.
326 * Likewise for childbuf, if given.
329 ginPlaceToPage(GinBtree btree
, GinBtreeStack
*stack
,
330 void *insertdata
, BlockNumber updateblkno
,
331 Buffer childbuf
, GinStatsData
*buildStats
)
333 Page page
= BufferGetPage(stack
->buffer
);
337 Page childpage
= NULL
;
338 Page newlpage
= NULL
,
340 void *ptp_workspace
= NULL
;
341 MemoryContext tmpCxt
;
342 MemoryContext oldCxt
;
345 * We do all the work of this function and its subfunctions in a temporary
346 * memory context. This avoids leakages and simplifies APIs, since some
347 * subfunctions allocate storage that has to survive until we've finished
350 tmpCxt
= AllocSetContextCreate(CurrentMemoryContext
,
351 "ginPlaceToPage temporary context",
352 ALLOCSET_DEFAULT_SIZES
);
353 oldCxt
= MemoryContextSwitchTo(tmpCxt
);
355 if (GinPageIsData(page
))
356 xlflags
|= GIN_INSERT_ISDATA
;
357 if (GinPageIsLeaf(page
))
359 xlflags
|= GIN_INSERT_ISLEAF
;
360 Assert(!BufferIsValid(childbuf
));
361 Assert(updateblkno
== InvalidBlockNumber
);
365 Assert(BufferIsValid(childbuf
));
366 Assert(updateblkno
!= InvalidBlockNumber
);
367 childpage
= BufferGetPage(childbuf
);
371 * See if the incoming tuple will fit on the page. beginPlaceToPage will
372 * decide if the page needs to be split, and will compute the split
373 * contents if so. See comments for beginPlaceToPage and execPlaceToPage
374 * functions for more details of the API here.
376 rc
= btree
->beginPlaceToPage(btree
, stack
->buffer
, stack
,
377 insertdata
, updateblkno
,
379 &newlpage
, &newrpage
);
381 if (rc
== GPTP_NO_WORK
)
386 else if (rc
== GPTP_INSERT
)
388 /* It will fit, perform the insertion */
389 START_CRIT_SECTION();
391 if (RelationNeedsWAL(btree
->index
) && !btree
->isBuild
)
394 XLogRegisterBuffer(0, stack
->buffer
, REGBUF_STANDARD
);
395 if (BufferIsValid(childbuf
))
396 XLogRegisterBuffer(1, childbuf
, REGBUF_STANDARD
);
399 /* Perform the page update, and register any extra WAL data */
400 btree
->execPlaceToPage(btree
, stack
->buffer
, stack
,
401 insertdata
, updateblkno
, ptp_workspace
);
403 MarkBufferDirty(stack
->buffer
);
405 /* An insert to an internal page finishes the split of the child. */
406 if (BufferIsValid(childbuf
))
408 GinPageGetOpaque(childpage
)->flags
&= ~GIN_INCOMPLETE_SPLIT
;
409 MarkBufferDirty(childbuf
);
412 if (RelationNeedsWAL(btree
->index
) && !btree
->isBuild
)
416 BlockIdData childblknos
[2];
418 xlrec
.flags
= xlflags
;
420 XLogRegisterData((char *) &xlrec
, sizeof(ginxlogInsert
));
423 * Log information about child if this was an insertion of a
426 if (BufferIsValid(childbuf
))
428 BlockIdSet(&childblknos
[0], BufferGetBlockNumber(childbuf
));
429 BlockIdSet(&childblknos
[1], GinPageGetOpaque(childpage
)->rightlink
);
430 XLogRegisterData((char *) childblknos
,
431 sizeof(BlockIdData
) * 2);
434 recptr
= XLogInsert(RM_GIN_ID
, XLOG_GIN_INSERT
);
435 PageSetLSN(page
, recptr
);
436 if (BufferIsValid(childbuf
))
437 PageSetLSN(childpage
, recptr
);
442 /* Insertion is complete. */
445 else if (rc
== GPTP_SPLIT
)
448 * Didn't fit, need to split. The split has been computed in newlpage
449 * and newrpage, which are pointers to palloc'd pages, not associated
450 * with buffers. stack->buffer is not touched yet.
453 BlockNumber savedRightLink
;
455 Buffer lbuffer
= InvalidBuffer
;
456 Page newrootpg
= NULL
;
458 /* Get a new index page to become the right page */
459 rbuffer
= GinNewBuffer(btree
->index
);
461 /* During index build, count the new page */
465 buildStats
->nDataPages
++;
467 buildStats
->nEntryPages
++;
470 savedRightLink
= GinPageGetOpaque(page
)->rightlink
;
472 /* Begin setting up WAL record */
473 data
.node
= btree
->index
->rd_node
;
474 data
.flags
= xlflags
;
475 if (BufferIsValid(childbuf
))
477 data
.leftChildBlkno
= BufferGetBlockNumber(childbuf
);
478 data
.rightChildBlkno
= GinPageGetOpaque(childpage
)->rightlink
;
481 data
.leftChildBlkno
= data
.rightChildBlkno
= InvalidBlockNumber
;
483 if (stack
->parent
== NULL
)
486 * splitting the root, so we need to allocate new left page and
487 * place pointers to left and right page on root page.
489 lbuffer
= GinNewBuffer(btree
->index
);
491 /* During index build, count the new left page */
495 buildStats
->nDataPages
++;
497 buildStats
->nEntryPages
++;
500 data
.rrlink
= InvalidBlockNumber
;
501 data
.flags
|= GIN_SPLIT_ROOT
;
503 GinPageGetOpaque(newrpage
)->rightlink
= InvalidBlockNumber
;
504 GinPageGetOpaque(newlpage
)->rightlink
= BufferGetBlockNumber(rbuffer
);
507 * Construct a new root page containing downlinks to the new left
508 * and right pages. (Do this in a temporary copy rather than
509 * overwriting the original page directly, since we're not in the
510 * critical section yet.)
512 newrootpg
= PageGetTempPage(newrpage
);
513 GinInitPage(newrootpg
, GinPageGetOpaque(newlpage
)->flags
& ~(GIN_LEAF
| GIN_COMPRESSED
), BLCKSZ
);
515 btree
->fillRoot(btree
, newrootpg
,
516 BufferGetBlockNumber(lbuffer
), newlpage
,
517 BufferGetBlockNumber(rbuffer
), newrpage
);
519 if (GinPageIsLeaf(BufferGetPage(stack
->buffer
)))
522 PredicateLockPageSplit(btree
->index
,
523 BufferGetBlockNumber(stack
->buffer
),
524 BufferGetBlockNumber(lbuffer
));
526 PredicateLockPageSplit(btree
->index
,
527 BufferGetBlockNumber(stack
->buffer
),
528 BufferGetBlockNumber(rbuffer
));
534 /* splitting a non-root page */
535 data
.rrlink
= savedRightLink
;
537 GinPageGetOpaque(newrpage
)->rightlink
= savedRightLink
;
538 GinPageGetOpaque(newlpage
)->flags
|= GIN_INCOMPLETE_SPLIT
;
539 GinPageGetOpaque(newlpage
)->rightlink
= BufferGetBlockNumber(rbuffer
);
541 if (GinPageIsLeaf(BufferGetPage(stack
->buffer
)))
544 PredicateLockPageSplit(btree
->index
,
545 BufferGetBlockNumber(stack
->buffer
),
546 BufferGetBlockNumber(rbuffer
));
551 * OK, we have the new contents of the left page in a temporary copy
552 * now (newlpage), and likewise for the new contents of the
553 * newly-allocated right block. The original page is still unchanged.
555 * If this is a root split, we also have a temporary page containing
556 * the new contents of the root.
559 START_CRIT_SECTION();
561 MarkBufferDirty(rbuffer
);
562 MarkBufferDirty(stack
->buffer
);
565 * Restore the temporary copies over the real buffers.
567 if (stack
->parent
== NULL
)
569 /* Splitting the root, three pages to update */
570 MarkBufferDirty(lbuffer
);
571 memcpy(page
, newrootpg
, BLCKSZ
);
572 memcpy(BufferGetPage(lbuffer
), newlpage
, BLCKSZ
);
573 memcpy(BufferGetPage(rbuffer
), newrpage
, BLCKSZ
);
577 /* Normal split, only two pages to update */
578 memcpy(page
, newlpage
, BLCKSZ
);
579 memcpy(BufferGetPage(rbuffer
), newrpage
, BLCKSZ
);
582 /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */
583 if (BufferIsValid(childbuf
))
585 GinPageGetOpaque(childpage
)->flags
&= ~GIN_INCOMPLETE_SPLIT
;
586 MarkBufferDirty(childbuf
);
589 /* write WAL record */
590 if (RelationNeedsWAL(btree
->index
) && !btree
->isBuild
)
597 * We just take full page images of all the split pages. Splits
598 * are uncommon enough that it's not worth complicating the code
599 * to be more efficient.
601 if (stack
->parent
== NULL
)
603 XLogRegisterBuffer(0, lbuffer
, REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
604 XLogRegisterBuffer(1, rbuffer
, REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
605 XLogRegisterBuffer(2, stack
->buffer
, REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
609 XLogRegisterBuffer(0, stack
->buffer
, REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
610 XLogRegisterBuffer(1, rbuffer
, REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
612 if (BufferIsValid(childbuf
))
613 XLogRegisterBuffer(3, childbuf
, REGBUF_STANDARD
);
615 XLogRegisterData((char *) &data
, sizeof(ginxlogSplit
));
617 recptr
= XLogInsert(RM_GIN_ID
, XLOG_GIN_SPLIT
);
619 PageSetLSN(page
, recptr
);
620 PageSetLSN(BufferGetPage(rbuffer
), recptr
);
621 if (stack
->parent
== NULL
)
622 PageSetLSN(BufferGetPage(lbuffer
), recptr
);
623 if (BufferIsValid(childbuf
))
624 PageSetLSN(childpage
, recptr
);
629 * We can release the locks/pins on the new pages now, but keep
630 * stack->buffer locked. childbuf doesn't get unlocked either.
632 UnlockReleaseBuffer(rbuffer
);
633 if (stack
->parent
== NULL
)
634 UnlockReleaseBuffer(lbuffer
);
637 * If we split the root, we're done. Otherwise the split is not
638 * complete until the downlink for the new page has been inserted to
641 result
= (stack
->parent
== NULL
);
645 elog(ERROR
, "invalid return code from GIN beginPlaceToPage method: %d", rc
);
646 result
= false; /* keep compiler quiet */
649 /* Clean up temp context */
650 MemoryContextSwitchTo(oldCxt
);
651 MemoryContextDelete(tmpCxt
);
657 * Finish a split by inserting the downlink for the new page to parent.
659 * On entry, stack->buffer is exclusively locked.
661 * If freestack is true, all the buffers are released and unlocked as we
662 * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept
663 * locked, and stack is unmodified, except for possibly moving right to find
664 * the correct parent of page.
667 ginFinishSplit(GinBtree btree
, GinBtreeStack
*stack
, bool freestack
,
668 GinStatsData
*buildStats
)
675 * freestack == false when we encounter an incompletely split page during
676 * a scan, while freestack == true is used in the normal scenario that a
677 * split is finished right after the initial insert.
680 elog(DEBUG1
, "finishing incomplete split of block %u in gin index \"%s\"",
681 stack
->blkno
, RelationGetRelationName(btree
->index
));
683 /* this loop crawls up the stack until the insertion is complete */
686 GinBtreeStack
*parent
= stack
->parent
;
688 BlockNumber updateblkno
;
690 /* search parent to lock */
691 LockBuffer(parent
->buffer
, GIN_EXCLUSIVE
);
694 * If the parent page was incompletely split, finish that split first,
695 * then continue with the current one.
697 * Note: we have to finish *all* incomplete splits we encounter, even
698 * if we have to move right. Otherwise we might choose as the target a
699 * page that has no downlink in the parent, and splitting it further
702 if (GinPageIsIncompleteSplit(BufferGetPage(parent
->buffer
)))
703 ginFinishSplit(btree
, parent
, false, buildStats
);
705 /* move right if it's needed */
706 page
= BufferGetPage(parent
->buffer
);
707 while ((parent
->off
= btree
->findChildPtr(btree
, page
, stack
->blkno
, parent
->off
)) == InvalidOffsetNumber
)
709 if (GinPageRightMost(page
))
712 * rightmost page, but we don't find parent, we should use
715 LockBuffer(parent
->buffer
, GIN_UNLOCK
);
716 ginFindParents(btree
, stack
);
717 parent
= stack
->parent
;
718 Assert(parent
!= NULL
);
722 parent
->buffer
= ginStepRight(parent
->buffer
, btree
->index
, GIN_EXCLUSIVE
);
723 parent
->blkno
= BufferGetBlockNumber(parent
->buffer
);
724 page
= BufferGetPage(parent
->buffer
);
726 if (GinPageIsIncompleteSplit(BufferGetPage(parent
->buffer
)))
727 ginFinishSplit(btree
, parent
, false, buildStats
);
730 /* insert the downlink */
731 insertdata
= btree
->prepareDownlink(btree
, stack
->buffer
);
732 updateblkno
= GinPageGetOpaque(BufferGetPage(stack
->buffer
))->rightlink
;
733 done
= ginPlaceToPage(btree
, parent
,
734 insertdata
, updateblkno
,
735 stack
->buffer
, buildStats
);
739 * If the caller requested to free the stack, unlock and release the
740 * child buffer now. Otherwise keep it pinned and locked, but if we
741 * have to recurse up the tree, we can unlock the upper pages, only
742 * keeping the page at the bottom of the stack locked.
744 if (!first
|| freestack
)
745 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
748 ReleaseBuffer(stack
->buffer
);
756 /* unlock the parent */
757 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
760 freeGinBtreeStack(stack
);
764 * Insert a value to tree described by stack.
766 * The value to be inserted is given in 'insertdata'. Its format depends
767 * on whether this is an entry or data tree, ginInsertValue just passes it
768 * through to the tree-specific callback function.
770 * During an index build, buildStats is non-null and the counters it contains
771 * are incremented as needed.
773 * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
776 ginInsertValue(GinBtree btree
, GinBtreeStack
*stack
, void *insertdata
,
777 GinStatsData
*buildStats
)
781 /* If the leaf page was incompletely split, finish the split first */
782 if (GinPageIsIncompleteSplit(BufferGetPage(stack
->buffer
)))
783 ginFinishSplit(btree
, stack
, false, buildStats
);
785 done
= ginPlaceToPage(btree
, stack
,
786 insertdata
, InvalidBlockNumber
,
787 InvalidBuffer
, buildStats
);
790 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
791 freeGinBtreeStack(stack
);
794 ginFinishSplit(btree
, stack
, true, buildStats
);