1 /*-------------------------------------------------------------------------
4 * heap page pruning and HOT-chain management code
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/heap/pruneheap.c
13 *-------------------------------------------------------------------------
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
30 /* Working data for heap_page_prune and subroutines */
35 /* tuple visibility test, initialized for the relation */
36 GlobalVisState
*vistest
;
39 * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
40 * been computed (done on demand, and only if
41 * OldSnapshotThresholdActive()). The first time a tuple is about to be
42 * removed based on the limited horizon, old_snap_used is set to true, and
43 * SetOldSnapshotThresholdTimestamp() is called. See
44 * heap_prune_satisfies_vacuum().
46 TimestampTz old_snap_ts
;
47 TransactionId old_snap_xmin
;
50 TransactionId new_prune_xid
; /* new prune hint value for page */
51 TransactionId latestRemovedXid
; /* latest xid to be removed by this prune */
52 int nredirected
; /* numbers of entries in arrays below */
55 /* arrays that accumulate indexes of items to be changed */
56 OffsetNumber redirected
[MaxHeapTuplesPerPage
* 2];
57 OffsetNumber nowdead
[MaxHeapTuplesPerPage
];
58 OffsetNumber nowunused
[MaxHeapTuplesPerPage
];
59 /* marked[i] is true if item i is entered in one of the above arrays */
60 bool marked
[MaxHeapTuplesPerPage
+ 1];
64 static int heap_prune_chain(Buffer buffer
,
65 OffsetNumber rootoffnum
,
67 static void heap_prune_record_prunable(PruneState
*prstate
, TransactionId xid
);
68 static void heap_prune_record_redirect(PruneState
*prstate
,
69 OffsetNumber offnum
, OffsetNumber rdoffnum
);
70 static void heap_prune_record_dead(PruneState
*prstate
, OffsetNumber offnum
);
71 static void heap_prune_record_unused(PruneState
*prstate
, OffsetNumber offnum
);
75 * Optionally prune and repair fragmentation in the specified page.
77 * This is an opportunistic function. It will perform housekeeping
78 * only if the page heuristically looks like a candidate for pruning and we
79 * can acquire buffer cleanup lock without blocking.
81 * Note: this is called quite often. It's important that it fall out quickly
82 * if there's not any use in pruning.
84 * Caller must have pin on the buffer, and must *not* have a lock on it.
87 heap_page_prune_opt(Relation relation
, Buffer buffer
)
89 Page page
= BufferGetPage(buffer
);
90 TransactionId prune_xid
;
91 GlobalVisState
*vistest
;
92 TransactionId limited_xmin
= InvalidTransactionId
;
93 TimestampTz limited_ts
= 0;
97 * We can't write WAL in recovery mode, so there's no point trying to
98 * clean the page. The primary will likely issue a cleaning WAL record
99 * soon anyway, so this is no particular loss.
101 if (RecoveryInProgress())
105 * XXX: Magic to keep old_snapshot_threshold tests appear "working". They
106 * currently are broken, and discussion of what to do about them is
108 * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
110 if (old_snapshot_threshold
== 0)
111 SnapshotTooOldMagicForTest();
114 * First check whether there's any chance there's something to prune,
115 * determining the appropriate horizon is a waste if there's no prune_xid
116 * (i.e. no updates/deletes left potentially dead tuples around).
118 prune_xid
= ((PageHeader
) page
)->pd_prune_xid
;
119 if (!TransactionIdIsValid(prune_xid
))
123 * Check whether prune_xid indicates that there may be dead rows that can
126 * It is OK to check the old snapshot limit before acquiring the cleanup
127 * lock because the worst that can happen is that we are not quite as
128 * aggressive about the cleanup (by however many transaction IDs are
129 * consumed between this point and acquiring the lock). This allows us to
130 * save significant overhead in the case where the page is found not to be
133 * Even if old_snapshot_threshold is set, we first check whether the page
134 * can be pruned without. Both because
135 * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
136 * unnecessarily relying on old_snapshot_threshold avoids causing
139 vistest
= GlobalVisTestFor(relation
);
141 if (!GlobalVisTestIsRemovableXid(vistest
, prune_xid
))
143 if (!OldSnapshotThresholdActive())
146 if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest
),
148 &limited_xmin
, &limited_ts
))
151 if (!TransactionIdPrecedes(prune_xid
, limited_xmin
))
156 * We prune when a previous UPDATE failed to find enough space on the page
157 * for a new tuple version, or when free space falls below the relation's
158 * fill-factor target (but not less than 10%).
160 * Checking free space here is questionable since we aren't holding any
161 * lock on the buffer; in the worst case we could get a bogus answer. It's
162 * unlikely to be *seriously* wrong, though, since reading either pd_lower
163 * or pd_upper is probably atomic. Avoiding taking a lock seems more
164 * important than sometimes getting a wrong answer in what is after all
165 * just a heuristic estimate.
167 minfree
= RelationGetTargetPageFreeSpace(relation
,
168 HEAP_DEFAULT_FILLFACTOR
);
169 minfree
= Max(minfree
, BLCKSZ
/ 10);
171 if (PageIsFull(page
) || PageGetHeapFreeSpace(page
) < minfree
)
173 /* OK, try to get exclusive buffer lock */
174 if (!ConditionalLockBufferForCleanup(buffer
))
178 * Now that we have buffer lock, get accurate information about the
179 * page's free space, and recheck the heuristic about whether to
180 * prune. (We needn't recheck PageIsPrunable, since no one else could
181 * have pruned while we hold pin.)
183 if (PageIsFull(page
) || PageGetHeapFreeSpace(page
) < minfree
)
188 ndeleted
= heap_page_prune(relation
, buffer
, vistest
, limited_xmin
,
189 limited_ts
, &nnewlpdead
, NULL
);
192 * Report the number of tuples reclaimed to pgstats. This is
193 * ndeleted minus the number of newly-LP_DEAD-set items.
195 * We derive the number of dead tuples like this to avoid totally
196 * forgetting about items that were set to LP_DEAD, since they
197 * still need to be cleaned up by VACUUM. We only want to count
198 * heap-only tuples that just became LP_UNUSED in our report,
201 * VACUUM doesn't have to compensate in the same way when it
202 * tracks ndeleted, since it will set the same LP_DEAD items to
203 * LP_UNUSED separately.
205 if (ndeleted
> nnewlpdead
)
206 pgstat_update_heap_dead_tuples(relation
,
207 ndeleted
- nnewlpdead
);
210 /* And release buffer lock */
211 LockBuffer(buffer
, BUFFER_LOCK_UNLOCK
);
214 * We avoid reuse of any free space created on the page by unrelated
215 * UPDATEs/INSERTs by opting to not update the FSM at this point. The
216 * free space should be reused by UPDATEs to *this* page.
223 * Prune and repair fragmentation in the specified page.
225 * Caller must have pin and buffer cleanup lock on the page. Note that we
226 * don't update the FSM information for page on caller's behalf.
228 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
229 * (see heap_prune_satisfies_vacuum and
230 * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to
231 * either have been set by TransactionIdLimitedForOldSnapshots, or
232 * InvalidTransactionId/0 respectively.
234 * Sets *nnewlpdead for caller, indicating the number of items that were
235 * newly set LP_DEAD during prune operation.
237 * off_loc is the offset location required by the caller to use in error
240 * Returns the number of tuples deleted from the page during this call.
243 heap_page_prune(Relation relation
, Buffer buffer
,
244 GlobalVisState
*vistest
,
245 TransactionId old_snap_xmin
,
246 TimestampTz old_snap_ts
,
248 OffsetNumber
*off_loc
)
251 Page page
= BufferGetPage(buffer
);
257 * Our strategy is to scan the page and make lists of items to change,
258 * then apply the changes within a critical section. This keeps as much
259 * logic as possible out of the critical section, and also ensures that
260 * WAL replay will work the same as the normal case.
262 * First, initialize the new pd_prune_xid value to zero (indicating no
263 * prunable tuples). If we find any tuples which may soon become
264 * prunable, we will save the lowest relevant XID in new_prune_xid. Also
265 * initialize the rest of our working state.
267 prstate
.new_prune_xid
= InvalidTransactionId
;
268 prstate
.rel
= relation
;
269 prstate
.vistest
= vistest
;
270 prstate
.old_snap_xmin
= old_snap_xmin
;
271 prstate
.old_snap_ts
= old_snap_ts
;
272 prstate
.old_snap_used
= false;
273 prstate
.latestRemovedXid
= InvalidTransactionId
;
274 prstate
.nredirected
= prstate
.ndead
= prstate
.nunused
= 0;
275 memset(prstate
.marked
, 0, sizeof(prstate
.marked
));
278 maxoff
= PageGetMaxOffsetNumber(page
);
279 for (offnum
= FirstOffsetNumber
;
281 offnum
= OffsetNumberNext(offnum
))
285 /* Ignore items already processed as part of an earlier chain */
286 if (prstate
.marked
[offnum
])
290 * Set the offset number so that we can display it along with any
291 * error that occurred while processing this tuple.
296 /* Nothing to do if slot is empty or already dead */
297 itemid
= PageGetItemId(page
, offnum
);
298 if (!ItemIdIsUsed(itemid
) || ItemIdIsDead(itemid
))
301 /* Process this item or chain of items */
302 ndeleted
+= heap_prune_chain(buffer
, offnum
, &prstate
);
305 /* Clear the offset information once we have processed the given page. */
307 *off_loc
= InvalidOffsetNumber
;
309 /* Any error while applying the changes is critical */
310 START_CRIT_SECTION();
312 /* Have we found any prunable items? */
313 if (prstate
.nredirected
> 0 || prstate
.ndead
> 0 || prstate
.nunused
> 0)
316 * Apply the planned item changes, then repair page fragmentation, and
317 * update the page's hint bit about whether it has free line pointers.
319 heap_page_prune_execute(buffer
,
320 prstate
.redirected
, prstate
.nredirected
,
321 prstate
.nowdead
, prstate
.ndead
,
322 prstate
.nowunused
, prstate
.nunused
);
325 * Update the page's pd_prune_xid field to either zero, or the lowest
326 * XID of any soon-prunable tuple.
328 ((PageHeader
) page
)->pd_prune_xid
= prstate
.new_prune_xid
;
331 * Also clear the "page is full" flag, since there's no point in
332 * repeating the prune/defrag process until something else happens to
337 MarkBufferDirty(buffer
);
340 * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
342 if (RelationNeedsWAL(relation
))
347 xlrec
.latestRemovedXid
= prstate
.latestRemovedXid
;
348 xlrec
.nredirected
= prstate
.nredirected
;
349 xlrec
.ndead
= prstate
.ndead
;
352 XLogRegisterData((char *) &xlrec
, SizeOfHeapPrune
);
354 XLogRegisterBuffer(0, buffer
, REGBUF_STANDARD
);
357 * The OffsetNumber arrays are not actually in the buffer, but we
358 * pretend that they are. When XLogInsert stores the whole
359 * buffer, the offset arrays need not be stored too.
361 if (prstate
.nredirected
> 0)
362 XLogRegisterBufData(0, (char *) prstate
.redirected
,
363 prstate
.nredirected
*
364 sizeof(OffsetNumber
) * 2);
366 if (prstate
.ndead
> 0)
367 XLogRegisterBufData(0, (char *) prstate
.nowdead
,
368 prstate
.ndead
* sizeof(OffsetNumber
));
370 if (prstate
.nunused
> 0)
371 XLogRegisterBufData(0, (char *) prstate
.nowunused
,
372 prstate
.nunused
* sizeof(OffsetNumber
));
374 recptr
= XLogInsert(RM_HEAP2_ID
, XLOG_HEAP2_PRUNE
);
376 PageSetLSN(BufferGetPage(buffer
), recptr
);
382 * If we didn't prune anything, but have found a new value for the
383 * pd_prune_xid field, update it and mark the buffer dirty. This is
384 * treated as a non-WAL-logged hint.
386 * Also clear the "page is full" flag if it is set, since there's no
387 * point in repeating the prune/defrag process until something else
388 * happens to the page.
390 if (((PageHeader
) page
)->pd_prune_xid
!= prstate
.new_prune_xid
||
393 ((PageHeader
) page
)->pd_prune_xid
= prstate
.new_prune_xid
;
395 MarkBufferDirtyHint(buffer
, true);
401 /* Record number of newly-set-LP_DEAD items for caller */
402 *nnewlpdead
= prstate
.ndead
;
409 * Perform visibility checks for heap pruning.
411 * This is more complicated than just using GlobalVisTestIsRemovableXid()
412 * because of old_snapshot_threshold. We only want to increase the threshold
413 * that triggers errors for old snapshots when we actually decide to remove a
414 * row based on the limited horizon.
416 * Due to its cost we also only want to call
417 * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
418 * done so in heap_hot_prune_opt() if pd_prune_xid was old enough. But we
419 * still want to be able to remove rows that are too new to be removed
420 * according to prstate->vistest, but that can be removed based on
421 * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
422 * demand in here, if appropriate.
425 heap_prune_satisfies_vacuum(PruneState
*prstate
, HeapTuple tup
, Buffer buffer
)
428 TransactionId dead_after
;
430 res
= HeapTupleSatisfiesVacuumHorizon(tup
, buffer
, &dead_after
);
432 if (res
!= HEAPTUPLE_RECENTLY_DEAD
)
436 * If we are already relying on the limited xmin, there is no need to
437 * delay doing so anymore.
439 if (prstate
->old_snap_used
)
441 Assert(TransactionIdIsValid(prstate
->old_snap_xmin
));
443 if (TransactionIdPrecedes(dead_after
, prstate
->old_snap_xmin
))
444 res
= HEAPTUPLE_DEAD
;
449 * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
450 * row dead. If not, and old_snapshot_threshold is enabled, try to use the
453 if (GlobalVisTestIsRemovableXid(prstate
->vistest
, dead_after
))
454 res
= HEAPTUPLE_DEAD
;
455 else if (OldSnapshotThresholdActive())
457 /* haven't determined limited horizon yet, requests */
458 if (!TransactionIdIsValid(prstate
->old_snap_xmin
))
460 TransactionId horizon
=
461 GlobalVisTestNonRemovableHorizon(prstate
->vistest
);
463 TransactionIdLimitedForOldSnapshots(horizon
, prstate
->rel
,
464 &prstate
->old_snap_xmin
,
465 &prstate
->old_snap_ts
);
468 if (TransactionIdIsValid(prstate
->old_snap_xmin
) &&
469 TransactionIdPrecedes(dead_after
, prstate
->old_snap_xmin
))
472 * About to remove row based on snapshot_too_old. Need to raise
473 * the threshold so problematic accesses would error.
475 Assert(!prstate
->old_snap_used
);
476 SetOldSnapshotThresholdTimestamp(prstate
->old_snap_ts
,
477 prstate
->old_snap_xmin
);
478 prstate
->old_snap_used
= true;
479 res
= HEAPTUPLE_DEAD
;
488 * Prune specified line pointer or a HOT chain originating at line pointer.
490 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
491 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
492 * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
493 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
494 * DEAD, the heap_prune_satisfies_vacuum test is just too coarse to detect it.
496 * In general, pruning must never leave behind a DEAD tuple that still has
497 * tuple storage. VACUUM isn't prepared to deal with that case. That's why
498 * VACUUM prunes the same heap page a second time (without dropping its lock
499 * in the interim) when it sees a newly DEAD tuple that we initially saw as
500 * in-progress. Retrying pruning like this can only happen when an inserting
501 * transaction concurrently aborts.
503 * The root line pointer is redirected to the tuple immediately after the
504 * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
505 * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
506 * tuple, which we treat as a chain of length 1.)
508 * prstate->vistest is used to distinguish whether tuples are DEAD or
511 * We don't actually change the page here, except perhaps for hint-bit updates
512 * caused by heap_prune_satisfies_vacuum. We just add entries to the arrays in
513 * prstate showing the changes to be made. Items to be redirected are added
514 * to the redirected[] array (two entries per redirection); items to be set to
515 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
516 * state are added to nowunused[].
518 * Returns the number of tuples (to be) deleted from the page.
521 heap_prune_chain(Buffer buffer
, OffsetNumber rootoffnum
, PruneState
*prstate
)
524 Page dp
= (Page
) BufferGetPage(buffer
);
525 TransactionId priorXmax
= InvalidTransactionId
;
527 HeapTupleHeader htup
;
528 OffsetNumber latestdead
= InvalidOffsetNumber
,
529 maxoff
= PageGetMaxOffsetNumber(dp
),
531 OffsetNumber chainitems
[MaxHeapTuplesPerPage
];
536 tup
.t_tableOid
= RelationGetRelid(prstate
->rel
);
538 rootlp
= PageGetItemId(dp
, rootoffnum
);
541 * If it's a heap-only tuple, then it is not the start of a HOT chain.
543 if (ItemIdIsNormal(rootlp
))
545 htup
= (HeapTupleHeader
) PageGetItem(dp
, rootlp
);
548 tup
.t_len
= ItemIdGetLength(rootlp
);
549 ItemPointerSet(&(tup
.t_self
), BufferGetBlockNumber(buffer
), rootoffnum
);
551 if (HeapTupleHeaderIsHeapOnly(htup
))
554 * If the tuple is DEAD and doesn't chain to anything else, mark
555 * it unused immediately. (If it does chain, we can only remove
556 * it as part of pruning its chain.)
558 * We need this primarily to handle aborted HOT updates, that is,
559 * XMIN_INVALID heap-only tuples. Those might not be linked to by
560 * any chain, since the parent tuple might be re-updated before
561 * any pruning occurs. So we have to be able to reap them
562 * separately from chain-pruning. (Note that
563 * HeapTupleHeaderIsHotUpdated will never return true for an
564 * XMIN_INVALID tuple, so this code will work even when there were
565 * sequential updates within the aborted transaction.)
567 * Note that we might first arrive at a dead heap-only tuple
568 * either here or while following a chain below. Whichever path
569 * gets there first will mark the tuple unused.
571 if (heap_prune_satisfies_vacuum(prstate
, &tup
, buffer
)
572 == HEAPTUPLE_DEAD
&& !HeapTupleHeaderIsHotUpdated(htup
))
574 heap_prune_record_unused(prstate
, rootoffnum
);
575 HeapTupleHeaderAdvanceLatestRemovedXid(htup
,
576 &prstate
->latestRemovedXid
);
580 /* Nothing more to do */
585 /* Start from the root tuple */
588 /* while not end of the chain */
595 /* Sanity check (pure paranoia) */
596 if (offnum
< FirstOffsetNumber
)
600 * An offset past the end of page's line pointer array is possible
601 * when the array was truncated (original item must have been unused)
606 /* If item is already processed, stop --- it must not be same chain */
607 if (prstate
->marked
[offnum
])
610 lp
= PageGetItemId(dp
, offnum
);
612 /* Unused item obviously isn't part of the chain */
613 if (!ItemIdIsUsed(lp
))
617 * If we are looking at the redirected root line pointer, jump to the
618 * first normal tuple in the chain. If we find a redirect somewhere
619 * else, stop --- it must not be same chain.
621 if (ItemIdIsRedirected(lp
))
624 break; /* not at start of chain */
625 chainitems
[nchain
++] = offnum
;
626 offnum
= ItemIdGetRedirect(rootlp
);
631 * Likewise, a dead line pointer can't be part of the chain. (We
632 * already eliminated the case of dead root tuple outside this
635 if (ItemIdIsDead(lp
))
638 Assert(ItemIdIsNormal(lp
));
639 htup
= (HeapTupleHeader
) PageGetItem(dp
, lp
);
642 tup
.t_len
= ItemIdGetLength(lp
);
643 ItemPointerSet(&(tup
.t_self
), BufferGetBlockNumber(buffer
), offnum
);
646 * Check the tuple XMIN against prior XMAX, if any
648 if (TransactionIdIsValid(priorXmax
) &&
649 !TransactionIdEquals(HeapTupleHeaderGetXmin(htup
), priorXmax
))
653 * OK, this tuple is indeed a member of the chain.
655 chainitems
[nchain
++] = offnum
;
658 * Check tuple's visibility status.
660 tupdead
= recent_dead
= false;
662 switch (heap_prune_satisfies_vacuum(prstate
, &tup
, buffer
))
668 case HEAPTUPLE_RECENTLY_DEAD
:
672 * This tuple may soon become DEAD. Update the hint field so
673 * that the page is reconsidered for pruning in future.
675 heap_prune_record_prunable(prstate
,
676 HeapTupleHeaderGetUpdateXid(htup
));
679 case HEAPTUPLE_DELETE_IN_PROGRESS
:
682 * This tuple may soon become DEAD. Update the hint field so
683 * that the page is reconsidered for pruning in future.
685 heap_prune_record_prunable(prstate
,
686 HeapTupleHeaderGetUpdateXid(htup
));
690 case HEAPTUPLE_INSERT_IN_PROGRESS
:
693 * If we wanted to optimize for aborts, we might consider
694 * marking the page prunable when we see INSERT_IN_PROGRESS.
695 * But we don't. See related decisions about when to mark the
696 * page prunable in heapam.c.
701 elog(ERROR
, "unexpected HeapTupleSatisfiesVacuum result");
706 * Remember the last DEAD tuple seen. We will advance past
707 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
708 * but we can't advance past anything else. We have to make sure that
709 * we don't miss any DEAD tuples, since DEAD tuples that still have
710 * tuple storage after pruning will confuse VACUUM.
715 HeapTupleHeaderAdvanceLatestRemovedXid(htup
,
716 &prstate
->latestRemovedXid
);
718 else if (!recent_dead
)
722 * If the tuple is not HOT-updated, then we are at the end of this
725 if (!HeapTupleHeaderIsHotUpdated(htup
))
728 /* HOT implies it can't have moved to different partition */
729 Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup
));
732 * Advance to next chain member.
734 Assert(ItemPointerGetBlockNumber(&htup
->t_ctid
) ==
735 BufferGetBlockNumber(buffer
));
736 offnum
= ItemPointerGetOffsetNumber(&htup
->t_ctid
);
737 priorXmax
= HeapTupleHeaderGetUpdateXid(htup
);
741 * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
742 * the DEAD tuples at the start of the chain are removed and the root line
743 * pointer is appropriately redirected.
745 if (OffsetNumberIsValid(latestdead
))
748 * Mark as unused each intermediate item that we are able to remove
751 * When the previous item is the last dead tuple seen, we are at the
752 * right candidate for redirection.
754 for (i
= 1; (i
< nchain
) && (chainitems
[i
- 1] != latestdead
); i
++)
756 heap_prune_record_unused(prstate
, chainitems
[i
]);
761 * If the root entry had been a normal tuple, we are deleting it, so
762 * count it in the result. But changing a redirect (even to DEAD
763 * state) doesn't count.
765 if (ItemIdIsNormal(rootlp
))
769 * If the DEAD tuple is at the end of the chain, the entire chain is
770 * dead and the root line pointer can be marked dead. Otherwise just
771 * redirect the root to the correct chain member.
774 heap_prune_record_dead(prstate
, rootoffnum
);
776 heap_prune_record_redirect(prstate
, rootoffnum
, chainitems
[i
]);
778 else if (nchain
< 2 && ItemIdIsRedirected(rootlp
))
781 * We found a redirect item that doesn't point to a valid follow-on
782 * item. This can happen if the loop in heap_page_prune caused us to
783 * visit the dead successor of a redirect item before visiting the
784 * redirect item. We can clean up by setting the redirect item to
787 heap_prune_record_dead(prstate
, rootoffnum
);
793 /* Record lowest soon-prunable XID */
795 heap_prune_record_prunable(PruneState
*prstate
, TransactionId xid
)
798 * This should exactly match the PageSetPrunable macro. We can't store
799 * directly into the page header yet, so we update working state.
801 Assert(TransactionIdIsNormal(xid
));
802 if (!TransactionIdIsValid(prstate
->new_prune_xid
) ||
803 TransactionIdPrecedes(xid
, prstate
->new_prune_xid
))
804 prstate
->new_prune_xid
= xid
;
807 /* Record line pointer to be redirected */
809 heap_prune_record_redirect(PruneState
*prstate
,
810 OffsetNumber offnum
, OffsetNumber rdoffnum
)
812 Assert(prstate
->nredirected
< MaxHeapTuplesPerPage
);
813 prstate
->redirected
[prstate
->nredirected
* 2] = offnum
;
814 prstate
->redirected
[prstate
->nredirected
* 2 + 1] = rdoffnum
;
815 prstate
->nredirected
++;
816 Assert(!prstate
->marked
[offnum
]);
817 prstate
->marked
[offnum
] = true;
818 Assert(!prstate
->marked
[rdoffnum
]);
819 prstate
->marked
[rdoffnum
] = true;
822 /* Record line pointer to be marked dead */
824 heap_prune_record_dead(PruneState
*prstate
, OffsetNumber offnum
)
826 Assert(prstate
->ndead
< MaxHeapTuplesPerPage
);
827 prstate
->nowdead
[prstate
->ndead
] = offnum
;
829 Assert(!prstate
->marked
[offnum
]);
830 prstate
->marked
[offnum
] = true;
833 /* Record line pointer to be marked unused */
835 heap_prune_record_unused(PruneState
*prstate
, OffsetNumber offnum
)
837 Assert(prstate
->nunused
< MaxHeapTuplesPerPage
);
838 prstate
->nowunused
[prstate
->nunused
] = offnum
;
840 Assert(!prstate
->marked
[offnum
]);
841 prstate
->marked
[offnum
] = true;
846 * Perform the actual page changes needed by heap_page_prune.
847 * It is expected that the caller has a full cleanup lock on the
851 heap_page_prune_execute(Buffer buffer
,
852 OffsetNumber
*redirected
, int nredirected
,
853 OffsetNumber
*nowdead
, int ndead
,
854 OffsetNumber
*nowunused
, int nunused
)
856 Page page
= (Page
) BufferGetPage(buffer
);
857 OffsetNumber
*offnum
;
858 HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY
;
860 /* Shouldn't be called unless there's something to do */
861 Assert(nredirected
> 0 || ndead
> 0 || nunused
> 0);
863 /* Update all redirected line pointers */
865 for (int i
= 0; i
< nredirected
; i
++)
867 OffsetNumber fromoff
= *offnum
++;
868 OffsetNumber tooff
= *offnum
++;
869 ItemId fromlp
= PageGetItemId(page
, fromoff
);
870 ItemId tolp PG_USED_FOR_ASSERTS_ONLY
;
872 #ifdef USE_ASSERT_CHECKING
875 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
876 * must be the first item from a HOT chain. If the item has tuple
877 * storage then it can't be a heap-only tuple. Otherwise we are just
878 * maintaining an existing LP_REDIRECT from an existing HOT chain that
879 * has been pruned at least once before now.
881 if (!ItemIdIsRedirected(fromlp
))
883 Assert(ItemIdHasStorage(fromlp
) && ItemIdIsNormal(fromlp
));
885 htup
= (HeapTupleHeader
) PageGetItem(page
, fromlp
);
886 Assert(!HeapTupleHeaderIsHeapOnly(htup
));
890 /* We shouldn't need to redundantly set the redirect */
891 Assert(ItemIdGetRedirect(fromlp
) != tooff
);
895 * The item that we're about to set as an LP_REDIRECT (the 'from'
896 * item) will point to an existing item (the 'to' item) that is
897 * already a heap-only tuple. There can be at most one LP_REDIRECT
898 * item per HOT chain.
900 * We need to keep around an LP_REDIRECT item (after original
901 * non-heap-only root tuple gets pruned away) so that it's always
902 * possible for VACUUM to easily figure out what TID to delete from
903 * indexes when an entire HOT chain becomes dead. A heap-only tuple
904 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
907 tolp
= PageGetItemId(page
, tooff
);
908 Assert(ItemIdHasStorage(tolp
) && ItemIdIsNormal(tolp
));
909 htup
= (HeapTupleHeader
) PageGetItem(page
, tolp
);
910 Assert(HeapTupleHeaderIsHeapOnly(htup
));
913 ItemIdSetRedirect(fromlp
, tooff
);
916 /* Update all now-dead line pointers */
918 for (int i
= 0; i
< ndead
; i
++)
920 OffsetNumber off
= *offnum
++;
921 ItemId lp
= PageGetItemId(page
, off
);
923 #ifdef USE_ASSERT_CHECKING
926 * An LP_DEAD line pointer must be left behind when the original item
927 * (which is dead to everybody) could still be referenced by a TID in
928 * an index. This should never be necessary with any individual
929 * heap-only tuple item, though. (It's not clear how much of a problem
930 * that would be, but there is no reason to allow it.)
932 if (ItemIdHasStorage(lp
))
934 Assert(ItemIdIsNormal(lp
));
935 htup
= (HeapTupleHeader
) PageGetItem(page
, lp
);
936 Assert(!HeapTupleHeaderIsHeapOnly(htup
));
940 /* Whole HOT chain becomes dead */
941 Assert(ItemIdIsRedirected(lp
));
948 /* Update all now-unused line pointers */
950 for (int i
= 0; i
< nunused
; i
++)
952 OffsetNumber off
= *offnum
++;
953 ItemId lp
= PageGetItemId(page
, off
);
955 #ifdef USE_ASSERT_CHECKING
958 * Only heap-only tuples can become LP_UNUSED during pruning. They
959 * don't need to be left in place as LP_DEAD items until VACUUM gets
960 * around to doing index vacuuming.
962 Assert(ItemIdHasStorage(lp
) && ItemIdIsNormal(lp
));
963 htup
= (HeapTupleHeader
) PageGetItem(page
, lp
);
964 Assert(HeapTupleHeaderIsHeapOnly(htup
));
971 * Finally, repair any fragmentation, and update the page's hint bit about
972 * whether it has free pointers.
974 PageRepairFragmentation(page
);
979 * For all items in this page, find their respective root line pointers.
980 * If item k is part of a HOT-chain with root at item j, then we set
981 * root_offsets[k - 1] = j.
983 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
984 * Unused entries are filled with InvalidOffsetNumber (zero).
986 * The function must be called with at least share lock on the buffer, to
987 * prevent concurrent prune operations.
989 * Note: The information collected here is valid only as long as the caller
990 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
991 * and reused by a completely unrelated tuple.
994 heap_get_root_tuples(Page page
, OffsetNumber
*root_offsets
)
999 MemSet(root_offsets
, InvalidOffsetNumber
,
1000 MaxHeapTuplesPerPage
* sizeof(OffsetNumber
));
1002 maxoff
= PageGetMaxOffsetNumber(page
);
1003 for (offnum
= FirstOffsetNumber
; offnum
<= maxoff
; offnum
= OffsetNumberNext(offnum
))
1005 ItemId lp
= PageGetItemId(page
, offnum
);
1006 HeapTupleHeader htup
;
1007 OffsetNumber nextoffnum
;
1008 TransactionId priorXmax
;
1010 /* skip unused and dead items */
1011 if (!ItemIdIsUsed(lp
) || ItemIdIsDead(lp
))
1014 if (ItemIdIsNormal(lp
))
1016 htup
= (HeapTupleHeader
) PageGetItem(page
, lp
);
1019 * Check if this tuple is part of a HOT-chain rooted at some other
1020 * tuple. If so, skip it for now; we'll process it when we find
1023 if (HeapTupleHeaderIsHeapOnly(htup
))
1027 * This is either a plain tuple or the root of a HOT-chain.
1028 * Remember it in the mapping.
1030 root_offsets
[offnum
- 1] = offnum
;
1032 /* If it's not the start of a HOT-chain, we're done with it */
1033 if (!HeapTupleHeaderIsHotUpdated(htup
))
1036 /* Set up to scan the HOT-chain */
1037 nextoffnum
= ItemPointerGetOffsetNumber(&htup
->t_ctid
);
1038 priorXmax
= HeapTupleHeaderGetUpdateXid(htup
);
1042 /* Must be a redirect item. We do not set its root_offsets entry */
1043 Assert(ItemIdIsRedirected(lp
));
1044 /* Set up to scan the HOT-chain */
1045 nextoffnum
= ItemIdGetRedirect(lp
);
1046 priorXmax
= InvalidTransactionId
;
1050 * Now follow the HOT-chain and collect other tuples in the chain.
1052 * Note: Even though this is a nested loop, the complexity of the
1053 * function is O(N) because a tuple in the page should be visited not
1054 * more than twice, once in the outer loop and once in HOT-chain
1059 /* Sanity check (pure paranoia) */
1060 if (offnum
< FirstOffsetNumber
)
1064 * An offset past the end of page's line pointer array is possible
1065 * when the array was truncated
1067 if (offnum
> maxoff
)
1070 lp
= PageGetItemId(page
, nextoffnum
);
1072 /* Check for broken chains */
1073 if (!ItemIdIsNormal(lp
))
1076 htup
= (HeapTupleHeader
) PageGetItem(page
, lp
);
1078 if (TransactionIdIsValid(priorXmax
) &&
1079 !TransactionIdEquals(priorXmax
, HeapTupleHeaderGetXmin(htup
)))
1082 /* Remember the root line pointer for this item */
1083 root_offsets
[nextoffnum
- 1] = offnum
;
1085 /* Advance to next chain member, if any */
1086 if (!HeapTupleHeaderIsHotUpdated(htup
))
1089 /* HOT implies it can't have moved to different partition */
1090 Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup
));
1092 nextoffnum
= ItemPointerGetOffsetNumber(&htup
->t_ctid
);
1093 priorXmax
= HeapTupleHeaderGetUpdateXid(htup
);