/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *	  heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune and subroutines */
typedef struct
{
	Relation	rel;

	/* tuple visibility test, initialized for the relation */
	GlobalVisState *vistest;

	/*
	 * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
	 * been computed (done on demand, and only if
	 * OldSnapshotThresholdActive()).  The first time a tuple is about to be
	 * removed based on the limited horizon, old_snap_used is set to true,
	 * and SetOldSnapshotThresholdTimestamp() is called.  See
	 * heap_prune_satisfies_vacuum().
	 */
	TimestampTz old_snap_ts;
	TransactionId old_snap_xmin;
	bool		old_snap_used;

	TransactionId new_prune_xid;	/* new prune hint value for page */
	TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	/* arrays that accumulate indexes of items to be changed */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	/* marked[i] is true if item i is entered in one of the above arrays */
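	/* (item numbers are 1-based OffsetNumbers, hence the extra element) */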
	bool		marked[MaxHeapTuplesPerPage + 1];
} PruneState;

/* Local functions */
static int	heap_prune_chain(Buffer buffer,
							 OffsetNumber rootoffnum,
							 PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
									   OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);

/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping only if
 * the page heuristically looks like a candidate for pruning and we can
 * acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	TransactionId prune_xid;
	GlobalVisState *vistest;
	TransactionId limited_xmin = InvalidTransactionId;
	TimestampTz limited_ts = 0;
	Size		minfree;

	/*
	 * We can't write WAL in recovery mode, so there's no point trying to
	 * clean the page.  The primary will likely issue a cleaning WAL record
	 * soon anyway, so this is no particular loss.
	 */
	if (RecoveryInProgress())
		return;

	/*
	 * XXX: Magic to keep old_snapshot_threshold tests appearing to "work".
	 * They are currently broken, and discussion of what to do about them is
	 * ongoing.  See
	 * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
	 */
	if (old_snapshot_threshold == 0)
		SnapshotTooOldMagicForTest();

	/*
	 * First check whether there's any chance there's something to prune;
	 * determining the appropriate horizon is a waste if there's no prune_xid
	 * (i.e. no updates/deletes left potentially dead tuples around).
	 */
	prune_xid = ((PageHeader) page)->pd_prune_xid;
	if (!TransactionIdIsValid(prune_xid))
		return;

	/*
	 * Check whether prune_xid indicates that there may be dead rows that can
	 * be cleaned up.
	 *
	 * It is OK to check the old snapshot limit before acquiring the cleanup
	 * lock because the worst that can happen is that we are not quite as
	 * aggressive about the cleanup (by however many transaction IDs are
	 * consumed between this point and acquiring the lock).  This allows us
	 * to save significant overhead in the case where the page is found not
	 * to be prunable.
	 *
	 * Even if old_snapshot_threshold is set, we first check whether the page
	 * can be pruned without it.  Both because
	 * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
	 * unnecessarily relying on old_snapshot_threshold avoids causing
	 * conflicts.
	 */
	vistest = GlobalVisTestFor(relation);

	if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
	{
		if (!OldSnapshotThresholdActive())
			return;

		if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
												 relation,
												 &limited_xmin, &limited_ts))
			return;

		if (!TransactionIdPrecedes(prune_xid, limited_xmin))
			return;
	}

	/*
	 * We prune when a previous UPDATE failed to find enough space on the
	 * page for a new tuple version, or when free space falls below the
	 * relation's fill-factor target (but not less than 10%).
	 *
	 * Checking free space here is questionable since we aren't holding any
	 * lock on the buffer; in the worst case we could get a bogus answer.
	 * It's unlikely to be *seriously* wrong, though, since reading either
	 * pd_lower or pd_upper is probably atomic.  Avoiding taking a lock seems
	 * more important than sometimes getting a wrong answer in what is after
	 * all just a heuristic estimate.
	 */
	minfree = RelationGetTargetPageFreeSpace(relation,
											 HEAP_DEFAULT_FILLFACTOR);
	minfree = Max(minfree, BLCKSZ / 10);
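
	/*
	 * For illustration: assuming the default 8kB block size and the default
	 * heap fillfactor of 100, RelationGetTargetPageFreeSpace() returns 0
	 * here, so minfree ends up as BLCKSZ / 10 = 819 bytes.
	 */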

	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
	{
		/* OK, try to get exclusive buffer lock */
		if (!ConditionalLockBufferForCleanup(buffer))
			return;

		/*
		 * Now that we have buffer lock, get accurate information about the
		 * page's free space, and recheck the heuristic about whether to
		 * prune.  (We needn't recheck PageIsPrunable, since no one else
		 * could have pruned while we hold pin.)
		 */
		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
		{
			int			ndeleted,
						nnewlpdead;

			ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin,
									   limited_ts, &nnewlpdead, NULL);

			/*
			 * Report the number of tuples reclaimed to pgstats.  This is
			 * ndeleted minus the number of newly-LP_DEAD-set items.
			 *
			 * We derive the number of dead tuples like this to avoid totally
			 * forgetting about items that were set to LP_DEAD, since they
			 * still need to be cleaned up by VACUUM.  We only want to count
			 * heap-only tuples that just became LP_UNUSED in our report,
			 * which don't need any such cleanup.
			 *
			 * VACUUM doesn't have to compensate in the same way when it
			 * tracks ndeleted, since it will set the same LP_DEAD items to
			 * LP_UNUSED separately.
			 */
			if (ndeleted > nnewlpdead)
				pgstat_update_heap_dead_tuples(relation,
											   ndeleted - nnewlpdead);
		}

		/* And release buffer lock */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

		/*
		 * We avoid reuse of any free space created on the page by unrelated
		 * UPDATEs/INSERTs by opting to not update the FSM at this point.
		 * The free space should be reused by UPDATEs to *this* page.
		 */
	}
}

/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.  Note that we
 * don't update the FSM information for page on caller's behalf.
 *
 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
 * (see heap_prune_satisfies_vacuum and HeapTupleSatisfiesVacuum).
 * old_snap_xmin / old_snap_ts need either to have been set by
 * TransactionIdLimitedForOldSnapshots, or to be InvalidTransactionId/0
 * respectively.
 *
 * Sets *nnewlpdead for caller, indicating the number of items that were
 * newly set LP_DEAD during the prune operation.
 *
 * off_loc is the offset location required by the caller to use in error
 * callback.
 *
 * Returns the number of tuples deleted from the page during this call.
 */
int
heap_page_prune(Relation relation, Buffer buffer,
				GlobalVisState *vistest,
				TransactionId old_snap_xmin,
				TimestampTz old_snap_ts,
				int *nnewlpdead,
				OffsetNumber *off_loc)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid.  Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.rel = relation;
	prstate.vistest = vistest;
	prstate.old_snap_xmin = old_snap_xmin;
	prstate.old_snap_ts = old_snap_ts;
	prstate.old_snap_used = false;
	prstate.latestRemovedXid = InvalidTransactionId;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		if (off_loc)
			*off_loc = offnum;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
	}

	/* Clear the offset information once we have processed the given page. */
	if (off_loc)
		*off_loc = InvalidOffsetNumber;

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation,
		 * and update the page's hint bit about whether it has free line
		 * pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the
		 * lowest XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens
		 * to the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			xl_heap_prune xlrec;
			XLogRecPtr	recptr;

			xlrec.latestRemovedXid = prstate.latestRemovedXid;
			xlrec.nredirected = prstate.nredirected;
			xlrec.ndead = prstate.ndead;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);

			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

			/*
			 * The OffsetNumber arrays are not actually in the buffer, but
			 * we pretend that they are.  When XLogInsert stores the whole
			 * buffer, the offset arrays need not be stored too.
			 */
			if (prstate.nredirected > 0)
				XLogRegisterBufData(0, (char *) prstate.redirected,
									prstate.nredirected *
									sizeof(OffsetNumber) * 2);

			if (prstate.ndead > 0)
				XLogRegisterBufData(0, (char *) prstate.nowdead,
									prstate.ndead * sizeof(OffsetNumber));

			if (prstate.nunused > 0)
				XLogRegisterBufData(0, (char *) prstate.nowunused,
									prstate.nunused * sizeof(OffsetNumber));

			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);

			PageSetLSN(BufferGetPage(buffer), recptr);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty.  This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			MarkBufferDirtyHint(buffer, true);
		}
	}

	END_CRIT_SECTION();

	/* Record number of newly-set-LP_DEAD items for caller */
	*nnewlpdead = prstate.ndead;

	return ndeleted;
}

/*
 * Perform visibility checks for heap pruning.
 *
 * This is more complicated than just using GlobalVisTestIsRemovableXid()
 * because of old_snapshot_threshold.  We only want to increase the
 * threshold that triggers errors for old snapshots when we actually decide
 * to remove a row based on the limited horizon.
 *
 * Due to its cost we also only want to call
 * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not
 * have done so in heap_page_prune_opt() if pd_prune_xid was old enough.
 * But we still want to be able to remove rows that are too new to be
 * removed according to prstate->vistest, but that can be removed based on
 * old_snapshot_threshold.  So we call TransactionIdLimitedForOldSnapshots()
 * on demand in here, if appropriate.
 */
static HTSV_Result
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
{
	HTSV_Result res;
	TransactionId dead_after;

	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);

	if (res != HEAPTUPLE_RECENTLY_DEAD)
		return res;

	/*
	 * If we are already relying on the limited xmin, there is no need to
	 * delay doing so anymore.
	 */
	if (prstate->old_snap_used)
	{
		Assert(TransactionIdIsValid(prstate->old_snap_xmin));

		if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
			res = HEAPTUPLE_DEAD;
		return res;
	}

	/*
	 * First check if GlobalVisTestIsRemovableXid() is sufficient to find
	 * the row dead.  If not, and old_snapshot_threshold is enabled, try to
	 * use the lowered horizon.
	 */
	if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
		res = HEAPTUPLE_DEAD;
	else if (OldSnapshotThresholdActive())
	{
		/* haven't determined the limited horizon yet, so do that on demand */
		if (!TransactionIdIsValid(prstate->old_snap_xmin))
		{
			TransactionId horizon =
			GlobalVisTestNonRemovableHorizon(prstate->vistest);

			TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
												&prstate->old_snap_xmin,
												&prstate->old_snap_ts);
		}

		if (TransactionIdIsValid(prstate->old_snap_xmin) &&
			TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
		{
			/*
			 * About to remove row based on snapshot_too_old.  Need to raise
			 * the threshold so problematic accesses would error.
			 */
			Assert(!prstate->old_snap_used);
			SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
											 prstate->old_snap_xmin);
			prstate->old_snap_used = true;
			res = HEAPTUPLE_DEAD;
		}
	}

	return res;
}

/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the
 * HOT chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD
 * tuple.  This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple
 * is really DEAD, the heap_prune_satisfies_vacuum test is just too coarse
 * to detect it.
 *
 * In general, pruning must never leave behind a DEAD tuple that still has
 * tuple storage.  VACUUM isn't prepared to deal with that case.  That's why
 * VACUUM prunes the same heap page a second time (without dropping its lock
 * in the interim) when it sees a newly DEAD tuple that we initially saw as
 * in-progress.  Retrying pruning like this can only happen when an
 * inserting transaction concurrently aborts.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
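 *
 * For example, if the root item at offset 1 was HOT-updated to offset 2 and
 * then to offset 3, and the tuples at offsets 1 and 2 are DEAD while the
 * tuple at offset 3 is still live, item 2 is marked LP_UNUSED, item 1 is
 * redirected to item 3, and two tuples are counted as deleted.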
 *
 * prstate->vistest is used to distinguish whether tuples are DEAD or
 * RECENTLY_DEAD.
 *
 * We don't actually change the page here, except perhaps for hint-bit
 * updates caused by heap_prune_satisfies_vacuum.  We just add entries to
 * the arrays in prstate showing the changes to be made.  Items to be
 * redirected are added to the redirected[] array (two entries per
 * redirection); items to be set to LP_DEAD state are added to nowdead[];
 * and items to be set to LP_UNUSED state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
{
	int			ndeleted = 0;
	Page		dp = (Page) BufferGetPage(buffer);
	TransactionId priorXmax = InvalidTransactionId;
	ItemId		rootlp;
	HeapTupleHeader htup;
	OffsetNumber latestdead = InvalidOffsetNumber,
				maxoff = PageGetMaxOffsetNumber(dp),
				offnum;
	OffsetNumber chainitems[MaxHeapTuplesPerPage];
	int			nchain = 0,
				i;
	HeapTupleData tup;

	tup.t_tableOid = RelationGetRelid(prstate->rel);

	rootlp = PageGetItemId(dp, rootoffnum);

	/*
	 * If it's a heap-only tuple, then it is not the start of a HOT chain.
	 */
	if (ItemIdIsNormal(rootlp))
	{
		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(rootlp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);

		if (HeapTupleHeaderIsHeapOnly(htup))
		{
			/*
			 * If the tuple is DEAD and doesn't chain to anything else, mark
			 * it unused immediately.  (If it does chain, we can only remove
			 * it as part of pruning its chain.)
			 *
			 * We need this primarily to handle aborted HOT updates, that
			 * is, XMIN_INVALID heap-only tuples.  Those might not be linked
			 * to by any chain, since the parent tuple might be re-updated
			 * before any pruning occurs.  So we have to be able to reap
			 * them separately from chain-pruning.  (Note that
			 * HeapTupleHeaderIsHotUpdated will never return true for an
			 * XMIN_INVALID tuple, so this code will work even when there
			 * were sequential updates within the aborted transaction.)
			 *
			 * Note that we might first arrive at a dead heap-only tuple
			 * either here or while following a chain below.  Whichever path
			 * gets there first will mark the tuple unused.
			 */
			if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
			{
				heap_prune_record_unused(prstate, rootoffnum);
				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
													   &prstate->latestRemovedXid);
				ndeleted++;
			}

			/* Nothing more to do */
			return ndeleted;
		}
	}

	/* Start from the root tuple */
	offnum = rootoffnum;

	/* while not end of the chain */
	for (;;)
	{
		ItemId		lp;
		bool		tupdead,
					recent_dead;

		/* Sanity check (pure paranoia) */
		if (offnum < FirstOffsetNumber)
			break;

		/*
		 * An offset past the end of page's line pointer array is possible
		 * when the array was truncated (original item must have been unused)
		 */
		if (offnum > maxoff)
			break;

		/* If item is already processed, stop --- it must not be same chain */
		if (prstate->marked[offnum])
			break;

		lp = PageGetItemId(dp, offnum);

		/* Unused item obviously isn't part of the chain */
		if (!ItemIdIsUsed(lp))
			break;

		/*
		 * If we are looking at the redirected root line pointer, jump to
		 * the first normal tuple in the chain.  If we find a redirect
		 * somewhere else, stop --- it must not be same chain.
		 */
		if (ItemIdIsRedirected(lp))
		{
			if (nchain > 0)
				break;			/* not at start of chain */
			chainitems[nchain++] = offnum;
			offnum = ItemIdGetRedirect(rootlp);
			continue;
		}

		/*
		 * Likewise, a dead line pointer can't be part of the chain.  (We
		 * already eliminated the case of dead root tuple outside this
		 * function.)
		 */
		if (ItemIdIsDead(lp))
			break;

		Assert(ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(dp, lp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(lp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);

		/*
		 * Check the tuple XMIN against prior XMAX, if any
		 */
		if (TransactionIdIsValid(priorXmax) &&
			!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
			break;

		/*
		 * OK, this tuple is indeed a member of the chain.
		 */
		chainitems[nchain++] = offnum;

		/*
		 * Check tuple's visibility status.
		 */
		tupdead = recent_dead = false;

		switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
		{
			case HEAPTUPLE_DEAD:
				tupdead = true;
				break;

			case HEAPTUPLE_RECENTLY_DEAD:
				recent_dead = true;

				/*
				 * This tuple may soon become DEAD.  Update the hint field
				 * so that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:

				/*
				 * This tuple may soon become DEAD.  Update the hint field
				 * so that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_INSERT_IN_PROGRESS:

				/*
				 * If we wanted to optimize for aborts, we might consider
				 * marking the page prunable when we see INSERT_IN_PROGRESS.
				 * But we don't.  See related decisions about when to mark
				 * the page prunable in heapam.c.
				 */
				break;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}

		/*
		 * Remember the last DEAD tuple seen.  We will advance past
		 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
		 * but we can't advance past anything else.  We have to make sure
		 * that we don't miss any DEAD tuples, since DEAD tuples that still
		 * have tuple storage after pruning will confuse VACUUM.
		 */
		if (tupdead)
		{
			latestdead = offnum;
			HeapTupleHeaderAdvanceLatestRemovedXid(htup,
												   &prstate->latestRemovedXid);
		}
		else if (!recent_dead)
			break;

		/*
		 * If the tuple is not HOT-updated, then we are at the end of this
		 * HOT-update chain.
		 */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;

		/* HOT implies it can't have moved to different partition */
		Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

		/*
		 * Advance to next chain member.
		 */
		Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
			   BufferGetBlockNumber(buffer));
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
	}

	/*
	 * If we found a DEAD tuple in the chain, adjust the HOT chain so that
	 * all the DEAD tuples at the start of the chain are removed and the
	 * root line pointer is appropriately redirected.
	 */
	if (OffsetNumberIsValid(latestdead))
	{
		/*
		 * Mark as unused each intermediate item that we are able to remove
		 * from the chain.
		 *
		 * When the previous item is the last dead tuple seen, we are at the
		 * right candidate for redirection.
		 */
		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
		{
			heap_prune_record_unused(prstate, chainitems[i]);
			ndeleted++;
		}

		/*
		 * If the root entry had been a normal tuple, we are deleting it, so
		 * count it in the result.  But changing a redirect (even to DEAD
		 * state) doesn't count.
		 */
		if (ItemIdIsNormal(rootlp))
			ndeleted++;

		/*
		 * If the DEAD tuple is at the end of the chain, the entire chain is
		 * dead and the root line pointer can be marked dead.  Otherwise
		 * just redirect the root to the correct chain member.
		 */
		if (i >= nchain)
			heap_prune_record_dead(prstate, rootoffnum);
		else
			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
	}
	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
	{
		/*
		 * We found a redirect item that doesn't point to a valid follow-on
		 * item.  This can happen if the loop in heap_page_prune caused us
		 * to visit the dead successor of a redirect item before visiting
		 * the redirect item.  We can clean up by setting the redirect item
		 * to DEAD state.
		 */
		heap_prune_record_dead(prstate, rootoffnum);
	}

	return ndeleted;
}

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
	/*
	 * This should exactly match the PageSetPrunable macro.  We can't store
	 * directly into the page header yet, so we update working state.
	 */
	Assert(TransactionIdIsNormal(xid));
	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
		TransactionIdPrecedes(xid, prstate->new_prune_xid))
		prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum)
{
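	/*
	 * Note: each redirection occupies two consecutive slots in redirected[],
	 * the "from" offset followed by the "to" offset.  heap_page_prune
	 * registers the array in this layout as WAL data, and
	 * heap_page_prune_execute consumes it pairwise.
	 */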
	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
	prstate->redirected[prstate->nredirected * 2] = offnum;
	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
	prstate->nredirected++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
	Assert(!prstate->marked[rdoffnum]);
	prstate->marked[rdoffnum] = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->ndead < MaxHeapTuplesPerPage);
	prstate->nowdead[prstate->ndead] = offnum;
	prstate->ndead++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->nunused < MaxHeapTuplesPerPage);
	prstate->nowunused[prstate->nunused] = offnum;
	prstate->nunused++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}

/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has a full cleanup lock on the
 * buffer.
 */
void
heap_page_prune_execute(Buffer buffer,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = (Page) BufferGetPage(buffer);
	OffsetNumber *offnum;
	HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

	/* Shouldn't be called unless there's something to do */
	Assert(nredirected > 0 || ndead > 0 || nunused > 0);

	/* Update all redirected line pointers */
	offnum = redirected;
	for (int i = 0; i < nredirected; i++)
	{
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);
		ItemId		tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

		/*
		 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
		 * must be the first item from a HOT chain.  If the item has tuple
		 * storage then it can't be a heap-only tuple.  Otherwise we are
		 * just maintaining an existing LP_REDIRECT from an existing HOT
		 * chain that has been pruned at least once before now.
		 */
		if (!ItemIdIsRedirected(fromlp))
		{
			Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* We shouldn't need to redundantly set the redirect */
			Assert(ItemIdGetRedirect(fromlp) != tooff);
		}

		/*
		 * The item that we're about to set as an LP_REDIRECT (the 'from'
		 * item) will point to an existing item (the 'to' item) that is
		 * already a heap-only tuple.  There can be at most one LP_REDIRECT
		 * item per HOT chain.
		 *
		 * We need to keep around an LP_REDIRECT item (after original
		 * non-heap-only root tuple gets pruned away) so that it's always
		 * possible for VACUUM to easily figure out what TID to delete from
		 * indexes when an entire HOT chain becomes dead.  A heap-only tuple
		 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
		 * tuple can.
		 */
		tolp = PageGetItemId(page, tooff);
		Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
		htup = (HeapTupleHeader) PageGetItem(page, tolp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (int i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * An LP_DEAD line pointer must be left behind when the original
		 * item (which is dead to everybody) could still be referenced by a
		 * TID in an index.  This should never be necessary with any
		 * individual heap-only tuple item, though.  (It's not clear how
		 * much of a problem that would be, but there is no reason to allow
		 * it.)
		 */
		if (ItemIdHasStorage(lp))
		{
			Assert(ItemIdIsNormal(lp));
			htup = (HeapTupleHeader) PageGetItem(page, lp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* Whole HOT chain becomes dead */
			Assert(ItemIdIsRedirected(lp));
		}
#endif

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (int i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * Only heap-only tuples can become LP_UNUSED during pruning.  They
		 * don't need to be left in place as LP_DEAD items until VACUUM gets
		 * around to doing index vacuuming.
		 */
		Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(page, lp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit
	 * about whether it has free pointers.
	 */
	PageRepairFragmentation(page);
}

/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * Unused entries are filled with InvalidOffsetNumber (zero).
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer.  Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, InvalidOffsetNumber,
		   MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some
			 * other tuple.  If so, skip it for now; we'll process it when
			 * we find its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
		else
		{
			/* Must be a redirect item.  We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N) because a tuple in the page should be visited
		 * not more than twice, once in the outer loop and once in HOT-chain
		 * chases.
		 */
		for (;;)
		{
			/* Sanity check (pure paranoia) */
			if (nextoffnum < FirstOffsetNumber)
				break;

			/*
			 * An offset past the end of page's line pointer array is
			 * possible when the array was truncated
			 */
			if (nextoffnum > maxoff)
				break;

			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			/* HOT implies it can't have moved to different partition */
			Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
	}
}