/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size,
 * with an upper limit that depends on table size (this limit ensures we don't
 * allocate a huge area uselessly for vacuuming small tables).  If the array
 * threatens to overflow, we suspend the heap scan phase and perform a pass of
 * index cleanup and page compaction, then resume the heap scan with an empty
 * TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/tqual.h"

/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage
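
/*
 * As a rough illustration of the sizing rules described at the head of the
 * file (the numbers here are only an example): an ItemPointerData is 6
 * bytes, so with maintenance_work_mem set to, say, 64MB the dead-tuple
 * array can hold on the order of ten million TIDs before lazy_scan_heap
 * has to pause and run an index-vacuuming cycle.  For small tables the
 * array is instead capped at about relblocks * LAZY_ALLOC_TUPLES entries,
 * and in any case at MaxAllocSize bytes; see lazy_space_alloc().
 */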

typedef struct LVRelStats
{
	/* hasindex = true means two-pass strategy; false means one-pass */
	bool		hasindex;
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	int			num_index_scans;
	bool		scanned_all;	/* have we scanned all pages (this far)? */
} LVRelStats;


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes, bool scan_all);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats);
static int	lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy, bool *scanned_all)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	bool		scan_all;
	TransactionId freezeTableLimit;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
						  onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit, &freezeTableLimit);
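
	/*
	 * freezeTableLimit is an XID cutoff computed by vacuum_set_xid_limits
	 * from the freeze_table_age setting.  If the table's relfrozenxid
	 * already precedes (or equals) that cutoff, scan_all is set and every
	 * page is visited regardless of the visibility map, so that
	 * relfrozenxid can later be advanced; otherwise all-visible pages may
	 * be skipped.
	 */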
	scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
											 freezeTableLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	vacrelstats->num_index_scans = 0;
	vacrelstats->scanned_all = true;	/* will be cleared if we skip a page */

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
		lazy_truncate_heap(onerel, vacrelstats);

	/* Vacuum the Free Space Map */
	FreeSpaceMapVacuum(onerel);

	/*
	 * Update statistics in pg_class.  But only if we didn't skip any pages;
	 * the tuple count only includes tuples from the pages we've visited, and
	 * we haven't frozen tuples in unvisited pages either.  The page count is
	 * accurate in any case, but because we use the reltuples / relpages
	 * ratio in the planner, it's better to not update relpages either if we
	 * can't update reltuples.
	 */
	if (vacrelstats->scanned_all)
		vac_update_relstats(onerel,
							vacrelstats->rel_pages, vacrelstats->rel_tuples,
							vacrelstats->hasindex,
							FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel),
						 onerel->rd_rel->relisshared,
						 vacrelstats->scanned_all,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum_min_duration))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
							vacrelstats->pages_removed, vacrelstats->rel_pages,
							vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
							pg_rusage_show(&ru0))));
	}

	if (scanned_all)
		*scanned_all = vacrelstats->scanned_all;
}
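
/*
 * Overview of the scan machinery below: lazy_scan_heap drives a single
 * forward pass over the heap, accumulating dead-tuple TIDs; whenever the
 * TID array fills up (or at the end of the scan) it calls lazy_vacuum_index
 * for each index and then lazy_vacuum_heap to reclaim the dead line
 * pointers.  lazy_cleanup_index runs once per index at the end, and
 * lazy_truncate_heap may then give back empty pages at the end of the
 * relation.
 */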

/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes, bool scan_all)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				scanned_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = scanned_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;
		Size		freespace;
		bool		all_visible_according_to_vm = false;
		bool		all_visible;

		/*
		 * Skip pages that don't require vacuuming according to the
		 * visibility map.
		 */
		if (!scan_all)
		{
			all_visible_according_to_vm =
				visibilitymap_test(onerel, blkno, &vmbuffer);
			if (all_visible_according_to_vm)
			{
				vacrelstats->scanned_all = false;
				continue;
			}
		}
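
		/*
		 * A page the visibility map reports as all-visible cannot contain
		 * dead tuples, so skipping it loses nothing as far as space
		 * recovery goes.  It may, however, still hold unfrozen tuples;
		 * that is why an anti-wraparound (scan_all) vacuum ignores the map,
		 * and why scanned_all is cleared above whenever a page is skipped.
		 */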

		vacuum_delay_point();

		scanned_pages++;

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;
		}

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* We need buffer cleanup lock so that we can prune HOT chains. */
		LockBufferForCleanup(buf);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * protect against that, release the buffer lock, grab the
			 * relation extension lock momentarily, and re-lock the buffer. If
			 * the page is still uninitialized by then, it must be left over
			 * from a crashed backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBufferForCleanup(buf);
			if (PageIsNew(page))
			{
				ereport(WARNING,
						(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
								relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
			}
			freespace = PageGetHeapFreeSpace(page);
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);

			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			freespace = PageGetHeapFreeSpace(page);

			if (!PageIsAllVisible(page))
			{
				SetBufferCommitInfoNeedsSave(buf);
				PageSetAllVisible(page);
			}

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* Update the visibility map */
			if (!all_visible_according_to_vm)
			{
				visibilitymap_pin(onerel, blkno, &vmbuffer);
				LockBuffer(buf, BUFFER_LOCK_SHARE);
				if (PageIsAllVisible(page))
					visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			}

			ReleaseBuffer(buf);
			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		/*
		 * Prune all HOT-update chains in this page.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
										 false, false);

		/*
		 * Now scan the page to collect vacuumable items and check for tuples
		 * requiring freezing.
		 */
		all_visible = true;
		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused items require no processing, but we count 'em */
			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			/* Redirect items mustn't be touched */
			if (ItemIdIsRedirected(itemid))
			{
				hastup = true;	/* this page won't be truncatable */
				continue;
			}

			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			/*
			 * DEAD item pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting (at
			 * least in the common case where heap_page_prune() just freed up
			 * a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				all_visible = false;
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:

					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked. In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted. So this
					 * cannot be considered an error condition.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it just as if
					 * it were RECENTLY_DEAD. Also, if it's a heap-only
					 * tuple, we choose to keep it, because it'll be a lot
					 * cheaper to get rid of it in the next pruning pass than
					 * to treat it like an indexed tuple.
					 */
					if (HeapTupleIsHotUpdated(&tuple) ||
						HeapTupleIsHeapOnly(&tuple))
						nkeep += 1;
					else
						tupgone = true; /* we can delete the tuple */
					all_visible = false;
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);

					/*
					 * Is the tuple definitely visible to all transactions?
					 *
					 * NB: Like with per-tuple hint bits, we can't set the
					 * PD_ALL_VISIBLE flag if the inserter committed
					 * asynchronously. See SetHintBits for more info. Check
					 * that the HEAP_XMIN_COMMITTED hint bit is set because of
					 * that.
					 */
					if (all_visible)
					{
						TransactionId xmin;

						if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
						{
							all_visible = false;
							break;
						}
						/*
						 * The inserter definitely committed. But is it
						 * old enough that everyone sees it as committed?
						 */
						xmin = HeapTupleHeaderGetXmin(tuple.t_data);
						if (!TransactionIdPrecedes(xmin, OldestXmin))
						{
							all_visible = false;
							break;
						}
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					all_visible = false;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					all_visible = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					all_visible = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it needs
				 * freezing.  Note we already have exclusive buffer lock.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		freespace = PageGetHeapFreeSpace(page);

		/* Update the all-visible flag on the page */
		if (!PageIsAllVisible(page) && all_visible)
		{
			SetBufferCommitInfoNeedsSave(buf);
			PageSetAllVisible(page);
		}
		else if (PageIsAllVisible(page) && !all_visible)
		{
			elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set");
			SetBufferCommitInfoNeedsSave(buf);
			PageClearAllVisible(page);

			/*
			 * Normally, we would drop the lock on the heap page before
			 * updating the visibility map, but since this is a can't-happen
			 * case anyway, don't bother.
			 */
			visibilitymap_clear(onerel, blkno);
		}

		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
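
		/*
		 * Note that the heap page is re-locked (in share mode) and the
		 * PD_ALL_VISIBLE flag re-tested below before the visibility map bit
		 * is set: once the exclusive lock is released, a concurrent
		 * insert or update could clear the flag again, and we must not set
		 * a map bit that disagrees with the page.
		 */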

		/* Update the visibility map */
		if (!all_visible_according_to_vm && all_visible)
		{
			visibilitymap_pin(onerel, blkno, &vmbuffer);
			LockBuffer(buf, BUFFER_LOCK_SHARE);
			if (PageIsAllVisible(page))
				visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}

		ReleaseBuffer(buf);

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.  If not, then we're done with this
		 * page, so remember its free space as-is.  (This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
			RecordPageWithFreeSpace(onerel, blkno, freespace);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/* Release the pin on the visibility map page */
	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, scanned_pages, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vac_strategy);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);

		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(onerel, tblk, freespace);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt = 0;
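
	/*
	 * The page changes and the WAL record describing them must appear
	 * atomic to crash recovery, so the work below runs inside a critical
	 * section: an error between marking line pointers unused and logging
	 * the change would otherwise leave an unrecoverable inconsistency.
	 */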

	START_CRIT_SECTION();

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								false);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	return tupindex;
}

/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	ivinfo.num_heap_tuples = -1;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(indrel,
						stats->num_pages, stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain no tuples.  This is *necessary*, not optional, because other
	 * backends could have added tuples to these pages whilst we were
	 * vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/* force relcache inval so all backends reset their rd_targblock */
	CacheInvalidateRelcache(onerel);

	/*
	 * Note: once we have truncated, we *must* keep the exclusive lock until
	 * commit.  The sinval message won't be sent until commit, and other
	 * backends must see it and reset their rd_targblock values before they
	 * can safely access the table again.
	 */

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;

	/* Strange coding of loop control is needed because blkno is unsigned */
	blkno = vacrelstats->rel_pages;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;

	if (vacrelstats->hasindex)
	{
		maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem. In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}
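
/*
 * Because lazy_scan_heap records dead tuples in physical order (ascending
 * block number, and ascending offset within each block), the dead_tuples
 * array is always sorted by TID without any explicit sort step; that is
 * what lets lazy_tid_reaped below use bsearch.
 */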

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}