src/backend/storage/buffer/bufmgr.c
1 /*-------------------------------------------------------------------------
3 * bufmgr.c
4 * buffer manager interface routines
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 *-------------------------------------------------------------------------
16 * Principal entry points:
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
22 * ReleaseBuffer() -- unpin a buffer
24 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 * The disk write is delayed until buffer replacement or checkpoint.
27 * See also these files:
28 * freelist.c -- chooses victim for buffer replacement
29 * buf_table.c -- manages the buffer lookup table
31 #include "postgres.h"
33 #include <sys/file.h>
34 #include <unistd.h>
36 #include "catalog/catalog.h"
37 #include "miscadmin.h"
38 #include "pg_trace.h"
39 #include "pgstat.h"
40 #include "postmaster/bgwriter.h"
41 #include "storage/buf_internals.h"
42 #include "storage/bufmgr.h"
43 #include "storage/ipc.h"
44 #include "storage/proc.h"
45 #include "storage/smgr.h"
46 #include "utils/rel.h"
47 #include "utils/resowner.h"
50 /* Note: these two macros only work on shared buffers, not local ones! */
51 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
52 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
54 /* Note: this macro only works on local buffers, not shared ones! */
55 #define LocalBufHdrGetBlock(bufHdr) \
56 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
58 /* Bits in SyncOneBuffer's return value */
59 #define BUF_WRITTEN 0x01
60 #define BUF_REUSABLE 0x02
63 /* GUC variables */
64 bool zero_damaged_pages = false;
65 int bgwriter_lru_maxpages = 100;
66 double bgwriter_lru_multiplier = 2.0;
69 * How many buffers PrefetchBuffer callers should try to stay ahead of their
70 * ReadBuffer calls by. This is maintained by the assign hook for
71 * effective_io_concurrency. Zero means "never prefetch".
73 int target_prefetch_pages = 0;
75 /* local state for StartBufferIO and related functions */
76 static volatile BufferDesc *InProgressBuf = NULL;
77 static bool IsForInput;
79 /* local state for LockBufferForCleanup */
80 static volatile BufferDesc *PinCountWaitBuf = NULL;
83 static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
84 ForkNumber forkNum, BlockNumber blockNum,
85 ReadBufferMode mode, BufferAccessStrategy strategy,
86 bool *hit);
87 static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
88 static void PinBuffer_Locked(volatile BufferDesc *buf);
89 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
90 static void BufferSync(int flags);
91 static int SyncOneBuffer(int buf_id, bool skip_recently_used);
92 static void WaitIO(volatile BufferDesc *buf);
93 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
94 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
95 int set_flag_bits);
96 static void buffer_write_error_callback(void *arg);
97 static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
98 BlockNumber blockNum,
99 BufferAccessStrategy strategy,
100 bool *foundPtr);
101 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
102 static void AtProcExit_Buffers(int code, Datum arg);
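
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): the typical
 * caller-side pattern for the principal entry points listed in the header
 * comment above.  The function name and the BUFMGR_USAGE_EXAMPLE guard are
 * invented for illustration; the guard keeps this sketch out of any real
 * build.  A real caller would also WAL-log its change when marking the
 * buffer dirty.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_touch_first_block(Relation rel)
{
	Buffer		buf;
	Page		page;

	/* find or create a buffer holding block 0, and pin it */
	buf = ReadBuffer(rel, 0);

	/* lock the buffer contents before modifying them */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	/* ... modify "page" here (and WAL-log the change) ... */
	(void) page;

	/* tell bufmgr the contents changed; the disk write happens later */
	MarkBufferDirty(buf);

	/* release the content lock and the pin */
	UnlockReleaseBuffer(buf);
}
#endif   /* BUFMGR_USAGE_EXAMPLE */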
106 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
108 * This is named by analogy to ReadBuffer but doesn't actually allocate a
109 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
110 * block will not be delayed by the I/O. Prefetching is optional.
111 * No-op if prefetching isn't compiled in.
113 void
114 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
116 #ifdef USE_PREFETCH
117 Assert(RelationIsValid(reln));
118 Assert(BlockNumberIsValid(blockNum));
120 /* Open it at the smgr level if not already done */
121 RelationOpenSmgr(reln);
123 if (reln->rd_istemp)
125 /* see comments in ReadBufferExtended */
126 if (RELATION_IS_OTHER_TEMP(reln))
127 ereport(ERROR,
128 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
129 errmsg("cannot access temporary tables of other sessions")));
131 /* pass it off to localbuf.c */
132 LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
134 else
136 BufferTag newTag; /* identity of requested block */
137 uint32 newHash; /* hash value for newTag */
138 LWLockId newPartitionLock; /* buffer partition lock for it */
139 int buf_id;
141 /* create a tag so we can lookup the buffer */
142 INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode, forkNum, blockNum);
144 /* determine its hash code and partition lock ID */
145 newHash = BufTableHashCode(&newTag);
146 newPartitionLock = BufMappingPartitionLock(newHash);
148 /* see if the block is in the buffer pool already */
149 LWLockAcquire(newPartitionLock, LW_SHARED);
150 buf_id = BufTableLookup(&newTag, newHash);
151 LWLockRelease(newPartitionLock);
153 /* If not in buffers, initiate prefetch */
154 if (buf_id < 0)
155 smgrprefetch(reln->rd_smgr, forkNum, blockNum);
158 * If the block *is* in buffers, we do nothing. This is not really
159 * ideal: the block might be just about to be evicted, which would be
160 * stupid since we know we are going to need it soon. But the only
161 * easy answer is to bump the usage_count, which does not seem like a
162 * great solution: when the caller does ultimately touch the block,
163 * usage_count would get bumped again, resulting in too much
164 * favoritism for blocks that are involved in a prefetch sequence. A
165 * real fix would involve some additional per-buffer state, and it's
166 * not clear that there's enough of a problem to justify that.
169 #endif /* USE_PREFETCH */
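
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): how a scan
 * might use PrefetchBuffer to stay a few blocks ahead of its ReadBuffer
 * calls, as described above.  The function name, the fixed-lookahead scheme,
 * and the BUFMGR_USAGE_EXAMPLE guard are invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_scan_with_prefetch(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		BlockNumber ahead = blkno + target_prefetch_pages;

		/* hint about a block we'll want soon (no-op if prefetch disabled) */
		if (target_prefetch_pages > 0 && ahead < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, ahead);

		/* read (and, for this sketch, immediately release) the current block */
		ReleaseBuffer(ReadBuffer(rel, blkno));
	}
}
#endif   /* BUFMGR_USAGE_EXAMPLE */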
174 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
175 * fork with RBM_NORMAL mode and default strategy.
177 Buffer
178 ReadBuffer(Relation reln, BlockNumber blockNum)
180 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
184 * ReadBufferExtended -- returns a buffer containing the requested
185 * block of the requested relation. If the blknum
186 * requested is P_NEW, extend the relation file and
187 * allocate a new block. (Caller is responsible for
188 * ensuring that only one backend tries to extend a
189 * relation at the same time!)
191 * Returns: the buffer number for the buffer containing
192 * the block read. The returned buffer has been pinned.
193 * Does not return on error --- elog's instead.
195 * Assume when this function is called, that reln has been opened already.
197 * In RBM_NORMAL mode, the page is read from disk, and the page header is
198 * validated. An error is thrown if the page header is not valid.
200 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
201 * valid, the page is zeroed instead of throwing an error. This is intended
202 * for non-critical data, where the caller is prepared to repair errors.
204 * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
205 * with zeros instead of reading it from disk. Useful when the caller is
206 * going to fill the page from scratch, since this saves I/O and avoids
207 * unnecessary failure if the page-on-disk has corrupt page headers.
208 * Caution: do not use this mode to read a page that is beyond the relation's
209 * current physical EOF; that is likely to cause problems in md.c when
210 * the page is modified and written out. P_NEW is OK, though.
212 * If strategy is not NULL, a nondefault buffer access strategy is used.
213 * See buffer/README for details.
215 Buffer
216 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
217 ReadBufferMode mode, BufferAccessStrategy strategy)
219 bool hit;
220 Buffer buf;
222 /* Open it at the smgr level if not already done */
223 RelationOpenSmgr(reln);
226 * Reject attempts to read non-local temporary relations; we would be
227 * likely to get wrong data since we have no visibility into the owning
228 * session's local buffers.
230 if (RELATION_IS_OTHER_TEMP(reln))
231 ereport(ERROR,
232 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
233 errmsg("cannot access temporary tables of other sessions")));
236 * Read the buffer, and update pgstat counters to reflect a cache hit or
237 * miss.
239 pgstat_count_buffer_read(reln);
240 buf = ReadBuffer_common(reln->rd_smgr, reln->rd_istemp, forkNum, blockNum,
241 mode, strategy, &hit);
242 if (hit)
243 pgstat_count_buffer_hit(reln);
244 return buf;
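
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): choosing a
 * ReadBufferMode as described in the comment above.  A caller that will
 * rewrite a page from scratch can pass RBM_ZERO to skip the disk read and
 * header validation; non-critical readers can use RBM_ZERO_ON_ERROR instead
 * of RBM_NORMAL to tolerate a damaged page header.  The function name and
 * guard are invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static Buffer
example_read_for_rewrite(Relation rel, BlockNumber blkno)
{
	/*
	 * The page will be overwritten entirely, so don't bother reading what is
	 * currently on disk.  Note the caution above about blocks beyond the
	 * relation's current physical EOF.
	 */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
}
#endif   /* BUFMGR_USAGE_EXAMPLE */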
249 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
250 * a relcache entry for the relation.
252 * NB: caller is assumed to know what it's doing if isTemp is true.
254 Buffer
255 ReadBufferWithoutRelcache(RelFileNode rnode, bool isTemp,
256 ForkNumber forkNum, BlockNumber blockNum,
257 ReadBufferMode mode, BufferAccessStrategy strategy)
259 bool hit;
261 SMgrRelation smgr = smgropen(rnode);
263 return ReadBuffer_common(smgr, isTemp, forkNum, blockNum, mode, strategy,
264 &hit);
269 * ReadBuffer_common -- common logic for all ReadBuffer variants
271 * *hit is set to true if the request was satisfied from shared buffer cache.
273 static Buffer
274 ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
275 BlockNumber blockNum, ReadBufferMode mode,
276 BufferAccessStrategy strategy, bool *hit)
278 volatile BufferDesc *bufHdr;
279 Block bufBlock;
280 bool found;
281 bool isExtend;
283 *hit = false;
285 /* Make sure we will have room to remember the buffer pin */
286 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
288 isExtend = (blockNum == P_NEW);
290 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
291 smgr->smgr_rnode.spcNode,
292 smgr->smgr_rnode.dbNode,
293 smgr->smgr_rnode.relNode,
294 isLocalBuf,
295 isExtend);
297 /* Substitute proper block number if caller asked for P_NEW */
298 if (isExtend)
299 blockNum = smgrnblocks(smgr, forkNum);
301 if (isLocalBuf)
303 ReadLocalBufferCount++;
304 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
305 if (found)
306 LocalBufferHitCount++;
308 else
310 ReadBufferCount++;
313 * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
314 * not currently in memory.
316 bufHdr = BufferAlloc(smgr, forkNum, blockNum, strategy, &found);
317 if (found)
318 BufferHitCount++;
321 /* At this point we do NOT hold any locks. */
323 /* if it was already in the buffer pool, we're done */
324 if (found)
326 if (!isExtend)
328 /* Just need to update stats before we exit */
329 *hit = true;
331 if (VacuumCostActive)
332 VacuumCostBalance += VacuumCostPageHit;
334 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
335 smgr->smgr_rnode.spcNode,
336 smgr->smgr_rnode.dbNode,
337 smgr->smgr_rnode.relNode,
338 isLocalBuf,
339 isExtend,
340 found);
342 return BufferDescriptorGetBuffer(bufHdr);
346 * We get here only in the corner case where we are trying to extend
347 * the relation but we found a pre-existing buffer marked BM_VALID.
348 * This can happen because mdread doesn't complain about reads beyond
349 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
350 * read a block beyond EOF could have left a "valid" zero-filled
351 * buffer. Unfortunately, we have also seen this case occurring
352 * because of buggy Linux kernels that sometimes return an
353 * lseek(SEEK_END) result that doesn't account for a recent write. In
354 * that situation, the pre-existing buffer would contain valid data
355 * that we don't want to overwrite. Since the legitimate case should
356 * always have left a zero-filled buffer, complain if not PageIsNew.
358 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
359 if (!PageIsNew((Page) bufBlock))
360 ereport(ERROR,
361 (errmsg("unexpected data beyond EOF in block %u of relation %s",
362 blockNum, relpath(smgr->smgr_rnode, forkNum)),
363 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
366 * We *must* do smgrextend before succeeding, else the page will not
367 * be reserved by the kernel, and the next P_NEW call will decide to
368 * return the same page. Clear the BM_VALID bit, do the StartBufferIO
369 * call that BufferAlloc didn't, and proceed.
371 if (isLocalBuf)
373 /* Only need to adjust flags */
374 Assert(bufHdr->flags & BM_VALID);
375 bufHdr->flags &= ~BM_VALID;
377 else
380 * Loop to handle the very small possibility that someone re-sets
381 * BM_VALID between our clearing it and StartBufferIO inspecting
382 * it.
386 LockBufHdr(bufHdr);
387 Assert(bufHdr->flags & BM_VALID);
388 bufHdr->flags &= ~BM_VALID;
389 UnlockBufHdr(bufHdr);
390 } while (!StartBufferIO(bufHdr, true));
395 * if we have gotten to this point, we have allocated a buffer for the
396 * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
397 * if it's a shared buffer.
399 * Note: if smgrextend fails, we will end up with a buffer that is
400 * allocated but not marked BM_VALID. P_NEW will still select the same
401 * block number (because the relation didn't get any longer on disk) and
402 * so future attempts to extend the relation will find the same buffer (if
403 * it's not been recycled) but come right back here to try smgrextend
404 * again.
406 Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
408 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
410 if (isExtend)
412 /* new buffers are zero-filled */
413 MemSet((char *) bufBlock, 0, BLCKSZ);
414 smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, isLocalBuf);
416 else
419 * Read in the page, unless the caller intends to overwrite it and
420 * just wants us to allocate a buffer.
422 if (mode == RBM_ZERO)
423 MemSet((char *) bufBlock, 0, BLCKSZ);
424 else
426 smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
428 /* check for garbage data */
429 if (!PageHeaderIsValid((PageHeader) bufBlock))
431 if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
433 ereport(WARNING,
434 (errcode(ERRCODE_DATA_CORRUPTED),
435 errmsg("invalid page header in block %u of relation %s; zeroing out page",
436 blockNum,
437 relpath(smgr->smgr_rnode, forkNum))));
438 MemSet((char *) bufBlock, 0, BLCKSZ);
440 else
441 ereport(ERROR,
442 (errcode(ERRCODE_DATA_CORRUPTED),
443 errmsg("invalid page header in block %u of relation %s",
444 blockNum,
445 relpath(smgr->smgr_rnode, forkNum))));
450 if (isLocalBuf)
452 /* Only need to adjust flags */
453 bufHdr->flags |= BM_VALID;
455 else
457 /* Set BM_VALID, terminate IO, and wake up any waiters */
458 TerminateBufferIO(bufHdr, false, BM_VALID);
461 if (VacuumCostActive)
462 VacuumCostBalance += VacuumCostPageMiss;
464 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
465 smgr->smgr_rnode.spcNode,
466 smgr->smgr_rnode.dbNode,
467 smgr->smgr_rnode.relNode,
468 isLocalBuf,
469 isExtend,
470 found);
472 return BufferDescriptorGetBuffer(bufHdr);
476 * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
477 * buffer. If no buffer exists already, selects a replacement
478 * victim and evicts the old page, but does NOT read in new page.
480 * "strategy" can be a buffer replacement strategy object, or NULL for
481 * the default strategy. The selected buffer's usage_count is advanced when
482 * using the default strategy, but otherwise possibly not (see PinBuffer).
484 * The returned buffer is pinned and is already marked as holding the
485 * desired page. If it already did have the desired page, *foundPtr is
486 * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
487 * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
489 * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
490 * we keep it for simplicity in ReadBuffer.
492 * No locks are held either at entry or exit.
494 static volatile BufferDesc *
495 BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
496 BlockNumber blockNum,
497 BufferAccessStrategy strategy,
498 bool *foundPtr)
500 BufferTag newTag; /* identity of requested block */
501 uint32 newHash; /* hash value for newTag */
502 LWLockId newPartitionLock; /* buffer partition lock for it */
503 BufferTag oldTag; /* previous identity of selected buffer */
504 uint32 oldHash; /* hash value for oldTag */
505 LWLockId oldPartitionLock; /* buffer partition lock for it */
506 BufFlags oldFlags;
507 int buf_id;
508 volatile BufferDesc *buf;
509 bool valid;
511 /* create a tag so we can lookup the buffer */
512 INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
514 /* determine its hash code and partition lock ID */
515 newHash = BufTableHashCode(&newTag);
516 newPartitionLock = BufMappingPartitionLock(newHash);
518 /* see if the block is in the buffer pool already */
519 LWLockAcquire(newPartitionLock, LW_SHARED);
520 buf_id = BufTableLookup(&newTag, newHash);
521 if (buf_id >= 0)
524 * Found it. Now, pin the buffer so no one can steal it from the
525 * buffer pool, and check to see if the correct data has been loaded
526 * into the buffer.
528 buf = &BufferDescriptors[buf_id];
530 valid = PinBuffer(buf, strategy);
532 /* Can release the mapping lock as soon as we've pinned it */
533 LWLockRelease(newPartitionLock);
535 *foundPtr = TRUE;
537 if (!valid)
540 * We can only get here if (a) someone else is still reading in
541 * the page, or (b) a previous read attempt failed. We have to
542 * wait for any active read attempt to finish, and then set up our
543 * own read attempt if the page is still not BM_VALID.
544 * StartBufferIO does it all.
546 if (StartBufferIO(buf, true))
549 * If we get here, previous attempts to read the buffer must
550 * have failed ... but we shall bravely try again.
552 *foundPtr = FALSE;
556 return buf;
560 * Didn't find it in the buffer pool. We'll have to initialize a new
561 * buffer. Remember to unlock the mapping lock while doing the work.
563 LWLockRelease(newPartitionLock);
565 /* Loop here in case we have to try another victim buffer */
566 for (;;)
568 bool lock_held;
571 * Select a victim buffer. The buffer is returned with its header
572 * spinlock still held! Also (in most cases) the BufFreelistLock is
573 * still held, since it would be bad to hold the spinlock while
574 * possibly waking up other processes.
576 buf = StrategyGetBuffer(strategy, &lock_held);
578 Assert(buf->refcount == 0);
580 /* Must copy buffer flags while we still hold the spinlock */
581 oldFlags = buf->flags;
583 /* Pin the buffer and then release the buffer spinlock */
584 PinBuffer_Locked(buf);
586 /* Now it's safe to release the freelist lock */
587 if (lock_held)
588 LWLockRelease(BufFreelistLock);
591 * If the buffer was dirty, try to write it out. There is a race
592 * condition here, in that someone might dirty it after we released it
593 * above, or even while we are writing it out (since our share-lock
594 * won't prevent hint-bit updates). We will recheck the dirty bit
595 * after re-locking the buffer header.
597 if (oldFlags & BM_DIRTY)
600 * We need a share-lock on the buffer contents to write it out
601 * (else we might write invalid data, eg because someone else is
602 * compacting the page contents while we write). We must use a
603 * conditional lock acquisition here to avoid deadlock. Even
604 * though the buffer was not pinned (and therefore surely not
605 * locked) when StrategyGetBuffer returned it, someone else could
606 * have pinned and exclusive-locked it by the time we get here. If
607 * we try to get the lock unconditionally, we'd block waiting for
608 * them; if they later block waiting for us, deadlock ensues.
609 * (This has been observed to happen when two backends are both
610 * trying to split btree index pages, and the second one just
611 * happens to be trying to split the page the first one got from
612 * StrategyGetBuffer.)
614 if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
617 * If using a nondefault strategy, and writing the buffer
618 * would require a WAL flush, let the strategy decide whether
619 * to go ahead and write/reuse the buffer or to choose another
620 * victim. We need lock to inspect the page LSN, so this
621 * can't be done inside StrategyGetBuffer.
623 if (strategy != NULL &&
624 XLogNeedsFlush(BufferGetLSN(buf)) &&
625 StrategyRejectBuffer(strategy, buf))
627 /* Drop lock/pin and loop around for another buffer */
628 LWLockRelease(buf->content_lock);
629 UnpinBuffer(buf, true);
630 continue;
633 /* OK, do the I/O */
634 TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
635 smgr->smgr_rnode.spcNode,
636 smgr->smgr_rnode.dbNode,
637 smgr->smgr_rnode.relNode);
639 FlushBuffer(buf, NULL);
640 LWLockRelease(buf->content_lock);
642 TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
643 smgr->smgr_rnode.spcNode,
644 smgr->smgr_rnode.dbNode,
645 smgr->smgr_rnode.relNode);
647 else
650 * Someone else has locked the buffer, so give it up and loop
651 * back to get another one.
653 UnpinBuffer(buf, true);
654 continue;
659 * To change the association of a valid buffer, we'll need to have
660 * exclusive lock on both the old and new mapping partitions.
662 if (oldFlags & BM_TAG_VALID)
665 * Need to compute the old tag's hashcode and partition lock ID.
666 * XXX is it worth storing the hashcode in BufferDesc so we need
667 * not recompute it here? Probably not.
669 oldTag = buf->tag;
670 oldHash = BufTableHashCode(&oldTag);
671 oldPartitionLock = BufMappingPartitionLock(oldHash);
674 * Must lock the lower-numbered partition first to avoid
675 * deadlocks.
677 if (oldPartitionLock < newPartitionLock)
679 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
680 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
682 else if (oldPartitionLock > newPartitionLock)
684 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
685 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
687 else
689 /* only one partition, only one lock */
690 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
693 else
695 /* if it wasn't valid, we need only the new partition */
696 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
697 /* these just keep the compiler quiet about uninit variables */
698 oldHash = 0;
699 oldPartitionLock = 0;
703 * Try to make a hashtable entry for the buffer under its new tag.
704 * This could fail because while we were writing someone else
705 * allocated another buffer for the same block we want to read in.
706 * Note that we have not yet removed the hashtable entry for the old
707 * tag.
709 buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
711 if (buf_id >= 0)
714 * Got a collision. Someone has already done what we were about to
715 * do. We'll just handle this as if it were found in the buffer
716 * pool in the first place. First, give up the buffer we were
717 * planning to use.
719 UnpinBuffer(buf, true);
721 /* Can give up that buffer's mapping partition lock now */
722 if ((oldFlags & BM_TAG_VALID) &&
723 oldPartitionLock != newPartitionLock)
724 LWLockRelease(oldPartitionLock);
726 /* remaining code should match code at top of routine */
728 buf = &BufferDescriptors[buf_id];
730 valid = PinBuffer(buf, strategy);
732 /* Can release the mapping lock as soon as we've pinned it */
733 LWLockRelease(newPartitionLock);
735 *foundPtr = TRUE;
737 if (!valid)
740 * We can only get here if (a) someone else is still reading
741 * in the page, or (b) a previous read attempt failed. We
742 * have to wait for any active read attempt to finish, and
743 * then set up our own read attempt if the page is still not
744 * BM_VALID. StartBufferIO does it all.
746 if (StartBufferIO(buf, true))
749 * If we get here, previous attempts to read the buffer
750 * must have failed ... but we shall bravely try again.
752 *foundPtr = FALSE;
756 return buf;
760 * Need to lock the buffer header too in order to change its tag.
762 LockBufHdr(buf);
765 * Somebody could have pinned or re-dirtied the buffer while we were
766 * doing the I/O and making the new hashtable entry. If so, we can't
767 * recycle this buffer; we must undo everything we've done and start
768 * over with a new victim buffer.
770 oldFlags = buf->flags;
771 if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
772 break;
774 UnlockBufHdr(buf);
775 BufTableDelete(&newTag, newHash);
776 if ((oldFlags & BM_TAG_VALID) &&
777 oldPartitionLock != newPartitionLock)
778 LWLockRelease(oldPartitionLock);
779 LWLockRelease(newPartitionLock);
780 UnpinBuffer(buf, true);
784 * Okay, it's finally safe to rename the buffer.
786 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
787 * paranoia. We also reset the usage_count since any recency of use of
788 * the old content is no longer relevant. (The usage_count starts out at
789 * 1 so that the buffer can survive one clock-sweep pass.)
791 buf->tag = newTag;
792 buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
793 buf->flags |= BM_TAG_VALID;
794 buf->usage_count = 1;
796 UnlockBufHdr(buf);
798 if (oldFlags & BM_TAG_VALID)
800 BufTableDelete(&oldTag, oldHash);
801 if (oldPartitionLock != newPartitionLock)
802 LWLockRelease(oldPartitionLock);
805 LWLockRelease(newPartitionLock);
808 * Buffer contents are currently invalid. Try to get the io_in_progress
809 * lock. If StartBufferIO returns false, then someone else managed to
810 * read it before we did, so there's nothing left for BufferAlloc() to do.
812 if (StartBufferIO(buf, true))
813 *foundPtr = FALSE;
814 else
815 *foundPtr = TRUE;
817 return buf;
821 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
822 * freelist.
824 * The buffer header spinlock must be held at entry. We drop it before
825 * returning. (This is sane because the caller must have locked the
826 * buffer in order to be sure it should be dropped.)
828 * This is used only in contexts such as dropping a relation. We assume
829 * that no other backend could possibly be interested in using the page,
830 * so the only reason the buffer might be pinned is if someone else is
831 * trying to write it out. We have to let them finish before we can
832 * reclaim the buffer.
834 * The buffer could get reclaimed by someone else while we are waiting
835 * to acquire the necessary locks; if so, don't mess it up.
837 static void
838 InvalidateBuffer(volatile BufferDesc *buf)
840 BufferTag oldTag;
841 uint32 oldHash; /* hash value for oldTag */
842 LWLockId oldPartitionLock; /* buffer partition lock for it */
843 BufFlags oldFlags;
845 /* Save the original buffer tag before dropping the spinlock */
846 oldTag = buf->tag;
848 UnlockBufHdr(buf);
851 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
852 * worth storing the hashcode in BufferDesc so we need not recompute it
853 * here? Probably not.
855 oldHash = BufTableHashCode(&oldTag);
856 oldPartitionLock = BufMappingPartitionLock(oldHash);
858 retry:
861 * Acquire exclusive mapping lock in preparation for changing the buffer's
862 * association.
864 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
866 /* Re-lock the buffer header */
867 LockBufHdr(buf);
869 /* If it's changed while we were waiting for lock, do nothing */
870 if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
872 UnlockBufHdr(buf);
873 LWLockRelease(oldPartitionLock);
874 return;
878 * We assume the only reason for it to be pinned is that someone else is
879 * flushing the page out. Wait for them to finish. (This could be an
880 * infinite loop if the refcount is messed up... it would be nice to time
881 * out after awhile, but there seems no way to be sure how many loops may
882 * be needed. Note that if the other guy has pinned the buffer but not
883 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
884 * be busy-looping here.)
886 if (buf->refcount != 0)
888 UnlockBufHdr(buf);
889 LWLockRelease(oldPartitionLock);
890 /* safety check: should definitely not be our *own* pin */
891 if (PrivateRefCount[buf->buf_id] != 0)
892 elog(ERROR, "buffer is pinned in InvalidateBuffer");
893 WaitIO(buf);
894 goto retry;
898 * Clear out the buffer's tag and flags. We must do this to ensure that
899 * linear scans of the buffer array don't think the buffer is valid.
901 oldFlags = buf->flags;
902 CLEAR_BUFFERTAG(buf->tag);
903 buf->flags = 0;
904 buf->usage_count = 0;
906 UnlockBufHdr(buf);
909 * Remove the buffer from the lookup hashtable, if it was in there.
911 if (oldFlags & BM_TAG_VALID)
912 BufTableDelete(&oldTag, oldHash);
915 * Done with mapping lock.
917 LWLockRelease(oldPartitionLock);
920 * Insert the buffer at the head of the list of free buffers.
922 StrategyFreeBuffer(buf);
926 * MarkBufferDirty
928 * Marks buffer contents as dirty (actual write happens later).
930 * Buffer must be pinned and exclusive-locked. (If caller does not hold
931 * exclusive lock, then somebody could be in process of writing the buffer,
932 * leading to risk of bad data written to disk.)
934 void
935 MarkBufferDirty(Buffer buffer)
937 volatile BufferDesc *bufHdr;
939 if (!BufferIsValid(buffer))
940 elog(ERROR, "bad buffer id: %d", buffer);
942 if (BufferIsLocal(buffer))
944 MarkLocalBufferDirty(buffer);
945 return;
948 bufHdr = &BufferDescriptors[buffer - 1];
950 Assert(PrivateRefCount[buffer - 1] > 0);
951 /* unfortunately we can't check if the lock is held exclusively */
952 Assert(LWLockHeldByMe(bufHdr->content_lock));
954 LockBufHdr(bufHdr);
956 Assert(bufHdr->refcount > 0);
959 * If the buffer was not dirty already, do vacuum cost accounting.
961 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
962 VacuumCostBalance += VacuumCostPageDirty;
964 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
966 UnlockBufHdr(bufHdr);
970 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
972 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
973 * compared to calling the two routines separately. Now it's mainly just
974 * a convenience function. However, if the passed buffer is valid and
975 * already contains the desired block, we just return it as-is; and that
976 * does save considerable work compared to a full release and reacquire.
978 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
979 * buffer actually needs to be released. This case is the same as ReadBuffer,
980 * but can save some tests in the caller.
982 Buffer
983 ReleaseAndReadBuffer(Buffer buffer,
984 Relation relation,
985 BlockNumber blockNum)
987 ForkNumber forkNum = MAIN_FORKNUM;
988 volatile BufferDesc *bufHdr;
990 if (BufferIsValid(buffer))
992 if (BufferIsLocal(buffer))
994 Assert(LocalRefCount[-buffer - 1] > 0);
995 bufHdr = &LocalBufferDescriptors[-buffer - 1];
996 if (bufHdr->tag.blockNum == blockNum &&
997 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
998 bufHdr->tag.forkNum == forkNum)
999 return buffer;
1000 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1001 LocalRefCount[-buffer - 1]--;
1003 else
1005 Assert(PrivateRefCount[buffer - 1] > 0);
1006 bufHdr = &BufferDescriptors[buffer - 1];
1007 /* we have pin, so it's ok to examine tag without spinlock */
1008 if (bufHdr->tag.blockNum == blockNum &&
1009 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1010 bufHdr->tag.forkNum == forkNum)
1011 return buffer;
1012 UnpinBuffer(bufHdr, true);
1016 return ReadBuffer(relation, blockNum);
1020 * PinBuffer -- make buffer unavailable for replacement.
1022 * For the default access strategy, the buffer's usage_count is incremented
1023 * when we first pin it; for other strategies we just make sure the usage_count
1024 * isn't zero. (The idea of the latter is that we don't want synchronized
1025 * heap scans to inflate the count, but we need it to not be zero to discourage
1026 * other backends from stealing buffers from our ring. As long as we cycle
1027 * through the ring faster than the global clock-sweep cycles, buffers in
1028 * our ring won't be chosen as victims for replacement by other backends.)
1030 * This should be applied only to shared buffers, never local ones.
1032 * Note that ResourceOwnerEnlargeBuffers must have been done already.
1034 * Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
1035 * some callers to avoid an extra spinlock cycle.
1037 static bool
1038 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
1040 int b = buf->buf_id;
1041 bool result;
1043 if (PrivateRefCount[b] == 0)
1045 LockBufHdr(buf);
1046 buf->refcount++;
1047 if (strategy == NULL)
1049 if (buf->usage_count < BM_MAX_USAGE_COUNT)
1050 buf->usage_count++;
1052 else
1054 if (buf->usage_count == 0)
1055 buf->usage_count = 1;
1057 result = (buf->flags & BM_VALID) != 0;
1058 UnlockBufHdr(buf);
1060 else
1062 /* If we previously pinned the buffer, it must surely be valid */
1063 result = true;
1065 PrivateRefCount[b]++;
1066 Assert(PrivateRefCount[b] > 0);
1067 ResourceOwnerRememberBuffer(CurrentResourceOwner,
1068 BufferDescriptorGetBuffer(buf));
1069 return result;
1073 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1074 * The spinlock is released before return.
1076 * Currently, no callers of this function want to modify the buffer's
1077 * usage_count at all, so there's no need for a strategy parameter.
1078 * Also we don't bother with a BM_VALID test (the caller could check that for
1079 * itself).
1081 * Note: use of this routine is frequently mandatory, not just an optimization
1082 * to save a spin lock/unlock cycle, because we need to pin a buffer before
1083 * its state can change under us.
1085 static void
1086 PinBuffer_Locked(volatile BufferDesc *buf)
1088 int b = buf->buf_id;
1090 if (PrivateRefCount[b] == 0)
1091 buf->refcount++;
1092 UnlockBufHdr(buf);
1093 PrivateRefCount[b]++;
1094 Assert(PrivateRefCount[b] > 0);
1095 ResourceOwnerRememberBuffer(CurrentResourceOwner,
1096 BufferDescriptorGetBuffer(buf));
1100 * UnpinBuffer -- make buffer available for replacement.
1102 * This should be applied only to shared buffers, never local ones.
1104 * Most but not all callers want CurrentResourceOwner to be adjusted.
1105 * Those that don't should pass fixOwner = FALSE.
1107 static void
1108 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
1110 int b = buf->buf_id;
1112 if (fixOwner)
1113 ResourceOwnerForgetBuffer(CurrentResourceOwner,
1114 BufferDescriptorGetBuffer(buf));
1116 Assert(PrivateRefCount[b] > 0);
1117 PrivateRefCount[b]--;
1118 if (PrivateRefCount[b] == 0)
1120 /* I'd better not still hold any locks on the buffer */
1121 Assert(!LWLockHeldByMe(buf->content_lock));
1122 Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
1124 LockBufHdr(buf);
1126 /* Decrement the shared reference count */
1127 Assert(buf->refcount > 0);
1128 buf->refcount--;
1130 /* Support LockBufferForCleanup() */
1131 if ((buf->flags & BM_PIN_COUNT_WAITER) &&
1132 buf->refcount == 1)
1134 /* we just released the last pin other than the waiter's */
1135 int wait_backend_pid = buf->wait_backend_pid;
1137 buf->flags &= ~BM_PIN_COUNT_WAITER;
1138 UnlockBufHdr(buf);
1139 ProcSendSignal(wait_backend_pid);
1141 else
1142 UnlockBufHdr(buf);
1147 * BufferSync -- Write out all dirty buffers in the pool.
1149 * This is called at checkpoint time to write out all dirty shared buffers.
1150 * The checkpoint request flags should be passed in; currently the only one
1151 * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
1153 static void
1154 BufferSync(int flags)
1156 int buf_id;
1157 int num_to_scan;
1158 int num_to_write;
1159 int num_written;
1161 /* Make sure we can handle the pin inside SyncOneBuffer */
1162 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1165 * Loop over all buffers, and mark the ones that need to be written with
1166 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
1167 * can estimate how much work needs to be done.
1169 * This allows us to write only those pages that were dirty when the
1170 * checkpoint began, and not those that get dirtied while it proceeds.
1171 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1172 * later in this function, or by normal backends or the bgwriter cleaning
1173 * scan, the flag is cleared. Any buffer dirtied after this point won't
1174 * have the flag set.
1176 * Note that if we fail to write some buffer, we may leave buffers with
1177 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1178 * certainly need to be written for the next checkpoint attempt, too.
1180 num_to_write = 0;
1181 for (buf_id = 0; buf_id < NBuffers; buf_id++)
1183 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1186 * Header spinlock is enough to examine BM_DIRTY, see comment in
1187 * SyncOneBuffer.
1189 LockBufHdr(bufHdr);
1191 if (bufHdr->flags & BM_DIRTY)
1193 bufHdr->flags |= BM_CHECKPOINT_NEEDED;
1194 num_to_write++;
1197 UnlockBufHdr(bufHdr);
1200 if (num_to_write == 0)
1201 return; /* nothing to do */
1203 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
1206 * Loop over all buffers again, and write the ones (still) marked with
1207 * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
1208 * since we might as well dump soon-to-be-recycled buffers first.
1210 * Note that we don't read the buffer alloc count here --- that should be
1211 * left untouched till the next BgBufferSync() call.
1213 buf_id = StrategySyncStart(NULL, NULL);
1214 num_to_scan = NBuffers;
1215 num_written = 0;
1216 while (num_to_scan-- > 0)
1218 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1221 * We don't need to acquire the lock here, because we're only looking
1222 * at a single bit. It's possible that someone else writes the buffer
1223 * and clears the flag right after we check, but that doesn't matter
1224 * since SyncOneBuffer will then do nothing. However, there is a
1225 * further race condition: it's conceivable that between the time we
1226 * examine the bit here and the time SyncOneBuffer acquires lock,
1227 * someone else not only wrote the buffer but replaced it with another
1228 * page and dirtied it. In that improbable case, SyncOneBuffer will
1229 * write the buffer though we didn't need to. It doesn't seem worth
1230 * guarding against this, though.
1232 if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
1234 if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
1236 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1237 BgWriterStats.m_buf_written_checkpoints++;
1238 num_written++;
1241 * We know there are at most num_to_write buffers with
1242 * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
1243 * num_written reaches num_to_write.
1245 * Note that num_written doesn't include buffers written by
1246 * other backends, or by the bgwriter cleaning scan. That
1247 * means that the estimate of how much progress we've made is
1248 * conservative, and also that this test will often fail to
1249 * trigger. But it seems worth making anyway.
1251 if (num_written >= num_to_write)
1252 break;
1255 * Perform normal bgwriter duties and sleep to throttle our
1256 * I/O rate.
1258 CheckpointWriteDelay(flags,
1259 (double) num_written / num_to_write);
1263 if (++buf_id >= NBuffers)
1264 buf_id = 0;
1268 * Update checkpoint statistics. As noted above, this doesn't include
1269 * buffers written by other backends or bgwriter scan.
1271 CheckpointStats.ckpt_bufs_written += num_written;
1273 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
1277 * BgBufferSync -- Write out some dirty buffers in the pool.
1279 * This is called periodically by the background writer process.
1281 void
1282 BgBufferSync(void)
1284 /* info obtained from freelist.c */
1285 int strategy_buf_id;
1286 uint32 strategy_passes;
1287 uint32 recent_alloc;
1290 * Information saved between calls so we can determine the strategy
1291 * point's advance rate and avoid scanning already-cleaned buffers.
1293 static bool saved_info_valid = false;
1294 static int prev_strategy_buf_id;
1295 static uint32 prev_strategy_passes;
1296 static int next_to_clean;
1297 static uint32 next_passes;
1299 /* Moving averages of allocation rate and clean-buffer density */
1300 static float smoothed_alloc = 0;
1301 static float smoothed_density = 10.0;
1303 /* Potentially these could be tunables, but for now, not */
1304 float smoothing_samples = 16;
1305 float scan_whole_pool_milliseconds = 120000.0;
1307 /* Used to compute how far we scan ahead */
1308 long strategy_delta;
1309 int bufs_to_lap;
1310 int bufs_ahead;
1311 float scans_per_alloc;
1312 int reusable_buffers_est;
1313 int upcoming_alloc_est;
1314 int min_scan_buffers;
1316 /* Variables for the scanning loop proper */
1317 int num_to_scan;
1318 int num_written;
1319 int reusable_buffers;
1322 * Find out where the freelist clock sweep currently is, and how many
1323 * buffer allocations have happened since our last call.
1325 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
1327 /* Report buffer alloc counts to pgstat */
1328 BgWriterStats.m_buf_alloc += recent_alloc;
1331 * If we're not running the LRU scan, just stop after doing the stats
1332 * stuff. We mark the saved state invalid so that we can recover sanely
1333 * if LRU scan is turned back on later.
1335 if (bgwriter_lru_maxpages <= 0)
1337 saved_info_valid = false;
1338 return;
1342 * Compute strategy_delta = how many buffers have been scanned by the
1343 * clock sweep since last time. If first time through, assume none. Then
1344 * see if we are still ahead of the clock sweep, and if so, how many
1345 * buffers we could scan before we'd catch up with it and "lap" it. Note:
1346 * weird-looking coding of xxx_passes comparisons are to avoid bogus
1347 * behavior when the passes counts wrap around.
1349 if (saved_info_valid)
1351 int32 passes_delta = strategy_passes - prev_strategy_passes;
1353 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
1354 strategy_delta += (long) passes_delta *NBuffers;
1356 Assert(strategy_delta >= 0);
1358 if ((int32) (next_passes - strategy_passes) > 0)
1360 /* we're one pass ahead of the strategy point */
1361 bufs_to_lap = strategy_buf_id - next_to_clean;
1362 #ifdef BGW_DEBUG
1363 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1364 next_passes, next_to_clean,
1365 strategy_passes, strategy_buf_id,
1366 strategy_delta, bufs_to_lap);
1367 #endif
1369 else if (next_passes == strategy_passes &&
1370 next_to_clean >= strategy_buf_id)
1372 /* on same pass, but ahead or at least not behind */
1373 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
1374 #ifdef BGW_DEBUG
1375 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1376 next_passes, next_to_clean,
1377 strategy_passes, strategy_buf_id,
1378 strategy_delta, bufs_to_lap);
1379 #endif
1381 else
1384 * We're behind, so skip forward to the strategy point and start
1385 * cleaning from there.
1387 #ifdef BGW_DEBUG
1388 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
1389 next_passes, next_to_clean,
1390 strategy_passes, strategy_buf_id,
1391 strategy_delta);
1392 #endif
1393 next_to_clean = strategy_buf_id;
1394 next_passes = strategy_passes;
1395 bufs_to_lap = NBuffers;
1398 else
1401 * Initializing at startup or after LRU scanning had been off. Always
1402 * start at the strategy point.
1404 #ifdef BGW_DEBUG
1405 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
1406 strategy_passes, strategy_buf_id);
1407 #endif
1408 strategy_delta = 0;
1409 next_to_clean = strategy_buf_id;
1410 next_passes = strategy_passes;
1411 bufs_to_lap = NBuffers;
1414 /* Update saved info for next time */
1415 prev_strategy_buf_id = strategy_buf_id;
1416 prev_strategy_passes = strategy_passes;
1417 saved_info_valid = true;
1420 * Compute how many buffers had to be scanned for each new allocation, ie,
1421 * 1/density of reusable buffers, and track a moving average of that.
1423 * If the strategy point didn't move, we don't update the density estimate
1425 if (strategy_delta > 0 && recent_alloc > 0)
1427 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1428 smoothed_density += (scans_per_alloc - smoothed_density) /
1429 smoothing_samples;
1433 * Estimate how many reusable buffers there are between the current
1434 * strategy point and where we've scanned ahead to, based on the smoothed
1435 * density estimate.
1437 bufs_ahead = NBuffers - bufs_to_lap;
1438 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
1441 * Track a moving average of recent buffer allocations. Here, rather than
1442 * a true average we want a fast-attack, slow-decline behavior: we
1443 * immediately follow any increase.
1445 if (smoothed_alloc <= (float) recent_alloc)
1446 smoothed_alloc = recent_alloc;
1447 else
1448 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
1449 smoothing_samples;
1451 /* Scale the estimate by a GUC to allow more aggressive tuning. */
1452 upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
1455 * Even in cases where there's been little or no buffer allocation
1456 * activity, we want to make a small amount of progress through the buffer
1457 * cache so that as many reusable buffers as possible are clean after an
1458 * idle period.
1460 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
1461 * the BGW will be called during the scan_whole_pool time; slice the
1462 * buffer pool into that many sections.
1464 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
1466 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
1468 #ifdef BGW_DEBUG
1469 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
1470 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
1471 #endif
1472 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
1476 * Now write out dirty reusable buffers, working forward from the
1477 * next_to_clean point, until we have lapped the strategy scan, or cleaned
1478 * enough buffers to match our estimate of the next cycle's allocation
1479 * requirements, or hit the bgwriter_lru_maxpages limit.
1482 /* Make sure we can handle the pin inside SyncOneBuffer */
1483 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1485 num_to_scan = bufs_to_lap;
1486 num_written = 0;
1487 reusable_buffers = reusable_buffers_est;
1489 /* Execute the LRU scan */
1490 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
1492 int buffer_state = SyncOneBuffer(next_to_clean, true);
1494 if (++next_to_clean >= NBuffers)
1496 next_to_clean = 0;
1497 next_passes++;
1499 num_to_scan--;
1501 if (buffer_state & BUF_WRITTEN)
1503 reusable_buffers++;
1504 if (++num_written >= bgwriter_lru_maxpages)
1506 BgWriterStats.m_maxwritten_clean++;
1507 break;
1510 else if (buffer_state & BUF_REUSABLE)
1511 reusable_buffers++;
1514 BgWriterStats.m_buf_written_clean += num_written;
1516 #ifdef BGW_DEBUG
1517 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
1518 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
1519 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
1520 bufs_to_lap - num_to_scan,
1521 num_written,
1522 reusable_buffers - reusable_buffers_est);
1523 #endif
1526 * Consider the above scan as being like a new allocation scan.
1527 * Characterize its density and update the smoothed one based on it. This
1528 * effectively halves the moving average period in cases where both the
1529 * strategy and the background writer are doing some useful scanning,
1530 * which is helpful because a long memory isn't as desirable on the
1531 * density estimates.
1533 strategy_delta = bufs_to_lap - num_to_scan;
1534 recent_alloc = reusable_buffers - reusable_buffers_est;
1535 if (strategy_delta > 0 && recent_alloc > 0)
1537 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1538 smoothed_density += (scans_per_alloc - smoothed_density) /
1539 smoothing_samples;
1541 #ifdef BGW_DEBUG
1542 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
1543 recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
1544 #endif
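
/*
 * Illustrative worked example (hypothetical, not part of PostgreSQL): the
 * pacing arithmetic used by BgBufferSync above, evaluated for one assumed
 * set of inputs.  All input values and the BUFMGR_USAGE_EXAMPLE guard are
 * invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_bgwriter_pacing_math(void)
{
	/* assumed configuration: 16384 shared buffers, default 200ms delay */
	int			nbuffers = 16384;
	int			bgwriter_delay_ms = 200;
	double		lru_multiplier = 2.0;

	/* assumed state carried over from previous calls */
	double		smoothed_alloc = 500.0;		/* recent allocations per round */
	double		smoothed_density = 10.0;	/* buffers scanned per allocation */
	int			bufs_ahead = 4000;			/* how far we've scanned ahead */

	/* same formulas as in BgBufferSync */
	int			reusable_buffers_est = (int) (bufs_ahead / smoothed_density);	/* = 400 */
	int			upcoming_alloc_est = (int) (smoothed_alloc * lru_multiplier);	/* = 1000 */
	int			min_scan_buffers = (int) (nbuffers / (120000.0 / bgwriter_delay_ms));	/* = 27 */

	/*
	 * Since 1000 >= 27 + 400, upcoming_alloc_est is left alone.  The LRU
	 * scan then runs until it has found about 1000 - 400 = 600 more reusable
	 * buffers, i.e. it scans roughly 600 * 10 = 6000 buffers at the assumed
	 * density, unless it hits bgwriter_lru_maxpages or laps the clock sweep.
	 */
	(void) reusable_buffers_est;
	(void) upcoming_alloc_est;
	(void) min_scan_buffers;
}
#endif   /* BUFMGR_USAGE_EXAMPLE */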
1549 * SyncOneBuffer -- process a single buffer during syncing.
1551 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
1552 * buffers marked recently used, as these are not replacement candidates.
1554 * Returns a bitmask containing the following flag bits:
1555 * BUF_WRITTEN: we wrote the buffer.
1556 * BUF_REUSABLE: buffer is available for replacement, ie, it has
1557 * pin count 0 and usage count 0.
1559 * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
1560 * after locking it, but we don't care all that much.)
1562 * Note: caller must have done ResourceOwnerEnlargeBuffers.
1564 static int
1565 SyncOneBuffer(int buf_id, bool skip_recently_used)
1567 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1568 int result = 0;
1571 * Check whether buffer needs writing.
1573 * We can make this check without taking the buffer content lock so long
1574 * as we mark pages dirty in access methods *before* logging changes with
1575 * XLogInsert(): if someone marks the buffer dirty just after our check we
1576 * don't worry because our checkpoint.redo points before log record for
1577 * upcoming changes and so we are not required to write such dirty buffer.
1579 LockBufHdr(bufHdr);
1581 if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
1582 result |= BUF_REUSABLE;
1583 else if (skip_recently_used)
1585 /* Caller told us not to write recently-used buffers */
1586 UnlockBufHdr(bufHdr);
1587 return result;
1590 if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
1592 /* It's clean, so nothing to do */
1593 UnlockBufHdr(bufHdr);
1594 return result;
1598 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
1599 * buffer is clean by the time we've locked it.)
1601 PinBuffer_Locked(bufHdr);
1602 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
1604 FlushBuffer(bufHdr, NULL);
1606 LWLockRelease(bufHdr->content_lock);
1607 UnpinBuffer(bufHdr, true);
1609 return result | BUF_WRITTEN;
1614 * Return a palloc'd string containing buffer usage statistics.
1616 char *
1617 ShowBufferUsage(void)
1619 StringInfoData str;
1620 float hitrate;
1621 float localhitrate;
1623 initStringInfo(&str);
1625 if (ReadBufferCount == 0)
1626 hitrate = 0.0;
1627 else
1628 hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
1630 if (ReadLocalBufferCount == 0)
1631 localhitrate = 0.0;
1632 else
1633 localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
1635 appendStringInfo(&str,
1636 "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1637 ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
1638 appendStringInfo(&str,
1639 "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1640 ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
1641 appendStringInfo(&str,
1642 "!\tDirect blocks: %10ld read, %10ld written\n",
1643 BufFileReadCount, BufFileWriteCount);
1645 return str.data;
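
/*
 * Illustrative worked example (hypothetical numbers): if a backend has made
 * 1000 ReadBuffer calls of which 900 were satisfied from shared buffers, the
 * report above shows 100 shared blocks "read" (the misses, i.e.
 * ReadBufferCount - BufferHitCount) and a hit rate of 90.00%.
 */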
1648 void
1649 ResetBufferUsage(void)
1651 BufferHitCount = 0;
1652 ReadBufferCount = 0;
1653 BufferFlushCount = 0;
1654 LocalBufferHitCount = 0;
1655 ReadLocalBufferCount = 0;
1656 LocalBufferFlushCount = 0;
1657 BufFileReadCount = 0;
1658 BufFileWriteCount = 0;
1662 * AtEOXact_Buffers - clean up at end of transaction.
1664 * As of PostgreSQL 8.0, buffer pins should get released by the
1665 * ResourceOwner mechanism. This routine is just a debugging
1666 * cross-check that no pins remain.
1668 void
1669 AtEOXact_Buffers(bool isCommit)
1671 #ifdef USE_ASSERT_CHECKING
1672 if (assert_enabled)
1674 int i;
1676 for (i = 0; i < NBuffers; i++)
1678 Assert(PrivateRefCount[i] == 0);
1681 #endif
1683 AtEOXact_LocalBuffers(isCommit);
1687 * InitBufferPoolBackend --- second-stage initialization of a new backend
1689 * This is called after we have acquired a PGPROC and so can safely get
1690 * LWLocks. We don't currently need to do anything at this stage ...
1691 * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
1692 * access, and thereby has to be called at the corresponding phase of
1693 * backend shutdown.
1695 void
1696 InitBufferPoolBackend(void)
1698 on_shmem_exit(AtProcExit_Buffers, 0);
1702 * Ensure we have released all shared-buffer locks and pins during backend exit
1704 static void
1705 AtProcExit_Buffers(int code, Datum arg)
1707 int i;
1709 AbortBufferIO();
1710 UnlockBuffers();
1712 for (i = 0; i < NBuffers; i++)
1714 if (PrivateRefCount[i] != 0)
1716 volatile BufferDesc *buf = &(BufferDescriptors[i]);
1719 * We don't worry about updating ResourceOwner; if we even got
1720 * here, it suggests that ResourceOwners are messed up.
1722 PrivateRefCount[i] = 1; /* make sure we release shared pin */
1723 UnpinBuffer(buf, false);
1724 Assert(PrivateRefCount[i] == 0);
1728 /* localbuf.c needs a chance too */
1729 AtProcExit_LocalBuffers();
1733 * Helper routine to issue warnings when a buffer is unexpectedly pinned
1735 void
1736 PrintBufferLeakWarning(Buffer buffer)
1738 volatile BufferDesc *buf;
1739 int32 loccount;
1740 char *path;
1742 Assert(BufferIsValid(buffer));
1743 if (BufferIsLocal(buffer))
1745 buf = &LocalBufferDescriptors[-buffer - 1];
1746 loccount = LocalRefCount[-buffer - 1];
1748 else
1750 buf = &BufferDescriptors[buffer - 1];
1751 loccount = PrivateRefCount[buffer - 1];
1754 /* theoretically we should lock the bufhdr here */
1755 path = relpath(buf->tag.rnode, buf->tag.forkNum);
1756 elog(WARNING,
1757 "buffer refcount leak: [%03d] "
1758 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
1759 buffer, path,
1760 buf->tag.blockNum, buf->flags,
1761 buf->refcount, loccount);
1762 pfree(path);
1766 * CheckPointBuffers
1768 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
1770 * Note: temporary relations do not participate in checkpoints, so they don't
1771 * need to be flushed.
1773 void
1774 CheckPointBuffers(int flags)
1776 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
1777 CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
1778 BufferSync(flags);
1779 CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
1780 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
1781 smgrsync();
1782 CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
1783 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
1788 * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
1790 void
1791 BufmgrCommit(void)
1793 /* Nothing to do in bufmgr anymore... */
1797 * BufferGetBlockNumber
1798 * Returns the block number associated with a buffer.
1800 * Note:
1801 * Assumes that the buffer is valid and pinned, else the
1802 * value may be obsolete immediately...
1804 BlockNumber
1805 BufferGetBlockNumber(Buffer buffer)
1807 volatile BufferDesc *bufHdr;
1809 Assert(BufferIsPinned(buffer));
1811 if (BufferIsLocal(buffer))
1812 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1813 else
1814 bufHdr = &BufferDescriptors[buffer - 1];
1816 /* pinned, so OK to read tag without spinlock */
1817 return bufHdr->tag.blockNum;
1821 * BufferGetTag
1822 * Returns the relfilenode, fork number and block number associated with
1823 * a buffer.
1825 void
1826 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
1827 BlockNumber *blknum)
1829 volatile BufferDesc *bufHdr;
1831 /* Do the same checks as BufferGetBlockNumber. */
1832 Assert(BufferIsPinned(buffer));
1834 if (BufferIsLocal(buffer))
1835 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1836 else
1837 bufHdr = &BufferDescriptors[buffer - 1];
1839 /* pinned, so OK to read tag without spinlock */
1840 *rnode = bufHdr->tag.rnode;
1841 *forknum = bufHdr->tag.forkNum;
1842 *blknum = bufHdr->tag.blockNum;
1846 * FlushBuffer
1847 * Physically write out a shared buffer.
1849 * NOTE: this actually just passes the buffer contents to the kernel; the
1850 * real write to disk won't happen until the kernel feels like it. This
1851 * is okay from our point of view since we can redo the changes from WAL.
1852 * However, we will need to force the changes to disk via fsync before
1853 * we can checkpoint WAL.
1855 * The caller must hold a pin on the buffer and have share-locked the
1856 * buffer contents. (Note: a share-lock does not prevent updates of
1857 * hint bits in the buffer, so the page could change while the write
1858 * is in progress, but we assume that that will not invalidate the data
1859 * written.)
1861 * If the caller has an smgr reference for the buffer's relation, pass it
1862 * as the second parameter. If not, pass NULL.
1864 static void
1865 FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
1867 XLogRecPtr recptr;
1868 ErrorContextCallback errcontext;
1871 * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
1872 * false, then someone else flushed the buffer before we could, so we need
1873 * not do anything.
1875 if (!StartBufferIO(buf, false))
1876 return;
1878 /* Setup error traceback support for ereport() */
1879 errcontext.callback = buffer_write_error_callback;
1880 errcontext.arg = (void *) buf;
1881 errcontext.previous = error_context_stack;
1882 error_context_stack = &errcontext;
1884 /* Find smgr relation for buffer */
1885 if (reln == NULL)
1886 reln = smgropen(buf->tag.rnode);
1888 TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
1889 buf->tag.blockNum,
1890 reln->smgr_rnode.spcNode,
1891 reln->smgr_rnode.dbNode,
1892 reln->smgr_rnode.relNode);
1895 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
1896 * rule that log updates must hit disk before any of the data-file changes
1897 * they describe do.
1899 recptr = BufferGetLSN(buf);
1900 XLogFlush(recptr);
1903 * Now it's safe to write buffer to disk. Note that no one else should
1904 * have been able to write it while we were busy with log flushing because
1905 * we have the io_in_progress lock.
1908 /* To check if block content changes while flushing. - vadim 01/17/97 */
1909 LockBufHdr(buf);
1910 buf->flags &= ~BM_JUST_DIRTIED;
1911 UnlockBufHdr(buf);
1913 smgrwrite(reln,
1914 buf->tag.forkNum,
1915 buf->tag.blockNum,
1916 (char *) BufHdrGetBlock(buf),
1917 false);
1919 BufferFlushCount++;
1922 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
1923 * end the io_in_progress state.
1925 TerminateBufferIO(buf, true, 0);
1927 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
1928 buf->tag.blockNum,
1929 reln->smgr_rnode.spcNode,
1930 reln->smgr_rnode.dbNode,
1931 reln->smgr_rnode.relNode);
1933 /* Pop the error context stack */
1934 error_context_stack = errcontext.previous;
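/*
 * Illustrative sketch (an assumption, not taken from this file; see the
 * heapam WAL-logging code for the real thing): the LSN consulted above is
 * put on the page by whoever dirtied it, roughly like
 *
 *		MarkBufferDirty(buffer);
 *		recptr = XLogInsert(rmid, info, rdata);
 *		PageSetLSN(page, recptr);
 *		PageSetTLI(page, ThisTimeLineID);
 *
 * where rmid/info/rdata stand for the caller's WAL record description.
 * That convention is what makes XLogFlush(BufferGetLSN(buf)) sufficient to
 * force the describing WAL record to disk before the data page goes out.
 */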
1938 * RelationGetNumberOfBlocks
1939 * Determines the current number of pages in the relation.
1941 BlockNumber
1942 RelationGetNumberOfBlocks(Relation relation)
1944 /* Open it at the smgr level if not already done */
1945 RelationOpenSmgr(relation);
1947 return smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
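/*
 * Illustrative sketch (not from this file): a simple sequential pass over a
 * relation is typically driven by this function, e.g.
 *
 *		BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
 *		BlockNumber blkno;
 *
 *		for (blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			Buffer		buf = ReadBuffer(rel, blkno);
 *
 *			... process BufferGetPage(buf) under a content lock ...
 *			ReleaseBuffer(buf);
 *		}
 *
 * "rel" is a hypothetical Relation; note the block count can become stale
 * as soon as it is returned if other backends are extending the relation.
 */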
1950 /* ---------------------------------------------------------------------
1951 * DropRelFileNodeBuffers
1953 * This function removes from the buffer pool all the pages of the
1954 * specified relation that have block numbers >= firstDelBlock.
1955 * (In particular, with firstDelBlock = 0, all pages are removed.)
1956 * Dirty pages are simply dropped, without bothering to write them
1957 * out first. Therefore, this is NOT rollback-able, and so should be
1958 * used only with extreme caution!
1960 * Currently, this is called only from smgr.c when the underlying file
1961 * is about to be deleted or truncated (firstDelBlock is needed for
1962 * the truncation case). The data in the affected pages would therefore
1963 * be deleted momentarily anyway, and there is no point in writing it.
1964 * It is the responsibility of higher-level code to ensure that the
1965 * deletion or truncation does not lose any data that could be needed
1966 * later. It is also the responsibility of higher-level code to ensure
1967 * that no other process could be trying to load more pages of the
1968 * relation into buffers.
1970 * XXX currently it sequentially searches the buffer pool, should be
1971 * changed to more clever ways of searching. However, this routine
1972 * is used only in code paths that aren't very performance-critical,
1973 * and we shouldn't slow down the hot paths to make it faster ...
1974 * --------------------------------------------------------------------
1976 void
1977 DropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, bool istemp,
1978 BlockNumber firstDelBlock)
1980 int i;
1982 if (istemp)
1984 DropRelFileNodeLocalBuffers(rnode, forkNum, firstDelBlock);
1985 return;
1988 for (i = 0; i < NBuffers; i++)
1990 volatile BufferDesc *bufHdr = &BufferDescriptors[i];
1992 LockBufHdr(bufHdr);
1993 if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1994 bufHdr->tag.forkNum == forkNum &&
1995 bufHdr->tag.blockNum >= firstDelBlock)
1996 InvalidateBuffer(bufHdr); /* releases spinlock */
1997 else
1998 UnlockBufHdr(bufHdr);
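/*
 * Illustrative sketch (hedged; smgr.c holds the authoritative callers): the
 * truncation path is expected to discard the doomed buffers first and only
 * then shorten the physical file, roughly
 *
 *		DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks);
 *		... then truncate the fork to nblocks at the md/file level ...
 *
 * while outright deletion passes firstDelBlock = 0 so that every page of
 * the fork is invalidated before the file disappears.
 */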
2002 /* ---------------------------------------------------------------------
2003 * DropDatabaseBuffers
2005 * This function removes all the buffers in the buffer cache for a
2006 * particular database. Dirty pages are simply dropped, without
2007 * bothering to write them out first. This is used when we destroy a
2008 * database, to avoid trying to flush data to disk when the directory
2009 * tree no longer exists. Implementation is pretty similar to
2010 * DropRelFileNodeBuffers() which is for destroying just one relation.
2011 * --------------------------------------------------------------------
2013 void
2014 DropDatabaseBuffers(Oid dbid)
2016 int i;
2017 volatile BufferDesc *bufHdr;
2020 * We needn't consider local buffers, since by assumption the target
2021 * database isn't our own.
2024 for (i = 0; i < NBuffers; i++)
2026 bufHdr = &BufferDescriptors[i];
2027 LockBufHdr(bufHdr);
2028 if (bufHdr->tag.rnode.dbNode == dbid)
2029 InvalidateBuffer(bufHdr); /* releases spinlock */
2030 else
2031 UnlockBufHdr(bufHdr);
2035 /* -----------------------------------------------------------------
2036 * PrintBufferDescs
2038 * this function prints all the buffer descriptors, for debugging
2039 * use only.
2040 * -----------------------------------------------------------------
2042 #ifdef NOT_USED
2043 void
2044 PrintBufferDescs(void)
2046 int i;
2047 volatile BufferDesc *buf = BufferDescriptors;
2049 for (i = 0; i < NBuffers; ++i, ++buf)
2051 /* theoretically we should lock the bufhdr here */
2052 elog(LOG,
2053 "[%02d] (freeNext=%d, rel=%s, "
2054 "blockNum=%u, flags=0x%x, refcount=%u %d)",
2055 i, buf->freeNext,
2056 relpath(buf->tag.rnode, buf->tag.forkNum),
2057 buf->tag.blockNum, buf->flags,
2058 buf->refcount, PrivateRefCount[i]);
2061 #endif
2063 #ifdef NOT_USED
2064 void
2065 PrintPinnedBufs(void)
2067 int i;
2068 volatile BufferDesc *buf = BufferDescriptors;
2070 for (i = 0; i < NBuffers; ++i, ++buf)
2072 if (PrivateRefCount[i] > 0)
2074 /* theoretically we should lock the bufhdr here */
2075 elog(LOG,
2076 "[%02d] (freeNext=%d, rel=%s, "
2077 "blockNum=%u, flags=0x%x, refcount=%u %d)",
2078 i, buf->freeNext,
2079 relpath(buf->tag.rnode, buf->tag.forkNum),
2080 buf->tag.blockNum, buf->flags,
2081 buf->refcount, PrivateRefCount[i]);
2085 #endif
2087 /* ---------------------------------------------------------------------
2088 * FlushRelationBuffers
2090 * This function writes all dirty pages of a relation out to disk
2091 * (or more accurately, out to kernel disk buffers), ensuring that the
2092 * kernel has an up-to-date view of the relation.
2094 * Generally, the caller should be holding AccessExclusiveLock on the
2095 * target relation to ensure that no other backend is busy dirtying
2096 * more blocks of the relation; the effects can't be expected to last
2097 * after the lock is released.
2099 * XXX currently it sequentially searches the buffer pool, should be
2100 * changed to more clever ways of searching. This routine is not
2101 * used in any performance-critical code paths, so it's not worth
2102 * adding additional overhead to normal paths to make it go faster;
2103 * but see also DropRelFileNodeBuffers.
2104 * --------------------------------------------------------------------
2106 void
2107 FlushRelationBuffers(Relation rel)
2109 int i;
2110 volatile BufferDesc *bufHdr;
2112 /* Open rel at the smgr level if not already done */
2113 RelationOpenSmgr(rel);
2115 if (rel->rd_istemp)
2117 for (i = 0; i < NLocBuffer; i++)
2119 bufHdr = &LocalBufferDescriptors[i];
2120 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2121 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2123 ErrorContextCallback errcontext;
2125 /* Setup error traceback support for ereport() */
2126 errcontext.callback = buffer_write_error_callback;
2127 errcontext.arg = (void *) bufHdr;
2128 errcontext.previous = error_context_stack;
2129 error_context_stack = &errcontext;
2131 smgrwrite(rel->rd_smgr,
2132 bufHdr->tag.forkNum,
2133 bufHdr->tag.blockNum,
2134 (char *) LocalBufHdrGetBlock(bufHdr),
2135 true);
2137 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
2139 /* Pop the error context stack */
2140 error_context_stack = errcontext.previous;
2144 return;
2147 /* Make sure we can handle the pin inside the loop */
2148 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2150 for (i = 0; i < NBuffers; i++)
2152 bufHdr = &BufferDescriptors[i];
2153 LockBufHdr(bufHdr);
2154 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2155 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2157 PinBuffer_Locked(bufHdr);
2158 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2159 FlushBuffer(bufHdr, rel->rd_smgr);
2160 LWLockRelease(bufHdr->content_lock);
2161 UnpinBuffer(bufHdr, true);
2163 else
2164 UnlockBufHdr(bufHdr);
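/*
 * Illustrative sketch (not from this file): a caller that needs the kernel
 * to have an up-to-date view of a relation, e.g. before copying its files
 * elsewhere, would typically do something like
 *
 *		Relation	rel = heap_open(relid, AccessExclusiveLock);
 *
 *		FlushRelationBuffers(rel);
 *		... read or copy the underlying files via smgr ...
 *		heap_close(rel, AccessExclusiveLock);
 *
 * The AccessExclusiveLock is what keeps other backends from dirtying more
 * blocks during and after the flush; "relid" is a hypothetical OID.
 */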
2168 /* ---------------------------------------------------------------------
2169 * FlushDatabaseBuffers
2171 * This function writes all dirty pages of a database out to disk
2172 * (or more accurately, out to kernel disk buffers), ensuring that the
2173 * kernel has an up-to-date view of the database.
2175 * Generally, the caller should be holding an appropriate lock to ensure
2176 * no other backend is active in the target database; otherwise more
2177 * pages could get dirtied.
2179 * Note we don't worry about flushing any pages of temporary relations.
2180 * It's assumed these wouldn't be interesting.
2181 * --------------------------------------------------------------------
2183 void
2184 FlushDatabaseBuffers(Oid dbid)
2186 int i;
2187 volatile BufferDesc *bufHdr;
2189 /* Make sure we can handle the pin inside the loop */
2190 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2192 for (i = 0; i < NBuffers; i++)
2194 bufHdr = &BufferDescriptors[i];
2195 LockBufHdr(bufHdr);
2196 if (bufHdr->tag.rnode.dbNode == dbid &&
2197 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2199 PinBuffer_Locked(bufHdr);
2200 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2201 FlushBuffer(bufHdr, NULL);
2202 LWLockRelease(bufHdr->content_lock);
2203 UnpinBuffer(bufHdr, true);
2205 else
2206 UnlockBufHdr(bufHdr);
2211 * ReleaseBuffer -- release the pin on a buffer
2213 void
2214 ReleaseBuffer(Buffer buffer)
2216 volatile BufferDesc *bufHdr;
2218 if (!BufferIsValid(buffer))
2219 elog(ERROR, "bad buffer id: %d", buffer);
2221 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
2223 if (BufferIsLocal(buffer))
2225 Assert(LocalRefCount[-buffer - 1] > 0);
2226 LocalRefCount[-buffer - 1]--;
2227 return;
2230 bufHdr = &BufferDescriptors[buffer - 1];
2232 Assert(PrivateRefCount[buffer - 1] > 0);
2234 if (PrivateRefCount[buffer - 1] > 1)
2235 PrivateRefCount[buffer - 1]--;
2236 else
2237 UnpinBuffer(bufHdr, false);
2241 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
2243 * This is just a shorthand for a common combination.
2245 void
2246 UnlockReleaseBuffer(Buffer buffer)
2248 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2249 ReleaseBuffer(buffer);
2253 * IncrBufferRefCount
2254 * Increment the pin count on a buffer that we have *already* pinned
2255 * at least once.
2257 * This function cannot be used on a buffer we do not have pinned,
2258 * because it doesn't change the shared buffer state.
2260 void
2261 IncrBufferRefCount(Buffer buffer)
2263 Assert(BufferIsPinned(buffer));
2264 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2265 ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
2266 if (BufferIsLocal(buffer))
2267 LocalRefCount[-buffer - 1]++;
2268 else
2269 PrivateRefCount[buffer - 1]++;
2273 * SetBufferCommitInfoNeedsSave
2275 * Mark a buffer dirty when we have updated tuple commit-status bits in it.
2277 * This is essentially the same as MarkBufferDirty, except that the caller
2278 * might have only share-lock instead of exclusive-lock on the buffer's
2279 * content lock. We preserve the distinction mainly as a way of documenting
2280 * that the caller has not made a critical data change --- the status-bit
2281 * update could be redone by someone else just as easily. Therefore, no WAL
2282 * log record need be generated, whereas calls to MarkBufferDirty really ought
2283 * to be associated with a WAL-entry-creating action.
2285 void
2286 SetBufferCommitInfoNeedsSave(Buffer buffer)
2288 volatile BufferDesc *bufHdr;
2290 if (!BufferIsValid(buffer))
2291 elog(ERROR, "bad buffer id: %d", buffer);
2293 if (BufferIsLocal(buffer))
2295 MarkLocalBufferDirty(buffer);
2296 return;
2299 bufHdr = &BufferDescriptors[buffer - 1];
2301 Assert(PrivateRefCount[buffer - 1] > 0);
2302 /* here, either share or exclusive lock is OK */
2303 Assert(LWLockHeldByMe(bufHdr->content_lock));
2306 * This routine might get called many times on the same page, if we are
2307 * making the first scan after commit of an xact that added/deleted many
2308 * tuples. So, be as quick as we can if the buffer is already dirty. We
2309 * do this by not acquiring spinlock if it looks like the status bits are
2310 * already OK. (Note it is okay if someone else clears BM_JUST_DIRTIED
2311 * immediately after we look, because the buffer content update is already
2312 * done and will be reflected in the I/O.)
2314 if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
2315 (BM_DIRTY | BM_JUST_DIRTIED))
2317 LockBufHdr(bufHdr);
2318 Assert(bufHdr->refcount > 0);
2319 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
2320 VacuumCostBalance += VacuumCostPageDirty;
2321 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
2322 UnlockBufHdr(bufHdr);
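/*
 * Illustrative sketch (hedged; the tuple-visibility code is the canonical
 * user): once a visibility check has proven that a tuple's inserting
 * transaction committed, the caller can record that as a hint bit while
 * holding only a pin and a share content lock:
 *
 *		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 *		SetBufferCommitInfoNeedsSave(buffer);
 *
 * No WAL record is written for this; if the page version carrying the hint
 * bit is lost, the bit is simply recomputed the next time the tuple is
 * examined.
 */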
2327 * Release buffer content locks for shared buffers.
2329 * Used to clean up after errors.
2331 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
2332 * of releasing buffer content locks per se; the only thing we need to deal
2333 * with here is clearing any PIN_COUNT request that was in progress.
2335 void
2336 UnlockBuffers(void)
2338 volatile BufferDesc *buf = PinCountWaitBuf;
2340 if (buf)
2342 LockBufHdr(buf);
2345 * Don't complain if flag bit not set; it could have been reset but we
2346 * got a cancel/die interrupt before getting the signal.
2348 if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
2349 buf->wait_backend_pid == MyProcPid)
2350 buf->flags &= ~BM_PIN_COUNT_WAITER;
2352 UnlockBufHdr(buf);
2354 PinCountWaitBuf = NULL;
2359 * Acquire or release the content_lock for the buffer.
2361 void
2362 LockBuffer(Buffer buffer, int mode)
2364 volatile BufferDesc *buf;
2366 Assert(BufferIsValid(buffer));
2367 if (BufferIsLocal(buffer))
2368 return; /* local buffers need no lock */
2370 buf = &(BufferDescriptors[buffer - 1]);
2372 if (mode == BUFFER_LOCK_UNLOCK)
2373 LWLockRelease(buf->content_lock);
2374 else if (mode == BUFFER_LOCK_SHARE)
2375 LWLockAcquire(buf->content_lock, LW_SHARED);
2376 else if (mode == BUFFER_LOCK_EXCLUSIVE)
2377 LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
2378 else
2379 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
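/*
 * Illustrative sketch (not from this file): the usual access pattern built
 * from these primitives is pin, lock, inspect or modify, then unlock and
 * unpin:
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *		Page		page;
 *
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		page = BufferGetPage(buf);
 *		... examine the page; take BUFFER_LOCK_EXCLUSIVE instead to modify ...
 *		UnlockReleaseBuffer(buf);
 *
 * "rel" and "blkno" are hypothetical; ReadBuffer, BufferGetPage and
 * UnlockReleaseBuffer are the real bufmgr entry points.
 */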
2383 * Acquire the content_lock for the buffer, but only if we don't have to wait.
2385 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
2387 bool
2388 ConditionalLockBuffer(Buffer buffer)
2390 volatile BufferDesc *buf;
2392 Assert(BufferIsValid(buffer));
2393 if (BufferIsLocal(buffer))
2394 return true; /* act as though we got it */
2396 buf = &(BufferDescriptors[buffer - 1]);
2398 return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
2402 * LockBufferForCleanup - lock a buffer in preparation for deleting items
2404 * Items may be deleted from a disk page only when the caller (a) holds an
2405 * exclusive lock on the buffer and (b) has observed that no other backend
2406 * holds a pin on the buffer. If there is a pin, then the other backend
2407 * might have a pointer into the buffer (for example, a heapscan reference
2408 * to an item --- see README for more details). It's OK if a pin is added
2409 * after the cleanup starts, however; the newly-arrived backend will be
2410 * unable to look at the page until we release the exclusive lock.
2412 * To implement this protocol, a would-be deleter must pin the buffer and
2413 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
2414 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
2415 * it has successfully observed pin count = 1.
2417 void
2418 LockBufferForCleanup(Buffer buffer)
2420 volatile BufferDesc *bufHdr;
2422 Assert(BufferIsValid(buffer));
2423 Assert(PinCountWaitBuf == NULL);
2425 if (BufferIsLocal(buffer))
2427 /* There should be exactly one pin */
2428 if (LocalRefCount[-buffer - 1] != 1)
2429 elog(ERROR, "incorrect local pin count: %d",
2430 LocalRefCount[-buffer - 1]);
2431 /* Nobody else to wait for */
2432 return;
2435 /* There should be exactly one local pin */
2436 if (PrivateRefCount[buffer - 1] != 1)
2437 elog(ERROR, "incorrect local pin count: %d",
2438 PrivateRefCount[buffer - 1]);
2440 bufHdr = &BufferDescriptors[buffer - 1];
2442 for (;;)
2444 /* Try to acquire lock */
2445 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2446 LockBufHdr(bufHdr);
2447 Assert(bufHdr->refcount > 0);
2448 if (bufHdr->refcount == 1)
2450 /* Successfully acquired exclusive lock with pincount 1 */
2451 UnlockBufHdr(bufHdr);
2452 return;
2454 /* Failed, so mark myself as waiting for pincount 1 */
2455 if (bufHdr->flags & BM_PIN_COUNT_WAITER)
2457 UnlockBufHdr(bufHdr);
2458 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2459 elog(ERROR, "multiple backends attempting to wait for pincount 1");
2461 bufHdr->wait_backend_pid = MyProcPid;
2462 bufHdr->flags |= BM_PIN_COUNT_WAITER;
2463 PinCountWaitBuf = bufHdr;
2464 UnlockBufHdr(bufHdr);
2465 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2466 /* Wait to be signaled by UnpinBuffer() */
2467 ProcWaitForSignal();
2468 PinCountWaitBuf = NULL;
2469 /* Loop back and try again */
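/*
 * Illustrative sketch (hedged; VACUUM is the typical caller): the would-be
 * deleter pins the page first, then upgrades to a cleanup lock before
 * removing items:
 *
 *		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *											 RBM_NORMAL, vac_strategy);
 *
 *		LockBufferForCleanup(buf);
 *		... prune or remove dead items from the page ...
 *		UnlockReleaseBuffer(buf);
 *
 * "rel", "blkno" and "vac_strategy" are hypothetical; the call sequence is
 * what the protocol described above requires.
 */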
2474 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
2476 * We won't loop, but just check once to see if the pin count is OK. If
2477 * not, return FALSE with no lock held.
2479 bool
2480 ConditionalLockBufferForCleanup(Buffer buffer)
2482 volatile BufferDesc *bufHdr;
2484 Assert(BufferIsValid(buffer));
2486 if (BufferIsLocal(buffer))
2488 /* There should be exactly one pin */
2489 Assert(LocalRefCount[-buffer - 1] > 0);
2490 if (LocalRefCount[-buffer - 1] != 1)
2491 return false;
2492 /* Nobody else to wait for */
2493 return true;
2496 /* There should be exactly one local pin */
2497 Assert(PrivateRefCount[buffer - 1] > 0);
2498 if (PrivateRefCount[buffer - 1] != 1)
2499 return false;
2501 /* Try to acquire lock */
2502 if (!ConditionalLockBuffer(buffer))
2503 return false;
2505 bufHdr = &BufferDescriptors[buffer - 1];
2506 LockBufHdr(bufHdr);
2507 Assert(bufHdr->refcount > 0);
2508 if (bufHdr->refcount == 1)
2510 /* Successfully acquired exclusive lock with pincount 1 */
2511 UnlockBufHdr(bufHdr);
2512 return true;
2515 /* Failed, so release the lock */
2516 UnlockBufHdr(bufHdr);
2517 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2518 return false;
2523 * Functions for buffer I/O handling
2525 * Note: We assume that nested buffer I/O never occurs.
2526 * i.e., at most one io_in_progress lock is held per proc.

2528 * Also note that these are used only for shared buffers, not local ones.
2532 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
2534 static void
2535 WaitIO(volatile BufferDesc *buf)
2538 * Changed to wait until there's no IO - Inoue 01/13/2000
2540 * Note this is *necessary* because an error abort in the process doing
2541 * I/O could release the io_in_progress_lock prematurely. See
2542 * AbortBufferIO.
2544 for (;;)
2546 BufFlags sv_flags;
2549 * It may not be necessary to acquire the spinlock to check the flag
2550 * here, but since this test is essential for correctness, we'd better
2551 * play it safe.
2553 LockBufHdr(buf);
2554 sv_flags = buf->flags;
2555 UnlockBufHdr(buf);
2556 if (!(sv_flags & BM_IO_IN_PROGRESS))
2557 break;
2558 LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
2559 LWLockRelease(buf->io_in_progress_lock);
2564 * StartBufferIO: begin I/O on this buffer
2565 * (Assumptions)
2566 * My process is executing no IO
2567 * The buffer is Pinned
2569 * In some scenarios there are race conditions in which multiple backends
2570 * could attempt the same I/O operation concurrently. If someone else
2571 * has already started I/O on this buffer then we will block on the
2572 * io_in_progress lock until he's done.
2574 * Input operations are only attempted on buffers that are not BM_VALID,
2575 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
2576 * so we can always tell if the work is already done.
2578 * Returns TRUE if we successfully marked the buffer as I/O busy,
2579 * FALSE if someone else already did the work.
2581 static bool
2582 StartBufferIO(volatile BufferDesc *buf, bool forInput)
2584 Assert(!InProgressBuf);
2586 for (;;)
2589 * Grab the io_in_progress lock so that other processes can wait for
2590 * me to finish the I/O.
2592 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2594 LockBufHdr(buf);
2596 if (!(buf->flags & BM_IO_IN_PROGRESS))
2597 break;
2600 * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
2601 * lock isn't held is if the process doing the I/O is recovering from
2602 * an error (see AbortBufferIO). If that's the case, we must wait for
2603 * him to get unwedged.
2605 UnlockBufHdr(buf);
2606 LWLockRelease(buf->io_in_progress_lock);
2607 WaitIO(buf);
2610 /* Once we get here, there is definitely no I/O active on this buffer */
2612 if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
2614 /* someone else already did the I/O */
2615 UnlockBufHdr(buf);
2616 LWLockRelease(buf->io_in_progress_lock);
2617 return false;
2620 buf->flags |= BM_IO_IN_PROGRESS;
2622 UnlockBufHdr(buf);
2624 InProgressBuf = buf;
2625 IsForInput = forInput;
2627 return true;
2631 * TerminateBufferIO: release a buffer we were doing I/O on
2632 * (Assumptions)
2633 * My process is executing IO for the buffer
2634 * BM_IO_IN_PROGRESS bit is set for the buffer
2635 * We hold the buffer's io_in_progress lock
2636 * The buffer is Pinned
2638 * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
2639 * buffer's BM_DIRTY flag. This is appropriate when terminating a
2640 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
2641 * marking the buffer clean if it was re-dirtied while we were writing.
2643 * set_flag_bits gets ORed into the buffer's flags. It must include
2644 * BM_IO_ERROR in a failure case. For successful completion it could
2645 * be 0, or BM_VALID if we just finished reading in the page.
2647 static void
2648 TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
2649 int set_flag_bits)
2651 Assert(buf == InProgressBuf);
2653 LockBufHdr(buf);
2655 Assert(buf->flags & BM_IO_IN_PROGRESS);
2656 buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
2657 if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
2658 buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
2659 buf->flags |= set_flag_bits;
2661 UnlockBufHdr(buf);
2663 InProgressBuf = NULL;
2665 LWLockRelease(buf->io_in_progress_lock);
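/*
 * Illustrative sketch (this mirrors what FlushBuffer above already does): a
 * write-out of a shared buffer brackets the physical I/O with these two
 * routines,
 *
 *		if (!StartBufferIO(buf, false))
 *			return;			(someone else already flushed it)
 *		smgrwrite(...);
 *		TerminateBufferIO(buf, true, 0);
 *
 * while a read-in starts with StartBufferIO(buf, true) and finishes with
 * TerminateBufferIO(buf, false, BM_VALID).
 */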
2669 * AbortBufferIO: Clean up any active buffer I/O after an error.
2671 * All LWLocks we might have held have been released,
2672 * but we haven't yet released buffer pins, so the buffer is still pinned.
2674 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
2675 * possible the error condition wasn't related to the I/O.
2677 void
2678 AbortBufferIO(void)
2680 volatile BufferDesc *buf = InProgressBuf;
2682 if (buf)
2685 * Since LWLockReleaseAll has already been called, we're not holding
2686 * the buffer's io_in_progress_lock. We have to re-acquire it so that
2687 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
2688 * buffer will be in a busy spin until we succeed in doing this.
2690 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2692 LockBufHdr(buf);
2693 Assert(buf->flags & BM_IO_IN_PROGRESS);
2694 if (IsForInput)
2696 Assert(!(buf->flags & BM_DIRTY));
2697 /* We'd better not think buffer is valid yet */
2698 Assert(!(buf->flags & BM_VALID));
2699 UnlockBufHdr(buf);
2701 else
2703 BufFlags sv_flags;
2705 sv_flags = buf->flags;
2706 Assert(sv_flags & BM_DIRTY);
2707 UnlockBufHdr(buf);
2708 /* Issue notice if this is not the first failure... */
2709 if (sv_flags & BM_IO_ERROR)
2711 /* Buffer is pinned, so we can read tag without spinlock */
2712 char *path = relpath(buf->tag.rnode, buf->tag.forkNum);
2714 ereport(WARNING,
2715 (errcode(ERRCODE_IO_ERROR),
2716 errmsg("could not write block %u of %s",
2717 buf->tag.blockNum, path),
2718 errdetail("Multiple failures --- write error might be permanent.")));
2719 pfree(path);
2722 TerminateBufferIO(buf, false, BM_IO_ERROR);
2727 * Error context callback for errors occurring during buffer writes.
2729 static void
2730 buffer_write_error_callback(void *arg)
2732 volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
2734 /* Buffer is pinned, so we can read the tag without locking the spinlock */
2735 if (bufHdr != NULL)
2737 char *path = relpath(bufHdr->tag.rnode, bufHdr->tag.forkNum);
2739 errcontext("writing block %u of relation %s",
2740 bufHdr->tag.blockNum, path);
2741 pfree(path);