src/backend/storage/buffer/bufmgr.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * bufmgr.c
   4  *        buffer manager interface routines
   5  *
   6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL$
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 /*
  16  * Principal entry points:
  17  *
  18  * ReadBuffer() -- find or create a buffer holding the requested page,
  19  *              and pin it so that no one can destroy it while this process
  20  *              is using it.
  21  *
  22  * ReleaseBuffer() -- unpin a buffer
  23  *
  24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
  25  *              The disk write is delayed until buffer replacement or checkpoint.
  26  *
  27  * See also these files:
  28  *              freelist.c -- chooses victim for buffer replacement
  29  *              buf_table.c -- manages the buffer lookup table
  30  */
  31 #include "postgres.h"
  32
  33 #include <sys/file.h>
  34 #include <unistd.h>
  35
  36 #include "miscadmin.h"
  37 #include "pg_trace.h"
  38 #include "pgstat.h"
  39 #include "postmaster/bgwriter.h"
  40 #include "storage/buf_internals.h"
  41 #include "storage/bufmgr.h"
  42 #include "storage/ipc.h"
  43 #include "storage/proc.h"
  44 #include "storage/smgr.h"
  45 #include "utils/rel.h"
  46 #include "utils/resowner.h"
  47
  48
  49 /* Note: these two macros only work on shared buffers, not local ones! */
  50 #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  51 #define BufferGetLSN(bufHdr)    (*((XLogRecPtr*) BufHdrGetBlock(bufHdr)))
  52
  53 /* Note: this macro only works on local buffers, not shared ones! */
  54 #define LocalBufHdrGetBlock(bufHdr) \
  55         LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
  56
  57 /* Bits in SyncOneBuffer's return value */
  58 #define BUF_WRITTEN                             0x01
  59 #define BUF_REUSABLE                    0x02
  60
  61
  62 /* GUC variables */
  63 bool            zero_damaged_pages = false;
  64 int                     bgwriter_lru_maxpages = 100;
  65 double          bgwriter_lru_multiplier = 2.0;
  66
  67
  68 long            NDirectFileRead;        /* some I/O's are direct file access. bypass
  69                                                                  * bufmgr */
  70 long            NDirectFileWrite;       /* e.g., I/O in psort and hashjoin. */
  71
  72
  73 /* local state for StartBufferIO and related functions */
  74 static volatile BufferDesc *InProgressBuf = NULL;
  75 static bool IsForInput;
  76
  77 /* local state for LockBufferForCleanup */
  78 static volatile BufferDesc *PinCountWaitBuf = NULL;
  79
  80
  81 static Buffer ReadBuffer_relcache(Relation reln, ForkNumber forkNum,
  82                 BlockNumber blockNum, bool zeroPage, BufferAccessStrategy strategy);
  83 static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
  84                                   ForkNumber forkNum, BlockNumber blockNum,
  85                                   bool zeroPage, BufferAccessStrategy strategy, bool *hit);
  86 static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
  87 static void PinBuffer_Locked(volatile BufferDesc *buf);
  88 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
  89 static void BufferSync(int flags);
  90 static int      SyncOneBuffer(int buf_id, bool skip_recently_used);
  91 static void WaitIO(volatile BufferDesc *buf);
  92 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
  93 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
  94                                   int set_flag_bits);
  95 static void buffer_write_error_callback(void *arg);
  96 static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
  97                         BlockNumber blockNum,
  98                         BufferAccessStrategy strategy,
  99                         bool *foundPtr);
 100 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 101 static void AtProcExit_Buffers(int code, Datum arg);
 102
 103
 104 /*
 105  * ReadBuffer -- returns a buffer containing the requested
 106  *              block of the requested relation.  If the blknum
 107  *              requested is P_NEW, extend the relation file and
 108  *              allocate a new block.  (Caller is responsible for
 109  *              ensuring that only one backend tries to extend a
 110  *              relation at the same time!)
 111  *
 112  * Returns: the buffer number for the buffer containing
 113  *              the block read.  The returned buffer has been pinned.
 114  *              Does not return on error --- elog's instead.
 115  *
 116  * Assume when this function is called, that reln has been
 117  *              opened already.
 118  */
 119 Buffer
 120 ReadBuffer(Relation reln, BlockNumber blockNum)
 121 {
 122         return ReadBuffer_relcache(reln, MAIN_FORKNUM, blockNum, false, NULL);
 123 }
 124
 125 /*
 126  * ReadBufferWithFork -- same as ReadBuffer, but for accessing relation
 127  *              forks other than MAIN_FORKNUM.
 128  */
 129 Buffer
 130 ReadBufferWithFork(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 131 {
 132         return ReadBuffer_relcache(reln, forkNum, blockNum, false, NULL);
 133 }
 134
 135 /*
 136  * ReadBufferWithStrategy -- same as ReadBuffer, except caller can specify
 137  *              a nondefault buffer access strategy.  See buffer/README for details.
 138  */
 139 Buffer
 140 ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
 141                                            BufferAccessStrategy strategy)
 142 {
 143         return ReadBuffer_relcache(reln, MAIN_FORKNUM, blockNum, false, strategy);
 144 }
 145
 146 /*
 147  * ReadOrZeroBuffer -- like ReadBuffer, but if the page isn't in buffer
 148  *              cache already, it's filled with zeros instead of reading it from
 149  *              disk.  Useful when the caller intends to fill the page from scratch,
 150  *              since this saves I/O and avoids unnecessary failure if the
 151  *              page-on-disk has corrupt page headers.
 152  *
 153  *              Caution: do not use this to read a page that is beyond the relation's
 154  *              current physical EOF; that is likely to cause problems in md.c when
 155  *              the page is modified and written out.  P_NEW is OK, though.
 156  */
 157 Buffer
 158 ReadOrZeroBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 159 {
 160         return ReadBuffer_relcache(reln, forkNum, blockNum, true, NULL);
 161 }
 162
 163 /*
 164  * ReadBufferWithoutRelcache -- like ReadBuffer, but doesn't require a
 165  *              relcache entry for the relation. If zeroPage is true, this behaves
 166  *              like ReadOrZeroBuffer rather than ReadBuffer.
 167  */
 168 Buffer
 169 ReadBufferWithoutRelcache(RelFileNode rnode, bool isTemp,
 170                                           ForkNumber forkNum, BlockNumber blockNum, bool zeroPage)
 171 {
 172         bool hit;
 173
 174         SMgrRelation smgr = smgropen(rnode);
 175         return ReadBuffer_common(smgr, isTemp, forkNum, blockNum, zeroPage, NULL, &hit);
 176 }
 177
 178 /*
 179  * ReadBuffer_relcache -- common logic for ReadBuffer-variants that
 180  *              operate on a Relation.
 181  */
 182 static Buffer
 183 ReadBuffer_relcache(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
 184                                         bool zeroPage, BufferAccessStrategy strategy)
 185 {
 186         bool hit;
 187         Buffer buf;
 188
 189         /* Open it at the smgr level if not already done */
 190         RelationOpenSmgr(reln);
 191
 192         /*
 193          * Read the buffer, and update pgstat counters to reflect a cache
 194          * hit or miss.
 195          */
 196         pgstat_count_buffer_read(reln);
 197         buf = ReadBuffer_common(reln->rd_smgr, reln->rd_istemp, forkNum, blockNum,
 198                                                         zeroPage, strategy, &hit);
 199         if (hit)
 200                 pgstat_count_buffer_hit(reln);
 201         return buf;
 202 }
 203
 204 /*
 205  * ReadBuffer_common -- common logic for all ReadBuffer variants
 206  *
 207  * *hit is set to true if the request was satisfied from shared buffer cache.
 208  */
 209 static Buffer
 210 ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
 211                                   BlockNumber blockNum, bool zeroPage,
 212                                   BufferAccessStrategy strategy, bool *hit)
 213 {
 214         volatile BufferDesc *bufHdr;
 215         Block           bufBlock;
 216         bool            found;
 217         bool            isExtend;
 218
 219         *hit = false;
 220
 221         /* Make sure we will have room to remember the buffer pin */
 222         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 223
 224         isExtend = (blockNum == P_NEW);
 225
 226         /* Substitute proper block number if caller asked for P_NEW */
 227         if (isExtend)
 228                 blockNum = smgrnblocks(smgr, forkNum);
 229
 230         TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, smgr->smgr_rnode.spcNode,
 231                 smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode, isLocalBuf);
 232
 233         if (isLocalBuf)
 234         {
 235                 ReadLocalBufferCount++;
 236                 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
 237                 if (found)
 238                 {
 239                         LocalBufferHitCount++;
 240                         TRACE_POSTGRESQL_BUFFER_HIT(true); /* true == local buffer */
 241                 }
 242                 else
 243                 {
 244                         TRACE_POSTGRESQL_BUFFER_MISS(true); /* ditto */
 245                 }
 246         }
 247         else
 248         {
 249                 ReadBufferCount++;
 250
 251                 /*
 252                  * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
 253                  * not currently in memory.
 254                  */
 255                 bufHdr = BufferAlloc(smgr, forkNum, blockNum, strategy, &found);
 256                 if (found)
 257                 {
 258                         BufferHitCount++;
 259                         TRACE_POSTGRESQL_BUFFER_HIT(false); /* false != local buffer */
 260                 }
 261                 else
 262                 {
 263                         TRACE_POSTGRESQL_BUFFER_MISS(false); /* ditto */
 264                 }
 265         }
 266
 267         /* At this point we do NOT hold any locks. */
 268
 269         /* if it was already in the buffer pool, we're done */
 270         if (found)
 271         {
 272                 if (!isExtend)
 273                 {
 274                         /* Just need to update stats before we exit */
 275                         *hit = true;
 276
 277                         if (VacuumCostActive)
 278                                 VacuumCostBalance += VacuumCostPageHit;
 279
 280                         TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum,
 281                                 smgr->smgr_rnode.spcNode,
 282                                 smgr->smgr_rnode.dbNode,
 283                                 smgr->smgr_rnode.relNode, isLocalBuf, found);
 284
 285                         return BufferDescriptorGetBuffer(bufHdr);
 286                 }
 287
 288                 /*
 289                  * We get here only in the corner case where we are trying to extend
 290                  * the relation but we found a pre-existing buffer marked BM_VALID.
 291                  * This can happen because mdread doesn't complain about reads beyond
 292                  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
 293                  * read a block beyond EOF could have left a "valid" zero-filled
 294                  * buffer.      Unfortunately, we have also seen this case occurring
 295                  * because of buggy Linux kernels that sometimes return an
 296                  * lseek(SEEK_END) result that doesn't account for a recent write. In
 297                  * that situation, the pre-existing buffer would contain valid data
 298                  * that we don't want to overwrite.  Since the legitimate case should
 299                  * always have left a zero-filled buffer, complain if not PageIsNew.
 300                  */
 301                 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 302                 if (!PageIsNew((Page) bufBlock))
 303                         ereport(ERROR,
 304                                         (errmsg("unexpected data beyond EOF in block %u of relation %u/%u/%u",
 305                                                         blockNum, smgr->smgr_rnode.spcNode, smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode),
 306                                          errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
 307
 308                 /*
 309                  * We *must* do smgrextend before succeeding, else the page will not
 310                  * be reserved by the kernel, and the next P_NEW call will decide to
 311                  * return the same page.  Clear the BM_VALID bit, do the StartBufferIO
 312                  * call that BufferAlloc didn't, and proceed.
 313                  */
 314                 if (isLocalBuf)
 315                 {
 316                         /* Only need to adjust flags */
 317                         Assert(bufHdr->flags & BM_VALID);
 318                         bufHdr->flags &= ~BM_VALID;
 319                 }
 320                 else
 321                 {
 322                         /*
 323                          * Loop to handle the very small possibility that someone re-sets
 324                          * BM_VALID between our clearing it and StartBufferIO inspecting
 325                          * it.
 326                          */
 327                         do
 328                         {
 329                                 LockBufHdr(bufHdr);
 330                                 Assert(bufHdr->flags & BM_VALID);
 331                                 bufHdr->flags &= ~BM_VALID;
 332                                 UnlockBufHdr(bufHdr);
 333                         } while (!StartBufferIO(bufHdr, true));
 334                 }
 335         }
 336
 337         /*
 338          * if we have gotten to this point, we have allocated a buffer for the
 339          * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
 340          * if it's a shared buffer.
 341          *
 342          * Note: if smgrextend fails, we will end up with a buffer that is
 343          * allocated but not marked BM_VALID.  P_NEW will still select the same
 344          * block number (because the relation didn't get any longer on disk) and
 345          * so future attempts to extend the relation will find the same buffer (if
 346          * it's not been recycled) but come right back here to try smgrextend
 347          * again.
 348          */
 349         Assert(!(bufHdr->flags & BM_VALID));            /* spinlock not needed */
 350
 351         bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 352
 353         if (isExtend)
 354         {
 355                 /* new buffers are zero-filled */
 356                 MemSet((char *) bufBlock, 0, BLCKSZ);
 357                 smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, isLocalBuf);
 358         }
 359         else
 360         {
 361                 /*
 362                  * Read in the page, unless the caller intends to overwrite it and
 363                  * just wants us to allocate a buffer.
 364                  */
 365                 if (zeroPage)
 366                         MemSet((char *) bufBlock, 0, BLCKSZ);
 367                 else
 368                 {
 369                         smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
 370
 371                         /* check for garbage data */
 372                         if (!PageHeaderIsValid((PageHeader) bufBlock))
 373                         {
 374                                 if (zero_damaged_pages)
 375                                 {
 376                                         ereport(WARNING,
 377                                                         (errcode(ERRCODE_DATA_CORRUPTED),
 378                                                          errmsg("invalid page header in block %u of relation %u/%u/%u; zeroing out page",
 379                                                                         blockNum,
 380                                                                         smgr->smgr_rnode.spcNode,
 381                                                                         smgr->smgr_rnode.dbNode,
 382                                                                         smgr->smgr_rnode.relNode)));
 383                                         MemSet((char *) bufBlock, 0, BLCKSZ);
 384                                 }
 385                                 else
 386                                         ereport(ERROR,
 387                                                         (errcode(ERRCODE_DATA_CORRUPTED),
 388                                                          errmsg("invalid page header in block %u of relation %u/%u/%u",
 389                                                                         blockNum, smgr->smgr_rnode.spcNode,
 390                                                                         smgr->smgr_rnode.dbNode,
 391                                                                         smgr->smgr_rnode.relNode)));
 392                         }
 393                 }
 394         }
 395
 396         if (isLocalBuf)
 397         {
 398                 /* Only need to adjust flags */
 399                 bufHdr->flags |= BM_VALID;
 400         }
 401         else
 402         {
 403                 /* Set BM_VALID, terminate IO, and wake up any waiters */
 404                 TerminateBufferIO(bufHdr, false, BM_VALID);
 405         }
 406
 407         if (VacuumCostActive)
 408                 VacuumCostBalance += VacuumCostPageMiss;
 409
 410         TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, smgr->smgr_rnode.spcNode,
 411                         smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode,
 412                         isLocalBuf, found);
 413
 414         return BufferDescriptorGetBuffer(bufHdr);
 415 }
 416
 417 /*
 418  * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
 419  *              buffer.  If no buffer exists already, selects a replacement
 420  *              victim and evicts the old page, but does NOT read in new page.
 421  *
 422  * "strategy" can be a buffer replacement strategy object, or NULL for
 423  * the default strategy.  The selected buffer's usage_count is advanced when
 424  * using the default strategy, but otherwise possibly not (see PinBuffer).
 425  *
 426  * The returned buffer is pinned and is already marked as holding the
 427  * desired page.  If it already did have the desired page, *foundPtr is
 428  * set TRUE.  Otherwise, *foundPtr is set FALSE and the buffer is marked
 429  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
 430  *
 431  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
 432  * we keep it for simplicity in ReadBuffer.
 433  *
 434  * No locks are held either at entry or exit.
 435  */
 436 static volatile BufferDesc *
 437 BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
 438                         BlockNumber blockNum,
 439                         BufferAccessStrategy strategy,
 440                         bool *foundPtr)
 441 {
 442         BufferTag       newTag;                 /* identity of requested block */
 443         uint32          newHash;                /* hash value for newTag */
 444         LWLockId        newPartitionLock;               /* buffer partition lock for it */
 445         BufferTag       oldTag;                 /* previous identity of selected buffer */
 446         uint32          oldHash;                /* hash value for oldTag */
 447         LWLockId        oldPartitionLock;               /* buffer partition lock for it */
 448         BufFlags        oldFlags;
 449         int                     buf_id;
 450         volatile BufferDesc *buf;
 451         bool            valid;
 452
 453         /* create a tag so we can lookup the buffer */
 454         INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
 455
 456         /* determine its hash code and partition lock ID */
 457         newHash = BufTableHashCode(&newTag);
 458         newPartitionLock = BufMappingPartitionLock(newHash);
 459
 460         /* see if the block is in the buffer pool already */
 461         LWLockAcquire(newPartitionLock, LW_SHARED);
 462         buf_id = BufTableLookup(&newTag, newHash);
 463         if (buf_id >= 0)
 464         {
 465                 /*
 466                  * Found it.  Now, pin the buffer so no one can steal it from the
 467                  * buffer pool, and check to see if the correct data has been loaded
 468                  * into the buffer.
 469                  */
 470                 buf = &BufferDescriptors[buf_id];
 471
 472                 valid = PinBuffer(buf, strategy);
 473
 474                 /* Can release the mapping lock as soon as we've pinned it */
 475                 LWLockRelease(newPartitionLock);
 476
 477                 *foundPtr = TRUE;
 478
 479                 if (!valid)
 480                 {
 481                         /*
 482                          * We can only get here if (a) someone else is still reading in
 483                          * the page, or (b) a previous read attempt failed.  We have to
 484                          * wait for any active read attempt to finish, and then set up our
 485                          * own read attempt if the page is still not BM_VALID.
 486                          * StartBufferIO does it all.
 487                          */
 488                         if (StartBufferIO(buf, true))
 489                         {
 490                                 /*
 491                                  * If we get here, previous attempts to read the buffer must
 492                                  * have failed ... but we shall bravely try again.
 493                                  */
 494                                 *foundPtr = FALSE;
 495                         }
 496                 }
 497
 498                 return buf;
 499         }
 500
 501         /*
 502          * Didn't find it in the buffer pool.  We'll have to initialize a new
 503          * buffer.      Remember to unlock the mapping lock while doing the work.
 504          */
 505         LWLockRelease(newPartitionLock);
 506
 507         /* Loop here in case we have to try another victim buffer */
 508         for (;;)
 509         {
 510                 bool            lock_held;
 511
 512                 /*
 513                  * Select a victim buffer.      The buffer is returned with its header
 514                  * spinlock still held!  Also (in most cases) the BufFreelistLock is
 515                  * still held, since it would be bad to hold the spinlock while
 516                  * possibly waking up other processes.
 517                  */
 518                 buf = StrategyGetBuffer(strategy, &lock_held);
 519
 520                 Assert(buf->refcount == 0);
 521
 522                 /* Must copy buffer flags while we still hold the spinlock */
 523                 oldFlags = buf->flags;
 524
 525                 /* Pin the buffer and then release the buffer spinlock */
 526                 PinBuffer_Locked(buf);
 527
 528                 /* Now it's safe to release the freelist lock */
 529                 if (lock_held)
 530                         LWLockRelease(BufFreelistLock);
 531
 532                 /*
 533                  * If the buffer was dirty, try to write it out.  There is a race
 534                  * condition here, in that someone might dirty it after we released it
 535                  * above, or even while we are writing it out (since our share-lock
 536                  * won't prevent hint-bit updates).  We will recheck the dirty bit
 537                  * after re-locking the buffer header.
 538                  */
 539                 if (oldFlags & BM_DIRTY)
 540                 {
 541                         /*
 542                          * We need a share-lock on the buffer contents to write it out
 543                          * (else we might write invalid data, eg because someone else is
 544                          * compacting the page contents while we write).  We must use a
 545                          * conditional lock acquisition here to avoid deadlock.  Even
 546                          * though the buffer was not pinned (and therefore surely not
 547                          * locked) when StrategyGetBuffer returned it, someone else could
 548                          * have pinned and exclusive-locked it by the time we get here. If
 549                          * we try to get the lock unconditionally, we'd block waiting for
 550                          * them; if they later block waiting for us, deadlock ensues.
 551                          * (This has been observed to happen when two backends are both
 552                          * trying to split btree index pages, and the second one just
 553                          * happens to be trying to split the page the first one got from
 554                          * StrategyGetBuffer.)
 555                          */
 556                         if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
 557                         {
 558                                 /*
 559                                  * If using a nondefault strategy, and writing the buffer
 560                                  * would require a WAL flush, let the strategy decide whether
 561                                  * to go ahead and write/reuse the buffer or to choose another
 562                                  * victim.      We need lock to inspect the page LSN, so this
 563                                  * can't be done inside StrategyGetBuffer.
 564                                  */
 565                                 if (strategy != NULL &&
 566                                         XLogNeedsFlush(BufferGetLSN(buf)) &&
 567                                         StrategyRejectBuffer(strategy, buf))
 568                                 {
 569                                         /* Drop lock/pin and loop around for another buffer */
 570                                         LWLockRelease(buf->content_lock);
 571                                         UnpinBuffer(buf, true);
 572                                         continue;
 573                                 }
 574
 575                                 /* OK, do the I/O */
 576                                 FlushBuffer(buf, NULL);
 577                                 LWLockRelease(buf->content_lock);
 578                         }
 579                         else
 580                         {
 581                                 /*
 582                                  * Someone else has locked the buffer, so give it up and loop
 583                                  * back to get another one.
 584                                  */
 585                                 UnpinBuffer(buf, true);
 586                                 continue;
 587                         }
 588                 }
 589
 590                 /*
 591                  * To change the association of a valid buffer, we'll need to have
 592                  * exclusive lock on both the old and new mapping partitions.
 593                  */
 594                 if (oldFlags & BM_TAG_VALID)
 595                 {
 596                         /*
 597                          * Need to compute the old tag's hashcode and partition lock ID.
 598                          * XXX is it worth storing the hashcode in BufferDesc so we need
 599                          * not recompute it here?  Probably not.
 600                          */
 601                         oldTag = buf->tag;
 602                         oldHash = BufTableHashCode(&oldTag);
 603                         oldPartitionLock = BufMappingPartitionLock(oldHash);
 604
 605                         /*
 606                          * Must lock the lower-numbered partition first to avoid
 607                          * deadlocks.
 608                          */
 609                         if (oldPartitionLock < newPartitionLock)
 610                         {
 611                                 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 612                                 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 613                         }
 614                         else if (oldPartitionLock > newPartitionLock)
 615                         {
 616                                 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 617                                 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 618                         }
 619                         else
 620                         {
 621                                 /* only one partition, only one lock */
 622                                 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 623                         }
 624                 }
 625                 else
 626                 {
 627                         /* if it wasn't valid, we need only the new partition */
 628                         LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 629                         /* these just keep the compiler quiet about uninit variables */
 630                         oldHash = 0;
 631                         oldPartitionLock = 0;
 632                 }
 633
 634                 /*
 635                  * Try to make a hashtable entry for the buffer under its new tag.
 636                  * This could fail because while we were writing someone else
 637                  * allocated another buffer for the same block we want to read in.
 638                  * Note that we have not yet removed the hashtable entry for the old
 639                  * tag.
 640                  */
 641                 buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
 642
 643                 if (buf_id >= 0)
 644                 {
 645                         /*
 646                          * Got a collision. Someone has already done what we were about to
 647                          * do. We'll just handle this as if it were found in the buffer
 648                          * pool in the first place.  First, give up the buffer we were
 649                          * planning to use.
 650                          */
 651                         UnpinBuffer(buf, true);
 652
 653                         /* Can give up that buffer's mapping partition lock now */
 654                         if ((oldFlags & BM_TAG_VALID) &&
 655                                 oldPartitionLock != newPartitionLock)
 656                                 LWLockRelease(oldPartitionLock);
 657
 658                         /* remaining code should match code at top of routine */
 659
 660                         buf = &BufferDescriptors[buf_id];
 661
 662                         valid = PinBuffer(buf, strategy);
 663
 664                         /* Can release the mapping lock as soon as we've pinned it */
 665                         LWLockRelease(newPartitionLock);
 666
 667                         *foundPtr = TRUE;
 668
 669                         if (!valid)
 670                         {
 671                                 /*
 672                                  * We can only get here if (a) someone else is still reading
 673                                  * in the page, or (b) a previous read attempt failed.  We
 674                                  * have to wait for any active read attempt to finish, and
 675                                  * then set up our own read attempt if the page is still not
 676                                  * BM_VALID.  StartBufferIO does it all.
 677                                  */
 678                                 if (StartBufferIO(buf, true))
 679                                 {
 680                                         /*
 681                                          * If we get here, previous attempts to read the buffer
 682                                          * must have failed ... but we shall bravely try again.
 683                                          */
 684                                         *foundPtr = FALSE;
 685                                 }
 686                         }
 687
 688                         return buf;
 689                 }
 690
 691                 /*
 692                  * Need to lock the buffer header too in order to change its tag.
 693                  */
 694                 LockBufHdr(buf);
 695
 696                 /*
 697                  * Somebody could have pinned or re-dirtied the buffer while we were
 698                  * doing the I/O and making the new hashtable entry.  If so, we can't
 699                  * recycle this buffer; we must undo everything we've done and start
 700                  * over with a new victim buffer.
 701                  */
 702                 oldFlags = buf->flags;
 703                 if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
 704                         break;
 705
 706                 UnlockBufHdr(buf);
 707                 BufTableDelete(&newTag, newHash);
 708                 if ((oldFlags & BM_TAG_VALID) &&
 709                         oldPartitionLock != newPartitionLock)
 710                         LWLockRelease(oldPartitionLock);
 711                 LWLockRelease(newPartitionLock);
 712                 UnpinBuffer(buf, true);
 713         }
 714
 715         /*
 716          * Okay, it's finally safe to rename the buffer.
 717          *
 718          * Clearing BM_VALID here is necessary, clearing the dirtybits is just
 719          * paranoia.  We also reset the usage_count since any recency of use of
 720          * the old content is no longer relevant.  (The usage_count starts out at
 721          * 1 so that the buffer can survive one clock-sweep pass.)
 722          */
 723         buf->tag = newTag;
 724         buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
 725         buf->flags |= BM_TAG_VALID;
 726         buf->usage_count = 1;
 727
 728         UnlockBufHdr(buf);
 729
 730         if (oldFlags & BM_TAG_VALID)
 731         {
 732                 BufTableDelete(&oldTag, oldHash);
 733                 if (oldPartitionLock != newPartitionLock)
 734                         LWLockRelease(oldPartitionLock);
 735         }
 736
 737         LWLockRelease(newPartitionLock);
 738
 739         /*
 740          * Buffer contents are currently invalid.  Try to get the io_in_progress
 741          * lock.  If StartBufferIO returns false, then someone else managed to
 742          * read it before we did, so there's nothing left for BufferAlloc() to do.
 743          */
 744         if (StartBufferIO(buf, true))
 745                 *foundPtr = FALSE;
 746         else
 747                 *foundPtr = TRUE;
 748
 749         return buf;
 750 }
 751
 752 /*
 753  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
 754  * freelist.
 755  *
 756  * The buffer header spinlock must be held at entry.  We drop it before
 757  * returning.  (This is sane because the caller must have locked the
 758  * buffer in order to be sure it should be dropped.)
 759  *
 760  * This is used only in contexts such as dropping a relation.  We assume
 761  * that no other backend could possibly be interested in using the page,
 762  * so the only reason the buffer might be pinned is if someone else is
 763  * trying to write it out.      We have to let them finish before we can
 764  * reclaim the buffer.
 765  *
 766  * The buffer could get reclaimed by someone else while we are waiting
 767  * to acquire the necessary locks; if so, don't mess it up.
 768  */
 769 static void
 770 InvalidateBuffer(volatile BufferDesc *buf)
 771 {
 772         BufferTag       oldTag;
 773         uint32          oldHash;                /* hash value for oldTag */
 774         LWLockId        oldPartitionLock;               /* buffer partition lock for it */
 775         BufFlags        oldFlags;
 776
 777         /* Save the original buffer tag before dropping the spinlock */
 778         oldTag = buf->tag;
 779
 780         UnlockBufHdr(buf);
 781
 782         /*
 783          * Need to compute the old tag's hashcode and partition lock ID. XXX is it
 784          * worth storing the hashcode in BufferDesc so we need not recompute it
 785          * here?  Probably not.
 786          */
 787         oldHash = BufTableHashCode(&oldTag);
 788         oldPartitionLock = BufMappingPartitionLock(oldHash);
 789
 790 retry:
 791
 792         /*
 793          * Acquire exclusive mapping lock in preparation for changing the buffer's
 794          * association.
 795          */
 796         LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 797
 798         /* Re-lock the buffer header */
 799         LockBufHdr(buf);
 800
 801         /* If it's changed while we were waiting for lock, do nothing */
 802         if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
 803         {
 804                 UnlockBufHdr(buf);
 805                 LWLockRelease(oldPartitionLock);
 806                 return;
 807         }
 808
 809         /*
 810          * We assume the only reason for it to be pinned is that someone else is
 811          * flushing the page out.  Wait for them to finish.  (This could be an
 812          * infinite loop if the refcount is messed up... it would be nice to time
 813          * out after awhile, but there seems no way to be sure how many loops may
 814          * be needed.  Note that if the other guy has pinned the buffer but not
 815          * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 816          * be busy-looping here.)
 817          */
 818         if (buf->refcount != 0)
 819         {
 820                 UnlockBufHdr(buf);
 821                 LWLockRelease(oldPartitionLock);
 822                 /* safety check: should definitely not be our *own* pin */
 823                 if (PrivateRefCount[buf->buf_id] != 0)
 824                         elog(ERROR, "buffer is pinned in InvalidateBuffer");
 825                 WaitIO(buf);
 826                 goto retry;
 827         }
 828
 829         /*
 830          * Clear out the buffer's tag and flags.  We must do this to ensure that
 831          * linear scans of the buffer array don't think the buffer is valid.
 832          */
 833         oldFlags = buf->flags;
 834         CLEAR_BUFFERTAG(buf->tag);
 835         buf->flags = 0;
 836         buf->usage_count = 0;
 837
 838         UnlockBufHdr(buf);
 839
 840         /*
 841          * Remove the buffer from the lookup hashtable, if it was in there.
 842          */
 843         if (oldFlags & BM_TAG_VALID)
 844                 BufTableDelete(&oldTag, oldHash);
 845
 846         /*
 847          * Done with mapping lock.
 848          */
 849         LWLockRelease(oldPartitionLock);
 850
 851         /*
 852          * Insert the buffer at the head of the list of free buffers.
 853          */
 854         StrategyFreeBuffer(buf);
 855 }
 856
 857 /*
 858  * MarkBufferDirty
 859  *
 860  *              Marks buffer contents as dirty (actual write happens later).
 861  *
 862  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
 863  * exclusive lock, then somebody could be in process of writing the buffer,
 864  * leading to risk of bad data written to disk.)
 865  */
 866 void
 867 MarkBufferDirty(Buffer buffer)
 868 {
 869         volatile BufferDesc *bufHdr;
 870
 871         if (!BufferIsValid(buffer))
 872                 elog(ERROR, "bad buffer id: %d", buffer);
 873
 874         if (BufferIsLocal(buffer))
 875         {
 876                 MarkLocalBufferDirty(buffer);
 877                 return;
 878         }
 879
 880         bufHdr = &BufferDescriptors[buffer - 1];
 881
 882         Assert(PrivateRefCount[buffer - 1] > 0);
 883         /* unfortunately we can't check if the lock is held exclusively */
 884         Assert(LWLockHeldByMe(bufHdr->content_lock));
 885
 886         LockBufHdr(bufHdr);
 887
 888         Assert(bufHdr->refcount > 0);
 889
 890         /*
 891          * If the buffer was not dirty already, do vacuum cost accounting.
 892          */
 893         if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
 894                 VacuumCostBalance += VacuumCostPageDirty;
 895
 896         bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
 897
 898         UnlockBufHdr(bufHdr);
 899 }
 900
 901 /*
 902  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
 903  *
 904  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
 905  * compared to calling the two routines separately.  Now it's mainly just
 906  * a convenience function.      However, if the passed buffer is valid and
 907  * already contains the desired block, we just return it as-is; and that
 908  * does save considerable work compared to a full release and reacquire.
 909  *
 910  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
 911  * buffer actually needs to be released.  This case is the same as ReadBuffer,
 912  * but can save some tests in the caller.
 913  */
 914 Buffer
 915 ReleaseAndReadBuffer(Buffer buffer,
 916                                          Relation relation,
 917                                          BlockNumber blockNum)
 918 {
 919         ForkNumber forkNum = MAIN_FORKNUM;
 920         volatile BufferDesc *bufHdr;
 921
 922         if (BufferIsValid(buffer))
 923         {
 924                 if (BufferIsLocal(buffer))
 925                 {
 926                         Assert(LocalRefCount[-buffer - 1] > 0);
 927                         bufHdr = &LocalBufferDescriptors[-buffer - 1];
 928                         if (bufHdr->tag.blockNum == blockNum &&
 929                                 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
 930                                 bufHdr->tag.forkNum == forkNum)
 931                                 return buffer;
 932                         ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
 933                         LocalRefCount[-buffer - 1]--;
 934                 }
 935                 else
 936                 {
 937                         Assert(PrivateRefCount[buffer - 1] > 0);
 938                         bufHdr = &BufferDescriptors[buffer - 1];
 939                         /* we have pin, so it's ok to examine tag without spinlock */
 940                         if (bufHdr->tag.blockNum == blockNum &&
 941                                 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
 942                                 bufHdr->tag.forkNum == forkNum)
 943                                 return buffer;
 944                         UnpinBuffer(bufHdr, true);
 945                 }
 946         }
 947
 948         return ReadBuffer(relation, blockNum);
 949 }
 950
 951 /*
 952  * PinBuffer -- make buffer unavailable for replacement.
 953  *
 954  * For the default access strategy, the buffer's usage_count is incremented
 955  * when we first pin it; for other strategies we just make sure the usage_count
 956  * isn't zero.  (The idea of the latter is that we don't want synchronized
 957  * heap scans to inflate the count, but we need it to not be zero to discourage
 958  * other backends from stealing buffers from our ring.  As long as we cycle
 959  * through the ring faster than the global clock-sweep cycles, buffers in
 960  * our ring won't be chosen as victims for replacement by other backends.)
 961  *
 962  * This should be applied only to shared buffers, never local ones.
 963  *
 964  * Note that ResourceOwnerEnlargeBuffers must have been done already.
 965  *
 966  * Returns TRUE if buffer is BM_VALID, else FALSE.      This provision allows
 967  * some callers to avoid an extra spinlock cycle.
 968  */
 969 static bool
 970 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 971 {
 972         int                     b = buf->buf_id;
 973         bool            result;
 974
 975         if (PrivateRefCount[b] == 0)
 976         {
 977                 LockBufHdr(buf);
 978                 buf->refcount++;
 979                 if (strategy == NULL)
 980                 {
 981                         if (buf->usage_count < BM_MAX_USAGE_COUNT)
 982                                 buf->usage_count++;
 983                 }
 984                 else
 985                 {
 986                         if (buf->usage_count == 0)
 987                                 buf->usage_count = 1;
 988                 }
 989                 result = (buf->flags & BM_VALID) != 0;
 990                 UnlockBufHdr(buf);
 991         }
 992         else
 993         {
 994                 /* If we previously pinned the buffer, it must surely be valid */
 995                 result = true;
 996         }
 997         PrivateRefCount[b]++;
 998         Assert(PrivateRefCount[b] > 0);
 999         ResourceOwnerRememberBuffer(CurrentResourceOwner,
1000                                                                 BufferDescriptorGetBuffer(buf));
1001         return result;
1002 }
1003
1004 /*
1005  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1006  * The spinlock is released before return.
1007  *
1008  * Currently, no callers of this function want to modify the buffer's
1009  * usage_count at all, so there's no need for a strategy parameter.
1010  * Also we don't bother with a BM_VALID test (the caller could check that for
1011  * itself).
1012  *
1013  * Note: use of this routine is frequently mandatory, not just an optimization
1014  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1015  * its state can change under us.
1016  */
1017 static void
1018 PinBuffer_Locked(volatile BufferDesc *buf)
1019 {
1020         int                     b = buf->buf_id;
1021
1022         if (PrivateRefCount[b] == 0)
1023                 buf->refcount++;
1024         UnlockBufHdr(buf);
1025         PrivateRefCount[b]++;
1026         Assert(PrivateRefCount[b] > 0);
1027         ResourceOwnerRememberBuffer(CurrentResourceOwner,
1028                                                                 BufferDescriptorGetBuffer(buf));
1029 }
1030
1031 /*
1032  * UnpinBuffer -- make buffer available for replacement.
1033  *
1034  * This should be applied only to shared buffers, never local ones.
1035  *
1036  * Most but not all callers want CurrentResourceOwner to be adjusted.
1037  * Those that don't should pass fixOwner = FALSE.
1038  */
1039 static void
1040 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
1041 {
1042         int                     b = buf->buf_id;
1043
1044         if (fixOwner)
1045                 ResourceOwnerForgetBuffer(CurrentResourceOwner,
1046                                                                   BufferDescriptorGetBuffer(buf));
1047
1048         Assert(PrivateRefCount[b] > 0);
1049         PrivateRefCount[b]--;
1050         if (PrivateRefCount[b] == 0)
1051         {
1052                 /* I'd better not still hold any locks on the buffer */
1053                 Assert(!LWLockHeldByMe(buf->content_lock));
1054                 Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
1055
1056                 LockBufHdr(buf);
1057
1058                 /* Decrement the shared reference count */
1059                 Assert(buf->refcount > 0);
1060                 buf->refcount--;
1061
1062                 /* Support LockBufferForCleanup() */
1063                 if ((buf->flags & BM_PIN_COUNT_WAITER) &&
1064                         buf->refcount == 1)
1065                 {
1066                         /* we just released the last pin other than the waiter's */
1067                         int                     wait_backend_pid = buf->wait_backend_pid;
1068
1069                         buf->flags &= ~BM_PIN_COUNT_WAITER;
1070                         UnlockBufHdr(buf);
1071                         ProcSendSignal(wait_backend_pid);
1072                 }
1073                 else
1074                         UnlockBufHdr(buf);
1075         }
1076 }
1077
1078 /*
1079  * BufferSync -- Write out all dirty buffers in the pool.
1080  *
1081  * This is called at checkpoint time to write out all dirty shared buffers.
1082  * The checkpoint request flags should be passed in; currently the only one
1083  * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
1084  */
1085 static void
1086 BufferSync(int flags)
1087 {
1088         int                     buf_id;
1089         int                     num_to_scan;
1090         int                     num_to_write;
1091         int                     num_written;
1092
1093         /* Make sure we can handle the pin inside SyncOneBuffer */
1094         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1095
1096         /*
1097          * Loop over all buffers, and mark the ones that need to be written with
1098          * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_write), so that we
1099          * can estimate how much work needs to be done.
1100          *
1101          * This allows us to write only those pages that were dirty when the
1102          * checkpoint began, and not those that get dirtied while it proceeds.
1103          * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1104          * later in this function, or by normal backends or the bgwriter cleaning
1105          * scan, the flag is cleared.  Any buffer dirtied after this point won't
1106          * have the flag set.
1107          *
1108          * Note that if we fail to write some buffer, we may leave buffers with
1109          * BM_CHECKPOINT_NEEDED still set.      This is OK since any such buffer would
1110          * certainly need to be written for the next checkpoint attempt, too.
1111          */
1112         num_to_write = 0;
1113         for (buf_id = 0; buf_id < NBuffers; buf_id++)
1114         {
1115                 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1116
1117                 /*
1118                  * Header spinlock is enough to examine BM_DIRTY, see comment in
1119                  * SyncOneBuffer.
1120                  */
1121                 LockBufHdr(bufHdr);
1122
1123                 if (bufHdr->flags & BM_DIRTY)
1124                 {
1125                         bufHdr->flags |= BM_CHECKPOINT_NEEDED;
1126                         num_to_write++;
1127                 }
1128
1129                 UnlockBufHdr(bufHdr);
1130         }
1131
1132         if (num_to_write == 0)
1133                 return;                                 /* nothing to do */
1134
1135         TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
1136
1137         /*
1138          * Loop over all buffers again, and write the ones (still) marked with
1139          * BM_CHECKPOINT_NEEDED.  In this loop, we start at the clock sweep point
1140          * since we might as well dump soon-to-be-recycled buffers first.
1141          *
1142          * Note that we don't read the buffer alloc count here --- that should be
1143          * left untouched till the next BgBufferSync() call.
1144          */
1145         buf_id = StrategySyncStart(NULL, NULL);
1146         num_to_scan = NBuffers;
1147         num_written = 0;
1148         while (num_to_scan-- > 0)
1149         {
1150                 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1151
1152                 /*
1153                  * We don't need to acquire the lock here, because we're only looking
1154                  * at a single bit. It's possible that someone else writes the buffer
1155                  * and clears the flag right after we check, but that doesn't matter
1156                  * since SyncOneBuffer will then do nothing.  However, there is a
1157                  * further race condition: it's conceivable that between the time we
1158                  * examine the bit here and the time SyncOneBuffer acquires lock,
1159                  * someone else not only wrote the buffer but replaced it with another
1160                  * page and dirtied it.  In that improbable case, SyncOneBuffer will
1161                  * write the buffer though we didn't need to.  It doesn't seem worth
1162                  * guarding against this, though.
1163                  */
1164                 if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
1165                 {
1166                         if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
1167                         {
1168                                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1169                                 BgWriterStats.m_buf_written_checkpoints++;
1170                                 num_written++;
1171
1172                                 /*
1173                                  * We know there are at most num_to_write buffers with
1174                                  * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
1175                                  * num_written reaches num_to_write.
1176                                  *
1177                                  * Note that num_written doesn't include buffers written by
1178                                  * other backends, or by the bgwriter cleaning scan. That
1179                                  * means that the estimate of how much progress we've made is
1180                                  * conservative, and also that this test will often fail to
1181                                  * trigger.  But it seems worth making anyway.
1182                                  */
1183                                 if (num_written >= num_to_write)
1184                                         break;
1185
1186                                 /*
1187                                  * Perform normal bgwriter duties and sleep to throttle our
1188                                  * I/O rate.
1189                                  */
1190                                 CheckpointWriteDelay(flags,
1191                                                                          (double) num_written / num_to_write);
1192                         }
1193                 }
1194
1195                 if (++buf_id >= NBuffers)
1196                         buf_id = 0;
1197         }
1198
1199         TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
1200
1201         /*
1202          * Update checkpoint statistics. As noted above, this doesn't include
1203          * buffers written by other backends or bgwriter scan.
1204          */
1205         CheckpointStats.ckpt_bufs_written += num_written;
1206 }
1207
1208 /*
1209  * BgBufferSync -- Write out some dirty buffers in the pool.
1210  *
1211  * This is called periodically by the background writer process.
1212  */
1213 void
1214 BgBufferSync(void)
1215 {
1216         /* info obtained from freelist.c */
1217         int                     strategy_buf_id;
1218         uint32          strategy_passes;
1219         uint32          recent_alloc;
1220
1221         /*
1222          * Information saved between calls so we can determine the strategy
1223          * point's advance rate and avoid scanning already-cleaned buffers.
1224          */
1225         static bool saved_info_valid = false;
1226         static int      prev_strategy_buf_id;
1227         static uint32 prev_strategy_passes;
1228         static int      next_to_clean;
1229         static uint32 next_passes;
1230
1231         /* Moving averages of allocation rate and clean-buffer density */
1232         static float smoothed_alloc = 0;
1233         static float smoothed_density = 10.0;
1234
1235         /* Potentially these could be tunables, but for now, not */
1236         float           smoothing_samples = 16;
1237         float           scan_whole_pool_milliseconds = 120000.0;
1238
1239         /* Used to compute how far we scan ahead */
1240         long            strategy_delta;
1241         int                     bufs_to_lap;
1242         int                     bufs_ahead;
1243         float           scans_per_alloc;
1244         int                     reusable_buffers_est;
1245         int                     upcoming_alloc_est;
1246         int                     min_scan_buffers;
1247
1248         /* Variables for the scanning loop proper */
1249         int                     num_to_scan;
1250         int                     num_written;
1251         int                     reusable_buffers;
1252
1253         /*
1254          * Find out where the freelist clock sweep currently is, and how many
1255          * buffer allocations have happened since our last call.
1256          */
1257         strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
1258
1259         /* Report buffer alloc counts to pgstat */
1260         BgWriterStats.m_buf_alloc += recent_alloc;
1261
1262         /*
1263          * If we're not running the LRU scan, just stop after doing the stats
1264          * stuff.  We mark the saved state invalid so that we can recover sanely
1265          * if LRU scan is turned back on later.
1266          */
1267         if (bgwriter_lru_maxpages <= 0)
1268         {
1269                 saved_info_valid = false;
1270                 return;
1271         }
1272
1273         /*
1274          * Compute strategy_delta = how many buffers have been scanned by the
1275          * clock sweep since last time.  If first time through, assume none. Then
1276          * see if we are still ahead of the clock sweep, and if so, how many
1277          * buffers we could scan before we'd catch up with it and "lap" it. Note:
1278          * weird-looking coding of xxx_passes comparisons are to avoid bogus
1279          * behavior when the passes counts wrap around.
1280          */
1281         if (saved_info_valid)
1282         {
1283                 int32           passes_delta = strategy_passes - prev_strategy_passes;
1284
1285                 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
1286                 strategy_delta += (long) passes_delta *NBuffers;
1287
1288                 Assert(strategy_delta >= 0);
1289
1290                 if ((int32) (next_passes - strategy_passes) > 0)
1291                 {
1292                         /* we're one pass ahead of the strategy point */
1293                         bufs_to_lap = strategy_buf_id - next_to_clean;
1294 #ifdef BGW_DEBUG
1295                         elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1296                                  next_passes, next_to_clean,
1297                                  strategy_passes, strategy_buf_id,
1298                                  strategy_delta, bufs_to_lap);
1299 #endif
1300                 }
1301                 else if (next_passes == strategy_passes &&
1302                                  next_to_clean >= strategy_buf_id)
1303                 {
1304                         /* on same pass, but ahead or at least not behind */
1305                         bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
1306 #ifdef BGW_DEBUG
1307                         elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1308                                  next_passes, next_to_clean,
1309                                  strategy_passes, strategy_buf_id,
1310                                  strategy_delta, bufs_to_lap);
1311 #endif
1312                 }
1313                 else
1314                 {
1315                         /*
1316                          * We're behind, so skip forward to the strategy point and start
1317                          * cleaning from there.
1318                          */
1319 #ifdef BGW_DEBUG
1320                         elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
1321                                  next_passes, next_to_clean,
1322                                  strategy_passes, strategy_buf_id,
1323                                  strategy_delta);
1324 #endif
1325                         next_to_clean = strategy_buf_id;
1326                         next_passes = strategy_passes;
1327                         bufs_to_lap = NBuffers;
1328                 }
1329         }
1330         else
1331         {
1332                 /*
1333                  * Initializing at startup or after LRU scanning had been off. Always
1334                  * start at the strategy point.
1335                  */
1336 #ifdef BGW_DEBUG
1337                 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
1338                          strategy_passes, strategy_buf_id);
1339 #endif
1340                 strategy_delta = 0;
1341                 next_to_clean = strategy_buf_id;
1342                 next_passes = strategy_passes;
1343                 bufs_to_lap = NBuffers;
1344         }
1345
1346         /* Update saved info for next time */
1347         prev_strategy_buf_id = strategy_buf_id;
1348         prev_strategy_passes = strategy_passes;
1349         saved_info_valid = true;
1350
1351         /*
1352          * Compute how many buffers had to be scanned for each new allocation, ie,
1353          * 1/density of reusable buffers, and track a moving average of that.
1354          *
1355          * If the strategy point didn't move, we don't update the density estimate
1356          */
1357         if (strategy_delta > 0 && recent_alloc > 0)
1358         {
1359                 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1360                 smoothed_density += (scans_per_alloc - smoothed_density) /
1361                         smoothing_samples;
1362         }
1363
1364         /*
1365          * Estimate how many reusable buffers there are between the current
1366          * strategy point and where we've scanned ahead to, based on the smoothed
1367          * density estimate.
1368          */
1369         bufs_ahead = NBuffers - bufs_to_lap;
1370         reusable_buffers_est = (float) bufs_ahead / smoothed_density;
1371
1372         /*
1373          * Track a moving average of recent buffer allocations.  Here, rather than
1374          * a true average we want a fast-attack, slow-decline behavior: we
1375          * immediately follow any increase.
1376          */
1377         if (smoothed_alloc <= (float) recent_alloc)
1378                 smoothed_alloc = recent_alloc;
1379         else
1380                 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
1381                         smoothing_samples;
1382
1383         /* Scale the estimate by a GUC to allow more aggressive tuning. */
1384         upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
1385
1386         /*
1387          * Even in cases where there's been little or no buffer allocation
1388          * activity, we want to make a small amount of progress through the buffer
1389          * cache so that as many reusable buffers as possible are clean after an
1390          * idle period.
1391          *
1392          * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
1393          * the BGW will be called during the scan_whole_pool time; slice the
1394          * buffer pool into that many sections.
1395          */
1396         min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
1397
1398         if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
1399         {
1400 #ifdef BGW_DEBUG
1401                 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
1402                          upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
1403 #endif
1404                 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
1405         }
1406
1407         /*
1408          * Now write out dirty reusable buffers, working forward from the
1409          * next_to_clean point, until we have lapped the strategy scan, or cleaned
1410          * enough buffers to match our estimate of the next cycle's allocation
1411          * requirements, or hit the bgwriter_lru_maxpages limit.
1412          */
1413
1414         /* Make sure we can handle the pin inside SyncOneBuffer */
1415         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1416
1417         num_to_scan = bufs_to_lap;
1418         num_written = 0;
1419         reusable_buffers = reusable_buffers_est;
1420
1421         /* Execute the LRU scan */
1422         while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
1423         {
1424                 int                     buffer_state = SyncOneBuffer(next_to_clean, true);
1425
1426                 if (++next_to_clean >= NBuffers)
1427                 {
1428                         next_to_clean = 0;
1429                         next_passes++;
1430                 }
1431                 num_to_scan--;
1432
1433                 if (buffer_state & BUF_WRITTEN)
1434                 {
1435                         reusable_buffers++;
1436                         if (++num_written >= bgwriter_lru_maxpages)
1437                         {
1438                                 BgWriterStats.m_maxwritten_clean++;
1439                                 break;
1440                         }
1441                 }
1442                 else if (buffer_state & BUF_REUSABLE)
1443                         reusable_buffers++;
1444         }
1445
1446         BgWriterStats.m_buf_written_clean += num_written;
1447
1448 #ifdef BGW_DEBUG
1449         elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
1450                  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
1451                  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
1452                  bufs_to_lap - num_to_scan,
1453                  num_written,
1454                  reusable_buffers - reusable_buffers_est);
1455 #endif
1456
1457         /*
1458          * Consider the above scan as being like a new allocation scan.
1459          * Characterize its density and update the smoothed one based on it. This
1460          * effectively halves the moving average period in cases where both the
1461          * strategy and the background writer are doing some useful scanning,
1462          * which is helpful because a long memory isn't as desirable on the
1463          * density estimates.
1464          */
1465         strategy_delta = bufs_to_lap - num_to_scan;
1466         recent_alloc = reusable_buffers - reusable_buffers_est;
1467         if (strategy_delta > 0 && recent_alloc > 0)
1468         {
1469                 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1470                 smoothed_density += (scans_per_alloc - smoothed_density) /
1471                         smoothing_samples;
1472
1473 #ifdef BGW_DEBUG
1474                 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
1475                          recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
1476 #endif
1477         }
1478 }
1479
1480 /*
1481  * SyncOneBuffer -- process a single buffer during syncing.
1482  *
1483  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
1484  * buffers marked recently used, as these are not replacement candidates.
1485  *
1486  * Returns a bitmask containing the following flag bits:
1487  *      BUF_WRITTEN: we wrote the buffer.
1488  *      BUF_REUSABLE: buffer is available for replacement, ie, it has
1489  *              pin count 0 and usage count 0.
1490  *
1491  * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
1492  * after locking it, but we don't care all that much.)
1493  *
1494  * Note: caller must have done ResourceOwnerEnlargeBuffers.
1495  */
1496 static int
1497 SyncOneBuffer(int buf_id, bool skip_recently_used)
1498 {
1499         volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1500         int                     result = 0;
1501
1502         /*
1503          * Check whether buffer needs writing.
1504          *
1505          * We can make this check without taking the buffer content lock so long
1506          * as we mark pages dirty in access methods *before* logging changes with
1507          * XLogInsert(): if someone marks the buffer dirty just after our check we
1508          * don't worry because our checkpoint.redo points before log record for
1509          * upcoming changes and so we are not required to write such dirty buffer.
1510          */
1511         LockBufHdr(bufHdr);
1512
1513         if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
1514                 result |= BUF_REUSABLE;
1515         else if (skip_recently_used)
1516         {
1517                 /* Caller told us not to write recently-used buffers */
1518                 UnlockBufHdr(bufHdr);
1519                 return result;
1520         }
1521
1522         if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
1523         {
1524                 /* It's clean, so nothing to do */
1525                 UnlockBufHdr(bufHdr);
1526                 return result;
1527         }
1528
1529         /*
1530          * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
1531          * buffer is clean by the time we've locked it.)
1532          */
1533         PinBuffer_Locked(bufHdr);
1534         LWLockAcquire(bufHdr->content_lock, LW_SHARED);
1535
1536         FlushBuffer(bufHdr, NULL);
1537
1538         LWLockRelease(bufHdr->content_lock);
1539         UnpinBuffer(bufHdr, true);
1540
1541         return result | BUF_WRITTEN;
1542 }
1543
1544
1545 /*
1546  * Return a palloc'd string containing buffer usage statistics.
1547  */
1548 char *
1549 ShowBufferUsage(void)
1550 {
1551         StringInfoData str;
1552         float           hitrate;
1553         float           localhitrate;
1554
1555         initStringInfo(&str);
1556
1557         if (ReadBufferCount == 0)
1558                 hitrate = 0.0;
1559         else
1560                 hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
1561
1562         if (ReadLocalBufferCount == 0)
1563                 localhitrate = 0.0;
1564         else
1565                 localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
1566
1567         appendStringInfo(&str,
1568         "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1569                                 ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
1570         appendStringInfo(&str,
1571         "!\tLocal  blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1572                                          ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
1573         appendStringInfo(&str,
1574                                          "!\tDirect blocks: %10ld read, %10ld written\n",
1575                                          NDirectFileRead, NDirectFileWrite);
1576
1577         return str.data;
1578 }
1579
1580 void
1581 ResetBufferUsage(void)
1582 {
1583         BufferHitCount = 0;
1584         ReadBufferCount = 0;
1585         BufferFlushCount = 0;
1586         LocalBufferHitCount = 0;
1587         ReadLocalBufferCount = 0;
1588         LocalBufferFlushCount = 0;
1589         NDirectFileRead = 0;
1590         NDirectFileWrite = 0;
1591 }
1592
1593 /*
1594  *              AtEOXact_Buffers - clean up at end of transaction.
1595  *
1596  *              As of PostgreSQL 8.0, buffer pins should get released by the
1597  *              ResourceOwner mechanism.  This routine is just a debugging
1598  *              cross-check that no pins remain.
1599  */
1600 void
1601 AtEOXact_Buffers(bool isCommit)
1602 {
1603 #ifdef USE_ASSERT_CHECKING
1604         if (assert_enabled)
1605         {
1606                 int                     i;
1607
1608                 for (i = 0; i < NBuffers; i++)
1609                 {
1610                         Assert(PrivateRefCount[i] == 0);
1611                 }
1612         }
1613 #endif
1614
1615         AtEOXact_LocalBuffers(isCommit);
1616 }
1617
1618 /*
1619  * InitBufferPoolBackend --- second-stage initialization of a new backend
1620  *
1621  * This is called after we have acquired a PGPROC and so can safely get
1622  * LWLocks.  We don't currently need to do anything at this stage ...
1623  * except register a shmem-exit callback.  AtProcExit_Buffers needs LWLock
1624  * access, and thereby has to be called at the corresponding phase of
1625  * backend shutdown.
1626  */
1627 void
1628 InitBufferPoolBackend(void)
1629 {
1630         on_shmem_exit(AtProcExit_Buffers, 0);
1631 }
1632
1633 /*
1634  * Ensure we have released all shared-buffer locks and pins during backend exit
1635  */
1636 static void
1637 AtProcExit_Buffers(int code, Datum arg)
1638 {
1639         int                     i;
1640
1641         AbortBufferIO();
1642         UnlockBuffers();
1643
1644         for (i = 0; i < NBuffers; i++)
1645         {
1646                 if (PrivateRefCount[i] != 0)
1647                 {
1648                         volatile BufferDesc *buf = &(BufferDescriptors[i]);
1649
1650                         /*
1651                          * We don't worry about updating ResourceOwner; if we even got
1652                          * here, it suggests that ResourceOwners are messed up.
1653                          */
1654                         PrivateRefCount[i] = 1;         /* make sure we release shared pin */
1655                         UnpinBuffer(buf, false);
1656                         Assert(PrivateRefCount[i] == 0);
1657                 }
1658         }
1659
1660         /* localbuf.c needs a chance too */
1661         AtProcExit_LocalBuffers();
1662 }
1663
1664 /*
1665  * Helper routine to issue warnings when a buffer is unexpectedly pinned
1666  */
1667 void
1668 PrintBufferLeakWarning(Buffer buffer)
1669 {
1670         volatile BufferDesc *buf;
1671         int32           loccount;
1672
1673         Assert(BufferIsValid(buffer));
1674         if (BufferIsLocal(buffer))
1675         {
1676                 buf = &LocalBufferDescriptors[-buffer - 1];
1677                 loccount = LocalRefCount[-buffer - 1];
1678         }
1679         else
1680         {
1681                 buf = &BufferDescriptors[buffer - 1];
1682                 loccount = PrivateRefCount[buffer - 1];
1683         }
1684
1685         /* theoretically we should lock the bufhdr here */
1686         elog(WARNING,
1687                  "buffer refcount leak: [%03d] "
1688                  "(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
1689                  buffer,
1690                  buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1691                  buf->tag.rnode.relNode,
1692                  buf->tag.blockNum, buf->flags,
1693                  buf->refcount, loccount);
1694 }
1695
1696 /*
1697  * CheckPointBuffers
1698  *
1699  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
1700  *
1701  * Note: temporary relations do not participate in checkpoints, so they don't
1702  * need to be flushed.
1703  */
1704 void
1705 CheckPointBuffers(int flags)
1706 {
1707         TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
1708         CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
1709         BufferSync(flags);
1710         CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
1711         smgrsync();
1712         CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
1713         TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
1714 }
1715
1716
1717 /*
1718  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
1719  */
1720 void
1721 BufmgrCommit(void)
1722 {
1723         /* Nothing to do in bufmgr anymore... */
1724
1725         smgrcommit();
1726 }
1727
1728 /*
1729  * BufferGetBlockNumber
1730  *              Returns the block number associated with a buffer.
1731  *
1732  * Note:
1733  *              Assumes that the buffer is valid and pinned, else the
1734  *              value may be obsolete immediately...
1735  */
1736 BlockNumber
1737 BufferGetBlockNumber(Buffer buffer)
1738 {
1739         volatile BufferDesc *bufHdr;
1740
1741         Assert(BufferIsPinned(buffer));
1742
1743         if (BufferIsLocal(buffer))
1744                 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1745         else
1746                 bufHdr = &BufferDescriptors[buffer - 1];
1747
1748         /* pinned, so OK to read tag without spinlock */
1749         return bufHdr->tag.blockNum;
1750 }
1751
1752 /*
1753  * BufferGetTag
1754  *              Returns the relfilenode, fork number and block number associated with
1755  *              a buffer.
1756  */
1757 void
1758 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
1759                          BlockNumber *blknum)
1760 {
1761         volatile BufferDesc *bufHdr;
1762
1763         /* Do the same checks as BufferGetBlockNumber. */
1764         Assert(BufferIsPinned(buffer));
1765
1766         if (BufferIsLocal(buffer))
1767                 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1768         else
1769                 bufHdr = &BufferDescriptors[buffer - 1];
1770
1771         /* pinned, so OK to read tag without spinlock */
1772         *rnode = bufHdr->tag.rnode;
1773         *forknum = bufHdr->tag.forkNum;
1774         *blknum = bufHdr->tag.blockNum;
1775 }
1776
1777 /*
1778  * FlushBuffer
1779  *              Physically write out a shared buffer.
1780  *
1781  * NOTE: this actually just passes the buffer contents to the kernel; the
1782  * real write to disk won't happen until the kernel feels like it.  This
1783  * is okay from our point of view since we can redo the changes from WAL.
1784  * However, we will need to force the changes to disk via fsync before
1785  * we can checkpoint WAL.
1786  *
1787  * The caller must hold a pin on the buffer and have share-locked the
1788  * buffer contents.  (Note: a share-lock does not prevent updates of
1789  * hint bits in the buffer, so the page could change while the write
1790  * is in progress, but we assume that that will not invalidate the data
1791  * written.)
1792  *
1793  * If the caller has an smgr reference for the buffer's relation, pass it
1794  * as the second parameter.  If not, pass NULL.
1795  */
1796 static void
1797 FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
1798 {
1799         XLogRecPtr      recptr;
1800         ErrorContextCallback errcontext;
1801
1802         /*
1803          * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
1804          * false, then someone else flushed the buffer before we could, so we need
1805          * not do anything.
1806          */
1807         if (!StartBufferIO(buf, false))
1808                 return;
1809
1810         /* Setup error traceback support for ereport() */
1811         errcontext.callback = buffer_write_error_callback;
1812         errcontext.arg = (void *) buf;
1813         errcontext.previous = error_context_stack;
1814         error_context_stack = &errcontext;
1815
1816         /* Find smgr relation for buffer */
1817         if (reln == NULL)
1818                 reln = smgropen(buf->tag.rnode);
1819
1820         TRACE_POSTGRESQL_BUFFER_FLUSH_START(reln->smgr_rnode.spcNode,
1821                  reln->smgr_rnode.dbNode,
1822                  reln->smgr_rnode.relNode);
1823
1824         /*
1825          * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
1826          * rule that log updates must hit disk before any of the data-file changes
1827          * they describe do.
1828          */
1829         recptr = BufferGetLSN(buf);
1830         XLogFlush(recptr);
1831
1832         /*
1833          * Now it's safe to write buffer to disk. Note that no one else should
1834          * have been able to write it while we were busy with log flushing because
1835          * we have the io_in_progress lock.
1836          */
1837
1838         /* To check if block content changes while flushing. - vadim 01/17/97 */
1839         LockBufHdr(buf);
1840         buf->flags &= ~BM_JUST_DIRTIED;
1841         UnlockBufHdr(buf);
1842
1843         smgrwrite(reln,
1844                           buf->tag.forkNum,
1845                           buf->tag.blockNum,
1846                           (char *) BufHdrGetBlock(buf),
1847                           false);
1848
1849         BufferFlushCount++;
1850
1851         TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(reln->smgr_rnode.spcNode,
1852                  reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
1853
1854         /*
1855          * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
1856          * end the io_in_progress state.
1857          */
1858         TerminateBufferIO(buf, true, 0);
1859
1860         /* Pop the error context stack */
1861         error_context_stack = errcontext.previous;
1862 }
1863
1864 /*
1865  * RelationGetNumberOfBlocks
1866  *              Determines the current number of pages in the relation.
1867  */
1868 BlockNumber
1869 RelationGetNumberOfBlocks(Relation relation)
1870 {
1871         /* Open it at the smgr level if not already done */
1872         RelationOpenSmgr(relation);
1873
1874         return smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
1875 }
1876
1877 /*
1878  * RelationTruncate
1879  *              Physically truncate a relation to the specified number of blocks.
1880  *
1881  * As of Postgres 8.1, this includes getting rid of any buffers for the
1882  * blocks that are to be dropped; previously, callers had to do that.
1883  */
1884 void
1885 RelationTruncate(Relation rel, BlockNumber nblocks)
1886 {
1887         /* Open it at the smgr level if not already done */
1888         RelationOpenSmgr(rel);
1889
1890         /* Make sure rd_targblock isn't pointing somewhere past end */
1891         rel->rd_targblock = InvalidBlockNumber;
1892
1893         /* Do the real work */
1894         smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
1895 }
1896
1897 /* ---------------------------------------------------------------------
1898  *              DropRelFileNodeBuffers
1899  *
1900  *              This function removes from the buffer pool all the pages of the
1901  *              specified relation that have block numbers >= firstDelBlock.
1902  *              (In particular, with firstDelBlock = 0, all pages are removed.)
1903  *              Dirty pages are simply dropped, without bothering to write them
1904  *              out first.      Therefore, this is NOT rollback-able, and so should be
1905  *              used only with extreme caution!
1906  *
1907  *              Currently, this is called only from smgr.c when the underlying file
1908  *              is about to be deleted or truncated (firstDelBlock is needed for
1909  *              the truncation case).  The data in the affected pages would therefore
1910  *              be deleted momentarily anyway, and there is no point in writing it.
1911  *              It is the responsibility of higher-level code to ensure that the
1912  *              deletion or truncation does not lose any data that could be needed
1913  *              later.  It is also the responsibility of higher-level code to ensure
1914  *              that no other process could be trying to load more pages of the
1915  *              relation into buffers.
1916  *
1917  *              XXX currently it sequentially searches the buffer pool, should be
1918  *              changed to more clever ways of searching.  However, this routine
1919  *              is used only in code paths that aren't very performance-critical,
1920  *              and we shouldn't slow down the hot paths to make it faster ...
1921  * --------------------------------------------------------------------
1922  */
1923 void
1924 DropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, bool istemp,
1925                                            BlockNumber firstDelBlock)
1926 {
1927         int                     i;
1928
1929         if (istemp)
1930         {
1931                 DropRelFileNodeLocalBuffers(rnode, forkNum, firstDelBlock);
1932                 return;
1933         }
1934
1935         for (i = 0; i < NBuffers; i++)
1936         {
1937                 volatile BufferDesc *bufHdr = &BufferDescriptors[i];
1938
1939                 LockBufHdr(bufHdr);
1940                 if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1941                         bufHdr->tag.forkNum == forkNum &&
1942                         bufHdr->tag.blockNum >= firstDelBlock)
1943                         InvalidateBuffer(bufHdr);       /* releases spinlock */
1944                 else
1945                         UnlockBufHdr(bufHdr);
1946         }
1947 }
1948
1949 /* ---------------------------------------------------------------------
1950  *              DropDatabaseBuffers
1951  *
1952  *              This function removes all the buffers in the buffer cache for a
1953  *              particular database.  Dirty pages are simply dropped, without
1954  *              bothering to write them out first.      This is used when we destroy a
1955  *              database, to avoid trying to flush data to disk when the directory
1956  *              tree no longer exists.  Implementation is pretty similar to
1957  *              DropRelFileNodeBuffers() which is for destroying just one relation.
1958  * --------------------------------------------------------------------
1959  */
1960 void
1961 DropDatabaseBuffers(Oid dbid)
1962 {
1963         int                     i;
1964         volatile BufferDesc *bufHdr;
1965
1966         /*
1967          * We needn't consider local buffers, since by assumption the target
1968          * database isn't our own.
1969          */
1970
1971         for (i = 0; i < NBuffers; i++)
1972         {
1973                 bufHdr = &BufferDescriptors[i];
1974                 LockBufHdr(bufHdr);
1975                 if (bufHdr->tag.rnode.dbNode == dbid)
1976                         InvalidateBuffer(bufHdr);       /* releases spinlock */
1977                 else
1978                         UnlockBufHdr(bufHdr);
1979         }
1980 }
1981
1982 /* -----------------------------------------------------------------
1983  *              PrintBufferDescs
1984  *
1985  *              this function prints all the buffer descriptors, for debugging
1986  *              use only.
1987  * -----------------------------------------------------------------
1988  */
1989 #ifdef NOT_USED
1990 void
1991 PrintBufferDescs(void)
1992 {
1993         int                     i;
1994         volatile BufferDesc *buf = BufferDescriptors;
1995
1996         for (i = 0; i < NBuffers; ++i, ++buf)
1997         {
1998                 /* theoretically we should lock the bufhdr here */
1999                 elog(LOG,
2000                          "[%02d] (freeNext=%d, rel=%u/%u/%u, "
2001                          "blockNum=%u, flags=0x%x, refcount=%u %d)",
2002                          i, buf->freeNext,
2003                          buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
2004                          buf->tag.rnode.relNode,
2005                          buf->tag.blockNum, buf->flags,
2006                          buf->refcount, PrivateRefCount[i]);
2007         }
2008 }
2009 #endif
2010
2011 #ifdef NOT_USED
2012 void
2013 PrintPinnedBufs(void)
2014 {
2015         int                     i;
2016         volatile BufferDesc *buf = BufferDescriptors;
2017
2018         for (i = 0; i < NBuffers; ++i, ++buf)
2019         {
2020                 if (PrivateRefCount[i] > 0)
2021                 {
2022                         /* theoretically we should lock the bufhdr here */
2023                         elog(LOG,
2024                                  "[%02d] (freeNext=%d, rel=%u/%u/%u, "
2025                                  "blockNum=%u, flags=0x%x, refcount=%u %d)",
2026                                  i, buf->freeNext,
2027                                  buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
2028                                  buf->tag.rnode.relNode,
2029                                  buf->tag.blockNum, buf->flags,
2030                                  buf->refcount, PrivateRefCount[i]);
2031                 }
2032         }
2033 }
2034 #endif
2035
2036 /* ---------------------------------------------------------------------
2037  *              FlushRelationBuffers
2038  *
2039  *              This function writes all dirty pages of a relation out to disk
2040  *              (or more accurately, out to kernel disk buffers), ensuring that the
2041  *              kernel has an up-to-date view of the relation.
2042  *
2043  *              Generally, the caller should be holding AccessExclusiveLock on the
2044  *              target relation to ensure that no other backend is busy dirtying
2045  *              more blocks of the relation; the effects can't be expected to last
2046  *              after the lock is released.
2047  *
2048  *              XXX currently it sequentially searches the buffer pool, should be
2049  *              changed to more clever ways of searching.  This routine is not
2050  *              used in any performance-critical code paths, so it's not worth
2051  *              adding additional overhead to normal paths to make it go faster;
2052  *              but see also DropRelFileNodeBuffers.
2053  * --------------------------------------------------------------------
2054  */
2055 void
2056 FlushRelationBuffers(Relation rel)
2057 {
2058         int                     i;
2059         volatile BufferDesc *bufHdr;
2060
2061         /* Open rel at the smgr level if not already done */
2062         RelationOpenSmgr(rel);
2063
2064         if (rel->rd_istemp)
2065         {
2066                 for (i = 0; i < NLocBuffer; i++)
2067                 {
2068                         bufHdr = &LocalBufferDescriptors[i];
2069                         if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2070                                 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2071                         {
2072                                 ErrorContextCallback errcontext;
2073
2074                                 /* Setup error traceback support for ereport() */
2075                                 errcontext.callback = buffer_write_error_callback;
2076                                 errcontext.arg = (void *) bufHdr;
2077                                 errcontext.previous = error_context_stack;
2078                                 error_context_stack = &errcontext;
2079
2080                                 smgrwrite(rel->rd_smgr,
2081                                                   bufHdr->tag.forkNum,
2082                                                   bufHdr->tag.blockNum,
2083                                                   (char *) LocalBufHdrGetBlock(bufHdr),
2084                                                   true);
2085
2086                                 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
2087
2088                                 /* Pop the error context stack */
2089                                 error_context_stack = errcontext.previous;
2090                         }
2091                 }
2092
2093                 return;
2094         }
2095
2096         /* Make sure we can handle the pin inside the loop */
2097         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2098
2099         for (i = 0; i < NBuffers; i++)
2100         {
2101                 bufHdr = &BufferDescriptors[i];
2102                 LockBufHdr(bufHdr);
2103                 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2104                         (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2105                 {
2106                         PinBuffer_Locked(bufHdr);
2107                         LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2108                         FlushBuffer(bufHdr, rel->rd_smgr);
2109                         LWLockRelease(bufHdr->content_lock);
2110                         UnpinBuffer(bufHdr, true);
2111                 }
2112                 else
2113                         UnlockBufHdr(bufHdr);
2114         }
2115 }
2116
2117 /* ---------------------------------------------------------------------
2118  *              FlushDatabaseBuffers
2119  *
2120  *              This function writes all dirty pages of a database out to disk
2121  *              (or more accurately, out to kernel disk buffers), ensuring that the
2122  *              kernel has an up-to-date view of the database.
2123  *
2124  *              Generally, the caller should be holding an appropriate lock to ensure
2125  *              no other backend is active in the target database; otherwise more
2126  *              pages could get dirtied.
2127  *
2128  *              Note we don't worry about flushing any pages of temporary relations.
2129  *              It's assumed these wouldn't be interesting.
2130  * --------------------------------------------------------------------
2131  */
2132 void
2133 FlushDatabaseBuffers(Oid dbid)
2134 {
2135         int                     i;
2136         volatile BufferDesc *bufHdr;
2137
2138         /* Make sure we can handle the pin inside the loop */
2139         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2140
2141         for (i = 0; i < NBuffers; i++)
2142         {
2143                 bufHdr = &BufferDescriptors[i];
2144                 LockBufHdr(bufHdr);
2145                 if (bufHdr->tag.rnode.dbNode == dbid &&
2146                         (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2147                 {
2148                         PinBuffer_Locked(bufHdr);
2149                         LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2150                         FlushBuffer(bufHdr, NULL);
2151                         LWLockRelease(bufHdr->content_lock);
2152                         UnpinBuffer(bufHdr, true);
2153                 }
2154                 else
2155                         UnlockBufHdr(bufHdr);
2156         }
2157 }
2158
2159 /*
2160  * ReleaseBuffer -- release the pin on a buffer
2161  */
2162 void
2163 ReleaseBuffer(Buffer buffer)
2164 {
2165         volatile BufferDesc *bufHdr;
2166
2167         if (!BufferIsValid(buffer))
2168                 elog(ERROR, "bad buffer id: %d", buffer);
2169
2170         ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
2171
2172         if (BufferIsLocal(buffer))
2173         {
2174                 Assert(LocalRefCount[-buffer - 1] > 0);
2175                 LocalRefCount[-buffer - 1]--;
2176                 return;
2177         }
2178
2179         bufHdr = &BufferDescriptors[buffer - 1];
2180
2181         Assert(PrivateRefCount[buffer - 1] > 0);
2182
2183         if (PrivateRefCount[buffer - 1] > 1)
2184                 PrivateRefCount[buffer - 1]--;
2185         else
2186                 UnpinBuffer(bufHdr, false);
2187 }
2188
2189 /*
2190  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
2191  *
2192  * This is just a shorthand for a common combination.
2193  */
2194 void
2195 UnlockReleaseBuffer(Buffer buffer)
2196 {
2197         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2198         ReleaseBuffer(buffer);
2199 }
2200
2201 /*
2202  * IncrBufferRefCount
2203  *              Increment the pin count on a buffer that we have *already* pinned
2204  *              at least once.
2205  *
2206  *              This function cannot be used on a buffer we do not have pinned,
2207  *              because it doesn't change the shared buffer state.
2208  */
2209 void
2210 IncrBufferRefCount(Buffer buffer)
2211 {
2212         Assert(BufferIsPinned(buffer));
2213         ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2214         ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
2215         if (BufferIsLocal(buffer))
2216                 LocalRefCount[-buffer - 1]++;
2217         else
2218                 PrivateRefCount[buffer - 1]++;
2219 }
2220
2221 /*
2222  * SetBufferCommitInfoNeedsSave
2223  *
2224  *      Mark a buffer dirty when we have updated tuple commit-status bits in it.
2225  *
2226  * This is essentially the same as MarkBufferDirty, except that the caller
2227  * might have only share-lock instead of exclusive-lock on the buffer's
2228  * content lock.  We preserve the distinction mainly as a way of documenting
2229  * that the caller has not made a critical data change --- the status-bit
2230  * update could be redone by someone else just as easily.  Therefore, no WAL
2231  * log record need be generated, whereas calls to MarkBufferDirty really ought
2232  * to be associated with a WAL-entry-creating action.
2233  */
2234 void
2235 SetBufferCommitInfoNeedsSave(Buffer buffer)
2236 {
2237         volatile BufferDesc *bufHdr;
2238
2239         if (!BufferIsValid(buffer))
2240                 elog(ERROR, "bad buffer id: %d", buffer);
2241
2242         if (BufferIsLocal(buffer))
2243         {
2244                 MarkLocalBufferDirty(buffer);
2245                 return;
2246         }
2247
2248         bufHdr = &BufferDescriptors[buffer - 1];
2249
2250         Assert(PrivateRefCount[buffer - 1] > 0);
2251         /* here, either share or exclusive lock is OK */
2252         Assert(LWLockHeldByMe(bufHdr->content_lock));
2253
2254         /*
2255          * This routine might get called many times on the same page, if we are
2256          * making the first scan after commit of an xact that added/deleted many
2257          * tuples.      So, be as quick as we can if the buffer is already dirty.  We
2258          * do this by not acquiring spinlock if it looks like the status bits are
2259          * already OK.  (Note it is okay if someone else clears BM_JUST_DIRTIED
2260          * immediately after we look, because the buffer content update is already
2261          * done and will be reflected in the I/O.)
2262          */
2263         if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
2264                 (BM_DIRTY | BM_JUST_DIRTIED))
2265         {
2266                 LockBufHdr(bufHdr);
2267                 Assert(bufHdr->refcount > 0);
2268                 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
2269                         VacuumCostBalance += VacuumCostPageDirty;
2270                 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
2271                 UnlockBufHdr(bufHdr);
2272         }
2273 }
2274
2275 /*
2276  * Release buffer content locks for shared buffers.
2277  *
2278  * Used to clean up after errors.
2279  *
2280  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
2281  * of releasing buffer content locks per se; the only thing we need to deal
2282  * with here is clearing any PIN_COUNT request that was in progress.
2283  */
2284 void
2285 UnlockBuffers(void)
2286 {
2287         volatile BufferDesc *buf = PinCountWaitBuf;
2288
2289         if (buf)
2290         {
2291                 LockBufHdr(buf);
2292
2293                 /*
2294                  * Don't complain if flag bit not set; it could have been reset but we
2295                  * got a cancel/die interrupt before getting the signal.
2296                  */
2297                 if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
2298                         buf->wait_backend_pid == MyProcPid)
2299                         buf->flags &= ~BM_PIN_COUNT_WAITER;
2300
2301                 UnlockBufHdr(buf);
2302
2303                 PinCountWaitBuf = NULL;
2304         }
2305 }
2306
2307 /*
2308  * Acquire or release the content_lock for the buffer.
2309  */
2310 void
2311 LockBuffer(Buffer buffer, int mode)
2312 {
2313         volatile BufferDesc *buf;
2314
2315         Assert(BufferIsValid(buffer));
2316         if (BufferIsLocal(buffer))
2317                 return;                                 /* local buffers need no lock */
2318
2319         buf = &(BufferDescriptors[buffer - 1]);
2320
2321         if (mode == BUFFER_LOCK_UNLOCK)
2322                 LWLockRelease(buf->content_lock);
2323         else if (mode == BUFFER_LOCK_SHARE)
2324                 LWLockAcquire(buf->content_lock, LW_SHARED);
2325         else if (mode == BUFFER_LOCK_EXCLUSIVE)
2326                 LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
2327         else
2328                 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
2329 }
2330
2331 /*
2332  * Acquire the content_lock for the buffer, but only if we don't have to wait.
2333  *
2334  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
2335  */
2336 bool
2337 ConditionalLockBuffer(Buffer buffer)
2338 {
2339         volatile BufferDesc *buf;
2340
2341         Assert(BufferIsValid(buffer));
2342         if (BufferIsLocal(buffer))
2343                 return true;                    /* act as though we got it */
2344
2345         buf = &(BufferDescriptors[buffer - 1]);
2346
2347         return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
2348 }
2349
2350 /*
2351  * LockBufferForCleanup - lock a buffer in preparation for deleting items
2352  *
2353  * Items may be deleted from a disk page only when the caller (a) holds an
2354  * exclusive lock on the buffer and (b) has observed that no other backend
2355  * holds a pin on the buffer.  If there is a pin, then the other backend
2356  * might have a pointer into the buffer (for example, a heapscan reference
2357  * to an item --- see README for more details).  It's OK if a pin is added
2358  * after the cleanup starts, however; the newly-arrived backend will be
2359  * unable to look at the page until we release the exclusive lock.
2360  *
2361  * To implement this protocol, a would-be deleter must pin the buffer and
2362  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
2363  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
2364  * it has successfully observed pin count = 1.
2365  */
2366 void
2367 LockBufferForCleanup(Buffer buffer)
2368 {
2369         volatile BufferDesc *bufHdr;
2370
2371         Assert(BufferIsValid(buffer));
2372         Assert(PinCountWaitBuf == NULL);
2373
2374         if (BufferIsLocal(buffer))
2375         {
2376                 /* There should be exactly one pin */
2377                 if (LocalRefCount[-buffer - 1] != 1)
2378                         elog(ERROR, "incorrect local pin count: %d",
2379                                  LocalRefCount[-buffer - 1]);
2380                 /* Nobody else to wait for */
2381                 return;
2382         }
2383
2384         /* There should be exactly one local pin */
2385         if (PrivateRefCount[buffer - 1] != 1)
2386                 elog(ERROR, "incorrect local pin count: %d",
2387                          PrivateRefCount[buffer - 1]);
2388
2389         bufHdr = &BufferDescriptors[buffer - 1];
2390
2391         for (;;)
2392         {
2393                 /* Try to acquire lock */
2394                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2395                 LockBufHdr(bufHdr);
2396                 Assert(bufHdr->refcount > 0);
2397                 if (bufHdr->refcount == 1)
2398                 {
2399                         /* Successfully acquired exclusive lock with pincount 1 */
2400                         UnlockBufHdr(bufHdr);
2401                         return;
2402                 }
2403                 /* Failed, so mark myself as waiting for pincount 1 */
2404                 if (bufHdr->flags & BM_PIN_COUNT_WAITER)
2405                 {
2406                         UnlockBufHdr(bufHdr);
2407                         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2408                         elog(ERROR, "multiple backends attempting to wait for pincount 1");
2409                 }
2410                 bufHdr->wait_backend_pid = MyProcPid;
2411                 bufHdr->flags |= BM_PIN_COUNT_WAITER;
2412                 PinCountWaitBuf = bufHdr;
2413                 UnlockBufHdr(bufHdr);
2414                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2415                 /* Wait to be signaled by UnpinBuffer() */
2416                 ProcWaitForSignal();
2417                 PinCountWaitBuf = NULL;
2418                 /* Loop back and try again */
2419         }
2420 }
2421
2422 /*
2423  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
2424  *
2425  * We won't loop, but just check once to see if the pin count is OK.  If
2426  * not, return FALSE with no lock held.
2427  */
2428 bool
2429 ConditionalLockBufferForCleanup(Buffer buffer)
2430 {
2431         volatile BufferDesc *bufHdr;
2432
2433         Assert(BufferIsValid(buffer));
2434
2435         if (BufferIsLocal(buffer))
2436         {
2437                 /* There should be exactly one pin */
2438                 Assert(LocalRefCount[-buffer - 1] > 0);
2439                 if (LocalRefCount[-buffer - 1] != 1)
2440                         return false;
2441                 /* Nobody else to wait for */
2442                 return true;
2443         }
2444
2445         /* There should be exactly one local pin */
2446         Assert(PrivateRefCount[buffer - 1] > 0);
2447         if (PrivateRefCount[buffer - 1] != 1)
2448                 return false;
2449
2450         /* Try to acquire lock */
2451         if (!ConditionalLockBuffer(buffer))
2452                 return false;
2453
2454         bufHdr = &BufferDescriptors[buffer - 1];
2455         LockBufHdr(bufHdr);
2456         Assert(bufHdr->refcount > 0);
2457         if (bufHdr->refcount == 1)
2458         {
2459                 /* Successfully acquired exclusive lock with pincount 1 */
2460                 UnlockBufHdr(bufHdr);
2461                 return true;
2462         }
2463
2464         /* Failed, so release the lock */
2465         UnlockBufHdr(bufHdr);
2466         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2467         return false;
2468 }
2469
2470
2471 /*
2472  *      Functions for buffer I/O handling
2473  *
2474  *      Note: We assume that nested buffer I/O never occurs.
2475  *      i.e at most one io_in_progress lock is held per proc.
2476  *
2477  *      Also note that these are used only for shared buffers, not local ones.
2478  */
2479
2480 /*
2481  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
2482  */
2483 static void
2484 WaitIO(volatile BufferDesc *buf)
2485 {
2486         /*
2487          * Changed to wait until there's no IO - Inoue 01/13/2000
2488          *
2489          * Note this is *necessary* because an error abort in the process doing
2490          * I/O could release the io_in_progress_lock prematurely. See
2491          * AbortBufferIO.
2492          */
2493         for (;;)
2494         {
2495                 BufFlags        sv_flags;
2496
2497                 /*
2498                  * It may not be necessary to acquire the spinlock to check the flag
2499                  * here, but since this test is essential for correctness, we'd better
2500                  * play it safe.
2501                  */
2502                 LockBufHdr(buf);
2503                 sv_flags = buf->flags;
2504                 UnlockBufHdr(buf);
2505                 if (!(sv_flags & BM_IO_IN_PROGRESS))
2506                         break;
2507                 LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
2508                 LWLockRelease(buf->io_in_progress_lock);
2509         }
2510 }
2511
2512 /*
2513  * StartBufferIO: begin I/O on this buffer
2514  *      (Assumptions)
2515  *      My process is executing no IO
2516  *      The buffer is Pinned
2517  *
2518  * In some scenarios there are race conditions in which multiple backends
2519  * could attempt the same I/O operation concurrently.  If someone else
2520  * has already started I/O on this buffer then we will block on the
2521  * io_in_progress lock until he's done.
2522  *
2523  * Input operations are only attempted on buffers that are not BM_VALID,
2524  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
2525  * so we can always tell if the work is already done.
2526  *
2527  * Returns TRUE if we successfully marked the buffer as I/O busy,
2528  * FALSE if someone else already did the work.
2529  */
2530 static bool
2531 StartBufferIO(volatile BufferDesc *buf, bool forInput)
2532 {
2533         Assert(!InProgressBuf);
2534
2535         for (;;)
2536         {
2537                 /*
2538                  * Grab the io_in_progress lock so that other processes can wait for
2539                  * me to finish the I/O.
2540                  */
2541                 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2542
2543                 LockBufHdr(buf);
2544
2545                 if (!(buf->flags & BM_IO_IN_PROGRESS))
2546                         break;
2547
2548                 /*
2549                  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
2550                  * lock isn't held is if the process doing the I/O is recovering from
2551                  * an error (see AbortBufferIO).  If that's the case, we must wait for
2552                  * him to get unwedged.
2553                  */
2554                 UnlockBufHdr(buf);
2555                 LWLockRelease(buf->io_in_progress_lock);
2556                 WaitIO(buf);
2557         }
2558
2559         /* Once we get here, there is definitely no I/O active on this buffer */
2560
2561         if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
2562         {
2563                 /* someone else already did the I/O */
2564                 UnlockBufHdr(buf);
2565                 LWLockRelease(buf->io_in_progress_lock);
2566                 return false;
2567         }
2568
2569         buf->flags |= BM_IO_IN_PROGRESS;
2570
2571         UnlockBufHdr(buf);
2572
2573         InProgressBuf = buf;
2574         IsForInput = forInput;
2575
2576         return true;
2577 }
2578
2579 /*
2580  * TerminateBufferIO: release a buffer we were doing I/O on
2581  *      (Assumptions)
2582  *      My process is executing IO for the buffer
2583  *      BM_IO_IN_PROGRESS bit is set for the buffer
2584  *      We hold the buffer's io_in_progress lock
2585  *      The buffer is Pinned
2586  *
2587  * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
2588  * buffer's BM_DIRTY flag.  This is appropriate when terminating a
2589  * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
2590  * marking the buffer clean if it was re-dirtied while we were writing.
2591  *
2592  * set_flag_bits gets ORed into the buffer's flags.  It must include
2593  * BM_IO_ERROR in a failure case.  For successful completion it could
2594  * be 0, or BM_VALID if we just finished reading in the page.
2595  */
2596 static void
2597 TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
2598                                   int set_flag_bits)
2599 {
2600         Assert(buf == InProgressBuf);
2601
2602         LockBufHdr(buf);
2603
2604         Assert(buf->flags & BM_IO_IN_PROGRESS);
2605         buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
2606         if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
2607                 buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
2608         buf->flags |= set_flag_bits;
2609
2610         UnlockBufHdr(buf);
2611
2612         InProgressBuf = NULL;
2613
2614         LWLockRelease(buf->io_in_progress_lock);
2615 }
2616
2617 /*
2618  * AbortBufferIO: Clean up any active buffer I/O after an error.
2619  *
2620  *      All LWLocks we might have held have been released,
2621  *      but we haven't yet released buffer pins, so the buffer is still pinned.
2622  *
2623  *      If I/O was in progress, we always set BM_IO_ERROR, even though it's
2624  *      possible the error condition wasn't related to the I/O.
2625  */
2626 void
2627 AbortBufferIO(void)
2628 {
2629         volatile BufferDesc *buf = InProgressBuf;
2630
2631         if (buf)
2632         {
2633                 /*
2634                  * Since LWLockReleaseAll has already been called, we're not holding
2635                  * the buffer's io_in_progress_lock. We have to re-acquire it so that
2636                  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
2637                  * buffer will be in a busy spin until we succeed in doing this.
2638                  */
2639                 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2640
2641                 LockBufHdr(buf);
2642                 Assert(buf->flags & BM_IO_IN_PROGRESS);
2643                 if (IsForInput)
2644                 {
2645                         Assert(!(buf->flags & BM_DIRTY));
2646                         /* We'd better not think buffer is valid yet */
2647                         Assert(!(buf->flags & BM_VALID));
2648                         UnlockBufHdr(buf);
2649                 }
2650                 else
2651                 {
2652                         BufFlags        sv_flags;
2653
2654                         sv_flags = buf->flags;
2655                         Assert(sv_flags & BM_DIRTY);
2656                         UnlockBufHdr(buf);
2657                         /* Issue notice if this is not the first failure... */
2658                         if (sv_flags & BM_IO_ERROR)
2659                         {
2660                                 /* Buffer is pinned, so we can read tag without spinlock */
2661                                 ereport(WARNING,
2662                                                 (errcode(ERRCODE_IO_ERROR),
2663                                                  errmsg("could not write block %u of %u/%u/%u",
2664                                                                 buf->tag.blockNum,
2665                                                                 buf->tag.rnode.spcNode,
2666                                                                 buf->tag.rnode.dbNode,
2667                                                                 buf->tag.rnode.relNode),
2668                                                  errdetail("Multiple failures --- write error might be permanent.")));
2669                         }
2670                 }
2671                 TerminateBufferIO(buf, false, BM_IO_ERROR);
2672         }
2673 }
2674
2675 /*
2676  * Error context callback for errors occurring during buffer writes.
2677  */
2678 static void
2679 buffer_write_error_callback(void *arg)
2680 {
2681         volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
2682
2683         /* Buffer is pinned, so we can read the tag without locking the spinlock */
2684         if (bufHdr != NULL)
2685                 errcontext("writing block %u of relation %u/%u/%u",
2686                                    bufHdr->tag.blockNum,
2687                                    bufHdr->tag.rnode.spcNode,
2688                                    bufHdr->tag.rnode.dbNode,
2689                                    bufHdr->tag.rnode.relNode);
2690 }