1 /*-------------------------------------------------------------------------
3 * bufmgr.c
4 * buffer manager interface routines
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 *-------------------------------------------------------------------------
16 * Principal entry points:
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
22 * ReleaseBuffer() -- unpin a buffer
24 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 * The disk write is delayed until buffer replacement or checkpoint.
27 * See also these files:
28 * freelist.c -- chooses victim for buffer replacement
29 * buf_table.c -- manages the buffer lookup table
31 #include "postgres.h"
33 #include <sys/file.h>
34 #include <unistd.h>
36 #include "miscadmin.h"
37 #include "pg_trace.h"
38 #include "pgstat.h"
39 #include "postmaster/bgwriter.h"
40 #include "storage/buf_internals.h"
41 #include "storage/bufmgr.h"
42 #include "storage/ipc.h"
43 #include "storage/proc.h"
44 #include "storage/smgr.h"
45 #include "utils/rel.h"
46 #include "utils/resowner.h"
49 /* Note: these two macros only work on shared buffers, not local ones! */
50 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
51 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
53 /* Note: this macro only works on local buffers, not shared ones! */
54 #define LocalBufHdrGetBlock(bufHdr) \
55 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
57 /* Bits in SyncOneBuffer's return value */
58 #define BUF_WRITTEN 0x01
59 #define BUF_REUSABLE 0x02
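/*
 * Illustrative note (not part of the original file): callers interpret
 * SyncOneBuffer's result by testing these bits, e.g.
 *		if (SyncOneBuffer(buf_id, true) & BUF_WRITTEN) ...
 * as BufferSync() and BgBufferSync() do later in this file.
 */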
62 /* GUC variables */
63 bool zero_damaged_pages = false;
64 int bgwriter_lru_maxpages = 100;
65 double bgwriter_lru_multiplier = 2.0;
67 /* local state for StartBufferIO and related functions */
68 static volatile BufferDesc *InProgressBuf = NULL;
69 static bool IsForInput;
71 /* local state for LockBufferForCleanup */
72 static volatile BufferDesc *PinCountWaitBuf = NULL;
75 static Buffer ReadBuffer_relcache(Relation reln, ForkNumber forkNum,
76 BlockNumber blockNum, bool zeroPage, BufferAccessStrategy strategy);
77 static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
78 ForkNumber forkNum, BlockNumber blockNum,
79 bool zeroPage, BufferAccessStrategy strategy, bool *hit);
80 static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
81 static void PinBuffer_Locked(volatile BufferDesc *buf);
82 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
83 static void BufferSync(int flags);
84 static int SyncOneBuffer(int buf_id, bool skip_recently_used);
85 static void WaitIO(volatile BufferDesc *buf);
86 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
87 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
88 int set_flag_bits);
89 static void buffer_write_error_callback(void *arg);
90 static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
91 BlockNumber blockNum,
92 BufferAccessStrategy strategy,
93 bool *foundPtr);
94 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
95 static void AtProcExit_Buffers(int code, Datum arg);
99 * ReadBuffer -- returns a buffer containing the requested
100 * block of the requested relation. If the blknum
101 * requested is P_NEW, extend the relation file and
102 * allocate a new block. (Caller is responsible for
103 * ensuring that only one backend tries to extend a
104 * relation at the same time!)
106 * Returns: the buffer number for the buffer containing
107 * the block read. The returned buffer has been pinned.
108 * Does not return on error --- elog's instead.
110 * Assume when this function is called, that reln has been
111 * opened already.
113 Buffer
114 ReadBuffer(Relation reln, BlockNumber blockNum)
116 return ReadBuffer_relcache(reln, MAIN_FORKNUM, blockNum, false, NULL);
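/*
 * Illustrative sketch (not part of the original file): the typical caller
 * pattern for ReadBuffer, assuming an already-open Relation "rel" and an
 * existing block "blkno".  Compiled out via NOT_USED, like PrintBufferDescs
 * at the end of this file; names are hypothetical.
 */
#ifdef NOT_USED
static void
example_read_buffer(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);			/* find or read the page; pins it */
	LockBuffer(buf, BUFFER_LOCK_SHARE);		/* share-lock before inspecting */
	page = BufferGetPage(buf);
	/* ... examine tuples on "page" here ... */
	(void) page;
	UnlockReleaseBuffer(buf);				/* drop the content lock and the pin */
}
#endif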
120 * ReadBufferWithFork -- same as ReadBuffer, but for accessing relation
121 * forks other than MAIN_FORKNUM.
123 Buffer
124 ReadBufferWithFork(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
126 return ReadBuffer_relcache(reln, forkNum, blockNum, false, NULL);
130 * ReadBufferWithStrategy -- same as ReadBuffer, except caller can specify
131 * a nondefault buffer access strategy. See buffer/README for details.
133 Buffer
134 ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
135 BufferAccessStrategy strategy)
137 return ReadBuffer_relcache(reln, MAIN_FORKNUM, blockNum, false, strategy);
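/*
 * Illustrative sketch (not part of the original file): using a nondefault
 * buffer access strategy for a large sequential read, so the scan recycles
 * a small ring of buffers instead of flooding shared buffers.  Names are
 * hypothetical; compiled out via NOT_USED.
 */
#ifdef NOT_USED
static void
example_bulk_read(Relation rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferWithStrategy(rel, blkno, strategy);

		/* ... process the page ... */
		ReleaseBuffer(buf);
	}
	FreeAccessStrategy(strategy);
}
#endif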
141 * ReadOrZeroBuffer -- like ReadBuffer, but if the page isn't in buffer
142 * cache already, it's filled with zeros instead of reading it from
143 * disk. Useful when the caller intends to fill the page from scratch,
144 * since this saves I/O and avoids unnecessary failure if the
145 * page-on-disk has corrupt page headers.
147 * Caution: do not use this to read a page that is beyond the relation's
148 * current physical EOF; that is likely to cause problems in md.c when
149 * the page is modified and written out. P_NEW is OK, though.
151 Buffer
152 ReadOrZeroBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
154 return ReadBuffer_relcache(reln, forkNum, blockNum, true, NULL);
158 * ReadBufferWithoutRelcache -- like ReadBuffer, but doesn't require a
159 * relcache entry for the relation. If zeroPage is true, this behaves
160 * like ReadOrZeroBuffer rather than ReadBuffer.
162 Buffer
163 ReadBufferWithoutRelcache(RelFileNode rnode, bool isTemp,
164 ForkNumber forkNum, BlockNumber blockNum, bool zeroPage)
166 bool hit;
168 SMgrRelation smgr = smgropen(rnode);
169 return ReadBuffer_common(smgr, isTemp, forkNum, blockNum, zeroPage, NULL, &hit);
173 * ReadBuffer_relcache -- common logic for ReadBuffer-variants that
174 * operate on a Relation.
176 static Buffer
177 ReadBuffer_relcache(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
178 bool zeroPage, BufferAccessStrategy strategy)
180 bool hit;
181 Buffer buf;
183 /* Open it at the smgr level if not already done */
184 RelationOpenSmgr(reln);
187 * Read the buffer, and update pgstat counters to reflect a cache
188 * hit or miss.
190 pgstat_count_buffer_read(reln);
191 buf = ReadBuffer_common(reln->rd_smgr, reln->rd_istemp, forkNum, blockNum,
192 zeroPage, strategy, &hit);
193 if (hit)
194 pgstat_count_buffer_hit(reln);
195 return buf;
199 * ReadBuffer_common -- common logic for all ReadBuffer variants
201 * *hit is set to true if the request was satisfied from shared buffer cache.
203 static Buffer
204 ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
205 BlockNumber blockNum, bool zeroPage,
206 BufferAccessStrategy strategy, bool *hit)
208 volatile BufferDesc *bufHdr;
209 Block bufBlock;
210 bool found;
211 bool isExtend;
213 *hit = false;
215 /* Make sure we will have room to remember the buffer pin */
216 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
218 isExtend = (blockNum == P_NEW);
220 /* Substitute proper block number if caller asked for P_NEW */
221 if (isExtend)
222 blockNum = smgrnblocks(smgr, forkNum);
224 TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, smgr->smgr_rnode.spcNode,
225 smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode, isLocalBuf);
227 if (isLocalBuf)
229 ReadLocalBufferCount++;
230 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
231 if (found)
233 LocalBufferHitCount++;
234 TRACE_POSTGRESQL_BUFFER_HIT(true); /* true == local buffer */
236 else
238 TRACE_POSTGRESQL_BUFFER_MISS(true); /* ditto */
241 else
243 ReadBufferCount++;
246 * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
247 * not currently in memory.
249 bufHdr = BufferAlloc(smgr, forkNum, blockNum, strategy, &found);
250 if (found)
252 BufferHitCount++;
253 TRACE_POSTGRESQL_BUFFER_HIT(false); /* false != local buffer */
255 else
257 TRACE_POSTGRESQL_BUFFER_MISS(false); /* ditto */
261 /* At this point we do NOT hold any locks. */
263 /* if it was already in the buffer pool, we're done */
264 if (found)
266 if (!isExtend)
268 /* Just need to update stats before we exit */
269 *hit = true;
271 if (VacuumCostActive)
272 VacuumCostBalance += VacuumCostPageHit;
274 TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum,
275 smgr->smgr_rnode.spcNode,
276 smgr->smgr_rnode.dbNode,
277 smgr->smgr_rnode.relNode, isLocalBuf, found);
279 return BufferDescriptorGetBuffer(bufHdr);
283 * We get here only in the corner case where we are trying to extend
284 * the relation but we found a pre-existing buffer marked BM_VALID.
285 * This can happen because mdread doesn't complain about reads beyond
286 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
287 * read a block beyond EOF could have left a "valid" zero-filled
288 * buffer. Unfortunately, we have also seen this case occurring
289 * because of buggy Linux kernels that sometimes return an
290 * lseek(SEEK_END) result that doesn't account for a recent write. In
291 * that situation, the pre-existing buffer would contain valid data
292 * that we don't want to overwrite. Since the legitimate case should
293 * always have left a zero-filled buffer, complain if not PageIsNew.
295 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
296 if (!PageIsNew((Page) bufBlock))
297 ereport(ERROR,
298 (errmsg("unexpected data beyond EOF in block %u of relation %u/%u/%u",
299 blockNum, smgr->smgr_rnode.spcNode, smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode),
300 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
303 * We *must* do smgrextend before succeeding, else the page will not
304 * be reserved by the kernel, and the next P_NEW call will decide to
305 * return the same page. Clear the BM_VALID bit, do the StartBufferIO
306 * call that BufferAlloc didn't, and proceed.
308 if (isLocalBuf)
310 /* Only need to adjust flags */
311 Assert(bufHdr->flags & BM_VALID);
312 bufHdr->flags &= ~BM_VALID;
314 else
317 * Loop to handle the very small possibility that someone re-sets
318 * BM_VALID between our clearing it and StartBufferIO inspecting
319 * it.
323 LockBufHdr(bufHdr);
324 Assert(bufHdr->flags & BM_VALID);
325 bufHdr->flags &= ~BM_VALID;
326 UnlockBufHdr(bufHdr);
327 } while (!StartBufferIO(bufHdr, true));
332 * if we have gotten to this point, we have allocated a buffer for the
333 * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
334 * if it's a shared buffer.
336 * Note: if smgrextend fails, we will end up with a buffer that is
337 * allocated but not marked BM_VALID. P_NEW will still select the same
338 * block number (because the relation didn't get any longer on disk) and
339 * so future attempts to extend the relation will find the same buffer (if
340 * it's not been recycled) but come right back here to try smgrextend
341 * again.
343 Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
345 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
347 if (isExtend)
349 /* new buffers are zero-filled */
350 MemSet((char *) bufBlock, 0, BLCKSZ);
351 smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, isLocalBuf);
353 else
356 * Read in the page, unless the caller intends to overwrite it and
357 * just wants us to allocate a buffer.
359 if (zeroPage)
360 MemSet((char *) bufBlock, 0, BLCKSZ);
361 else
363 smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
365 /* check for garbage data */
366 if (!PageHeaderIsValid((PageHeader) bufBlock))
368 if (zero_damaged_pages)
370 ereport(WARNING,
371 (errcode(ERRCODE_DATA_CORRUPTED),
372 errmsg("invalid page header in block %u of relation %u/%u/%u; zeroing out page",
373 blockNum,
374 smgr->smgr_rnode.spcNode,
375 smgr->smgr_rnode.dbNode,
376 smgr->smgr_rnode.relNode)));
377 MemSet((char *) bufBlock, 0, BLCKSZ);
379 else
380 ereport(ERROR,
381 (errcode(ERRCODE_DATA_CORRUPTED),
382 errmsg("invalid page header in block %u of relation %u/%u/%u",
383 blockNum, smgr->smgr_rnode.spcNode,
384 smgr->smgr_rnode.dbNode,
385 smgr->smgr_rnode.relNode)));
390 if (isLocalBuf)
392 /* Only need to adjust flags */
393 bufHdr->flags |= BM_VALID;
395 else
397 /* Set BM_VALID, terminate IO, and wake up any waiters */
398 TerminateBufferIO(bufHdr, false, BM_VALID);
401 if (VacuumCostActive)
402 VacuumCostBalance += VacuumCostPageMiss;
404 TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, smgr->smgr_rnode.spcNode,
405 smgr->smgr_rnode.dbNode, smgr->smgr_rnode.relNode,
406 isLocalBuf, found);
408 return BufferDescriptorGetBuffer(bufHdr);
412 * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
413 * buffer. If no buffer exists already, selects a replacement
414 * victim and evicts the old page, but does NOT read in new page.
416 * "strategy" can be a buffer replacement strategy object, or NULL for
417 * the default strategy. The selected buffer's usage_count is advanced when
418 * using the default strategy, but otherwise possibly not (see PinBuffer).
420 * The returned buffer is pinned and is already marked as holding the
421 * desired page. If it already did have the desired page, *foundPtr is
422 * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
423 * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
425 * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
426 * we keep it for simplicity in ReadBuffer.
428 * No locks are held either at entry or exit.
430 static volatile BufferDesc *
431 BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
432 BlockNumber blockNum,
433 BufferAccessStrategy strategy,
434 bool *foundPtr)
436 BufferTag newTag; /* identity of requested block */
437 uint32 newHash; /* hash value for newTag */
438 LWLockId newPartitionLock; /* buffer partition lock for it */
439 BufferTag oldTag; /* previous identity of selected buffer */
440 uint32 oldHash; /* hash value for oldTag */
441 LWLockId oldPartitionLock; /* buffer partition lock for it */
442 BufFlags oldFlags;
443 int buf_id;
444 volatile BufferDesc *buf;
445 bool valid;
447 /* create a tag so we can lookup the buffer */
448 INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
450 /* determine its hash code and partition lock ID */
451 newHash = BufTableHashCode(&newTag);
452 newPartitionLock = BufMappingPartitionLock(newHash);
454 /* see if the block is in the buffer pool already */
455 LWLockAcquire(newPartitionLock, LW_SHARED);
456 buf_id = BufTableLookup(&newTag, newHash);
457 if (buf_id >= 0)
460 * Found it. Now, pin the buffer so no one can steal it from the
461 * buffer pool, and check to see if the correct data has been loaded
462 * into the buffer.
464 buf = &BufferDescriptors[buf_id];
466 valid = PinBuffer(buf, strategy);
468 /* Can release the mapping lock as soon as we've pinned it */
469 LWLockRelease(newPartitionLock);
471 *foundPtr = TRUE;
473 if (!valid)
476 * We can only get here if (a) someone else is still reading in
477 * the page, or (b) a previous read attempt failed. We have to
478 * wait for any active read attempt to finish, and then set up our
479 * own read attempt if the page is still not BM_VALID.
480 * StartBufferIO does it all.
482 if (StartBufferIO(buf, true))
485 * If we get here, previous attempts to read the buffer must
486 * have failed ... but we shall bravely try again.
488 *foundPtr = FALSE;
492 return buf;
496 * Didn't find it in the buffer pool. We'll have to initialize a new
497 * buffer. Remember to unlock the mapping lock while doing the work.
499 LWLockRelease(newPartitionLock);
501 /* Loop here in case we have to try another victim buffer */
502 for (;;)
504 bool lock_held;
507 * Select a victim buffer. The buffer is returned with its header
508 * spinlock still held! Also (in most cases) the BufFreelistLock is
509 * still held, since it would be bad to hold the spinlock while
510 * possibly waking up other processes.
512 buf = StrategyGetBuffer(strategy, &lock_held);
514 Assert(buf->refcount == 0);
516 /* Must copy buffer flags while we still hold the spinlock */
517 oldFlags = buf->flags;
519 /* Pin the buffer and then release the buffer spinlock */
520 PinBuffer_Locked(buf);
522 /* Now it's safe to release the freelist lock */
523 if (lock_held)
524 LWLockRelease(BufFreelistLock);
527 * If the buffer was dirty, try to write it out. There is a race
528 * condition here, in that someone might dirty it after we released it
529 * above, or even while we are writing it out (since our share-lock
530 * won't prevent hint-bit updates). We will recheck the dirty bit
531 * after re-locking the buffer header.
533 if (oldFlags & BM_DIRTY)
536 * We need a share-lock on the buffer contents to write it out
537 * (else we might write invalid data, eg because someone else is
538 * compacting the page contents while we write). We must use a
539 * conditional lock acquisition here to avoid deadlock. Even
540 * though the buffer was not pinned (and therefore surely not
541 * locked) when StrategyGetBuffer returned it, someone else could
542 * have pinned and exclusive-locked it by the time we get here. If
543 * we try to get the lock unconditionally, we'd block waiting for
544 * them; if they later block waiting for us, deadlock ensues.
545 * (This has been observed to happen when two backends are both
546 * trying to split btree index pages, and the second one just
547 * happens to be trying to split the page the first one got from
548 * StrategyGetBuffer.)
550 if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
553 * If using a nondefault strategy, and writing the buffer
554 * would require a WAL flush, let the strategy decide whether
555 * to go ahead and write/reuse the buffer or to choose another
556 * victim. We need lock to inspect the page LSN, so this
557 * can't be done inside StrategyGetBuffer.
559 if (strategy != NULL &&
560 XLogNeedsFlush(BufferGetLSN(buf)) &&
561 StrategyRejectBuffer(strategy, buf))
563 /* Drop lock/pin and loop around for another buffer */
564 LWLockRelease(buf->content_lock);
565 UnpinBuffer(buf, true);
566 continue;
569 /* OK, do the I/O */
570 FlushBuffer(buf, NULL);
571 LWLockRelease(buf->content_lock);
573 else
576 * Someone else has locked the buffer, so give it up and loop
577 * back to get another one.
579 UnpinBuffer(buf, true);
580 continue;
585 * To change the association of a valid buffer, we'll need to have
586 * exclusive lock on both the old and new mapping partitions.
588 if (oldFlags & BM_TAG_VALID)
591 * Need to compute the old tag's hashcode and partition lock ID.
592 * XXX is it worth storing the hashcode in BufferDesc so we need
593 * not recompute it here? Probably not.
595 oldTag = buf->tag;
596 oldHash = BufTableHashCode(&oldTag);
597 oldPartitionLock = BufMappingPartitionLock(oldHash);
600 * Must lock the lower-numbered partition first to avoid
601 * deadlocks.
603 if (oldPartitionLock < newPartitionLock)
605 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
606 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
608 else if (oldPartitionLock > newPartitionLock)
610 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
611 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
613 else
615 /* only one partition, only one lock */
616 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
619 else
621 /* if it wasn't valid, we need only the new partition */
622 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
623 /* these just keep the compiler quiet about uninit variables */
624 oldHash = 0;
625 oldPartitionLock = 0;
629 * Try to make a hashtable entry for the buffer under its new tag.
630 * This could fail because while we were writing someone else
631 * allocated another buffer for the same block we want to read in.
632 * Note that we have not yet removed the hashtable entry for the old
633 * tag.
635 buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
637 if (buf_id >= 0)
640 * Got a collision. Someone has already done what we were about to
641 * do. We'll just handle this as if it were found in the buffer
642 * pool in the first place. First, give up the buffer we were
643 * planning to use.
645 UnpinBuffer(buf, true);
647 /* Can give up that buffer's mapping partition lock now */
648 if ((oldFlags & BM_TAG_VALID) &&
649 oldPartitionLock != newPartitionLock)
650 LWLockRelease(oldPartitionLock);
652 /* remaining code should match code at top of routine */
654 buf = &BufferDescriptors[buf_id];
656 valid = PinBuffer(buf, strategy);
658 /* Can release the mapping lock as soon as we've pinned it */
659 LWLockRelease(newPartitionLock);
661 *foundPtr = TRUE;
663 if (!valid)
666 * We can only get here if (a) someone else is still reading
667 * in the page, or (b) a previous read attempt failed. We
668 * have to wait for any active read attempt to finish, and
669 * then set up our own read attempt if the page is still not
670 * BM_VALID. StartBufferIO does it all.
672 if (StartBufferIO(buf, true))
675 * If we get here, previous attempts to read the buffer
676 * must have failed ... but we shall bravely try again.
678 *foundPtr = FALSE;
682 return buf;
686 * Need to lock the buffer header too in order to change its tag.
688 LockBufHdr(buf);
691 * Somebody could have pinned or re-dirtied the buffer while we were
692 * doing the I/O and making the new hashtable entry. If so, we can't
693 * recycle this buffer; we must undo everything we've done and start
694 * over with a new victim buffer.
696 oldFlags = buf->flags;
697 if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
698 break;
700 UnlockBufHdr(buf);
701 BufTableDelete(&newTag, newHash);
702 if ((oldFlags & BM_TAG_VALID) &&
703 oldPartitionLock != newPartitionLock)
704 LWLockRelease(oldPartitionLock);
705 LWLockRelease(newPartitionLock);
706 UnpinBuffer(buf, true);
710 * Okay, it's finally safe to rename the buffer.
712 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
713 * paranoia. We also reset the usage_count since any recency of use of
714 * the old content is no longer relevant. (The usage_count starts out at
715 * 1 so that the buffer can survive one clock-sweep pass.)
717 buf->tag = newTag;
718 buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
719 buf->flags |= BM_TAG_VALID;
720 buf->usage_count = 1;
722 UnlockBufHdr(buf);
724 if (oldFlags & BM_TAG_VALID)
726 BufTableDelete(&oldTag, oldHash);
727 if (oldPartitionLock != newPartitionLock)
728 LWLockRelease(oldPartitionLock);
731 LWLockRelease(newPartitionLock);
734 * Buffer contents are currently invalid. Try to get the io_in_progress
735 * lock. If StartBufferIO returns false, then someone else managed to
736 * read it before we did, so there's nothing left for BufferAlloc() to do.
738 if (StartBufferIO(buf, true))
739 *foundPtr = FALSE;
740 else
741 *foundPtr = TRUE;
743 return buf;
747 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
748 * freelist.
750 * The buffer header spinlock must be held at entry. We drop it before
751 * returning. (This is sane because the caller must have locked the
752 * buffer in order to be sure it should be dropped.)
754 * This is used only in contexts such as dropping a relation. We assume
755 * that no other backend could possibly be interested in using the page,
756 * so the only reason the buffer might be pinned is if someone else is
757 * trying to write it out. We have to let them finish before we can
758 * reclaim the buffer.
760 * The buffer could get reclaimed by someone else while we are waiting
761 * to acquire the necessary locks; if so, don't mess it up.
763 static void
764 InvalidateBuffer(volatile BufferDesc *buf)
766 BufferTag oldTag;
767 uint32 oldHash; /* hash value for oldTag */
768 LWLockId oldPartitionLock; /* buffer partition lock for it */
769 BufFlags oldFlags;
771 /* Save the original buffer tag before dropping the spinlock */
772 oldTag = buf->tag;
774 UnlockBufHdr(buf);
777 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
778 * worth storing the hashcode in BufferDesc so we need not recompute it
779 * here? Probably not.
781 oldHash = BufTableHashCode(&oldTag);
782 oldPartitionLock = BufMappingPartitionLock(oldHash);
784 retry:
787 * Acquire exclusive mapping lock in preparation for changing the buffer's
788 * association.
790 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
792 /* Re-lock the buffer header */
793 LockBufHdr(buf);
795 /* If it's changed while we were waiting for lock, do nothing */
796 if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
798 UnlockBufHdr(buf);
799 LWLockRelease(oldPartitionLock);
800 return;
804 * We assume the only reason for it to be pinned is that someone else is
805 * flushing the page out. Wait for them to finish. (This could be an
806 * infinite loop if the refcount is messed up... it would be nice to time
807 * out after awhile, but there seems no way to be sure how many loops may
808 * be needed. Note that if the other guy has pinned the buffer but not
809 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
810 * be busy-looping here.)
812 if (buf->refcount != 0)
814 UnlockBufHdr(buf);
815 LWLockRelease(oldPartitionLock);
816 /* safety check: should definitely not be our *own* pin */
817 if (PrivateRefCount[buf->buf_id] != 0)
818 elog(ERROR, "buffer is pinned in InvalidateBuffer");
819 WaitIO(buf);
820 goto retry;
824 * Clear out the buffer's tag and flags. We must do this to ensure that
825 * linear scans of the buffer array don't think the buffer is valid.
827 oldFlags = buf->flags;
828 CLEAR_BUFFERTAG(buf->tag);
829 buf->flags = 0;
830 buf->usage_count = 0;
832 UnlockBufHdr(buf);
835 * Remove the buffer from the lookup hashtable, if it was in there.
837 if (oldFlags & BM_TAG_VALID)
838 BufTableDelete(&oldTag, oldHash);
841 * Done with mapping lock.
843 LWLockRelease(oldPartitionLock);
846 * Insert the buffer at the head of the list of free buffers.
848 StrategyFreeBuffer(buf);
852 * MarkBufferDirty
854 * Marks buffer contents as dirty (actual write happens later).
856 * Buffer must be pinned and exclusive-locked. (If caller does not hold
857 * exclusive lock, then somebody could be in process of writing the buffer,
858 * leading to risk of bad data written to disk.)
860 void
861 MarkBufferDirty(Buffer buffer)
863 volatile BufferDesc *bufHdr;
865 if (!BufferIsValid(buffer))
866 elog(ERROR, "bad buffer id: %d", buffer);
868 if (BufferIsLocal(buffer))
870 MarkLocalBufferDirty(buffer);
871 return;
874 bufHdr = &BufferDescriptors[buffer - 1];
876 Assert(PrivateRefCount[buffer - 1] > 0);
877 /* unfortunately we can't check if the lock is held exclusively */
878 Assert(LWLockHeldByMe(bufHdr->content_lock));
880 LockBufHdr(bufHdr);
882 Assert(bufHdr->refcount > 0);
885 * If the buffer was not dirty already, do vacuum cost accounting.
887 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
888 VacuumCostBalance += VacuumCostPageDirty;
890 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
892 UnlockBufHdr(bufHdr);
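/*
 * Illustrative sketch (not part of the original file): the usual modify
 * sequence, assuming the caller already holds a pin on "buf" and emits the
 * corresponding WAL record itself.  Compiled out via NOT_USED.
 */
#ifdef NOT_USED
static void
example_dirty_buffer(Buffer buf)
{
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	/* exclusive lock is required */
	/* ... modify the page contents and WAL-log the change here ... */
	MarkBufferDirty(buf);					/* the physical write happens later */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);	/* keep the pin, drop the lock */
}
#endif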
896 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
898 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
899 * compared to calling the two routines separately. Now it's mainly just
900 * a convenience function. However, if the passed buffer is valid and
901 * already contains the desired block, we just return it as-is; and that
902 * does save considerable work compared to a full release and reacquire.
904 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
905 * buffer actually needs to be released. This case is the same as ReadBuffer,
906 * but can save some tests in the caller.
908 Buffer
909 ReleaseAndReadBuffer(Buffer buffer,
910 Relation relation,
911 BlockNumber blockNum)
913 ForkNumber forkNum = MAIN_FORKNUM;
914 volatile BufferDesc *bufHdr;
916 if (BufferIsValid(buffer))
918 if (BufferIsLocal(buffer))
920 Assert(LocalRefCount[-buffer - 1] > 0);
921 bufHdr = &LocalBufferDescriptors[-buffer - 1];
922 if (bufHdr->tag.blockNum == blockNum &&
923 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
924 bufHdr->tag.forkNum == forkNum)
925 return buffer;
926 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
927 LocalRefCount[-buffer - 1]--;
929 else
931 Assert(PrivateRefCount[buffer - 1] > 0);
932 bufHdr = &BufferDescriptors[buffer - 1];
933 /* we have pin, so it's ok to examine tag without spinlock */
934 if (bufHdr->tag.blockNum == blockNum &&
935 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
936 bufHdr->tag.forkNum == forkNum)
937 return buffer;
938 UnpinBuffer(bufHdr, true);
942 return ReadBuffer(relation, blockNum);
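/*
 * Illustrative sketch (not part of the original file): the scan pattern the
 * comment above describes, starting with InvalidBuffer so the first
 * iteration needs no release.  Names are hypothetical; compiled out via
 * NOT_USED.
 */
#ifdef NOT_USED
static void
example_scan_with_reuse(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... examine the page under an appropriate content lock ... */
	}
	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}
#endif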
946 * PinBuffer -- make buffer unavailable for replacement.
948 * For the default access strategy, the buffer's usage_count is incremented
949 * when we first pin it; for other strategies we just make sure the usage_count
950 * isn't zero. (The idea of the latter is that we don't want synchronized
951 * heap scans to inflate the count, but we need it to not be zero to discourage
952 * other backends from stealing buffers from our ring. As long as we cycle
953 * through the ring faster than the global clock-sweep cycles, buffers in
954 * our ring won't be chosen as victims for replacement by other backends.)
956 * This should be applied only to shared buffers, never local ones.
958 * Note that ResourceOwnerEnlargeBuffers must have been done already.
960 * Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
961 * some callers to avoid an extra spinlock cycle.
963 static bool
964 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
966 int b = buf->buf_id;
967 bool result;
969 if (PrivateRefCount[b] == 0)
971 LockBufHdr(buf);
972 buf->refcount++;
973 if (strategy == NULL)
975 if (buf->usage_count < BM_MAX_USAGE_COUNT)
976 buf->usage_count++;
978 else
980 if (buf->usage_count == 0)
981 buf->usage_count = 1;
983 result = (buf->flags & BM_VALID) != 0;
984 UnlockBufHdr(buf);
986 else
988 /* If we previously pinned the buffer, it must surely be valid */
989 result = true;
991 PrivateRefCount[b]++;
992 Assert(PrivateRefCount[b] > 0);
993 ResourceOwnerRememberBuffer(CurrentResourceOwner,
994 BufferDescriptorGetBuffer(buf));
995 return result;
999 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1000 * The spinlock is released before return.
1002 * Currently, no callers of this function want to modify the buffer's
1003 * usage_count at all, so there's no need for a strategy parameter.
1004 * Also we don't bother with a BM_VALID test (the caller could check that for
1005 * itself).
1007 * Note: use of this routine is frequently mandatory, not just an optimization
1008 * to save a spin lock/unlock cycle, because we need to pin a buffer before
1009 * its state can change under us.
1011 static void
1012 PinBuffer_Locked(volatile BufferDesc *buf)
1014 int b = buf->buf_id;
1016 if (PrivateRefCount[b] == 0)
1017 buf->refcount++;
1018 UnlockBufHdr(buf);
1019 PrivateRefCount[b]++;
1020 Assert(PrivateRefCount[b] > 0);
1021 ResourceOwnerRememberBuffer(CurrentResourceOwner,
1022 BufferDescriptorGetBuffer(buf));
1026 * UnpinBuffer -- make buffer available for replacement.
1028 * This should be applied only to shared buffers, never local ones.
1030 * Most but not all callers want CurrentResourceOwner to be adjusted.
1031 * Those that don't should pass fixOwner = FALSE.
1033 static void
1034 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
1036 int b = buf->buf_id;
1038 if (fixOwner)
1039 ResourceOwnerForgetBuffer(CurrentResourceOwner,
1040 BufferDescriptorGetBuffer(buf));
1042 Assert(PrivateRefCount[b] > 0);
1043 PrivateRefCount[b]--;
1044 if (PrivateRefCount[b] == 0)
1046 /* I'd better not still hold any locks on the buffer */
1047 Assert(!LWLockHeldByMe(buf->content_lock));
1048 Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
1050 LockBufHdr(buf);
1052 /* Decrement the shared reference count */
1053 Assert(buf->refcount > 0);
1054 buf->refcount--;
1056 /* Support LockBufferForCleanup() */
1057 if ((buf->flags & BM_PIN_COUNT_WAITER) &&
1058 buf->refcount == 1)
1060 /* we just released the last pin other than the waiter's */
1061 int wait_backend_pid = buf->wait_backend_pid;
1063 buf->flags &= ~BM_PIN_COUNT_WAITER;
1064 UnlockBufHdr(buf);
1065 ProcSendSignal(wait_backend_pid);
1067 else
1068 UnlockBufHdr(buf);
1073 * BufferSync -- Write out all dirty buffers in the pool.
1075 * This is called at checkpoint time to write out all dirty shared buffers.
1076 * The checkpoint request flags should be passed in; currently the only one
1077 * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
1079 static void
1080 BufferSync(int flags)
1082 int buf_id;
1083 int num_to_scan;
1084 int num_to_write;
1085 int num_written;
1087 /* Make sure we can handle the pin inside SyncOneBuffer */
1088 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1091 * Loop over all buffers, and mark the ones that need to be written with
1092 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
1093 * can estimate how much work needs to be done.
1095 * This allows us to write only those pages that were dirty when the
1096 * checkpoint began, and not those that get dirtied while it proceeds.
1097 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1098 * later in this function, or by normal backends or the bgwriter cleaning
1099 * scan, the flag is cleared. Any buffer dirtied after this point won't
1100 * have the flag set.
1102 * Note that if we fail to write some buffer, we may leave buffers with
1103 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1104 * certainly need to be written for the next checkpoint attempt, too.
1106 num_to_write = 0;
1107 for (buf_id = 0; buf_id < NBuffers; buf_id++)
1109 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1112 * Header spinlock is enough to examine BM_DIRTY, see comment in
1113 * SyncOneBuffer.
1115 LockBufHdr(bufHdr);
1117 if (bufHdr->flags & BM_DIRTY)
1119 bufHdr->flags |= BM_CHECKPOINT_NEEDED;
1120 num_to_write++;
1123 UnlockBufHdr(bufHdr);
1126 if (num_to_write == 0)
1127 return; /* nothing to do */
1129 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
1132 * Loop over all buffers again, and write the ones (still) marked with
1133 * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
1134 * since we might as well dump soon-to-be-recycled buffers first.
1136 * Note that we don't read the buffer alloc count here --- that should be
1137 * left untouched till the next BgBufferSync() call.
1139 buf_id = StrategySyncStart(NULL, NULL);
1140 num_to_scan = NBuffers;
1141 num_written = 0;
1142 while (num_to_scan-- > 0)
1144 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1147 * We don't need to acquire the lock here, because we're only looking
1148 * at a single bit. It's possible that someone else writes the buffer
1149 * and clears the flag right after we check, but that doesn't matter
1150 * since SyncOneBuffer will then do nothing. However, there is a
1151 * further race condition: it's conceivable that between the time we
1152 * examine the bit here and the time SyncOneBuffer acquires lock,
1153 * someone else not only wrote the buffer but replaced it with another
1154 * page and dirtied it. In that improbable case, SyncOneBuffer will
1155 * write the buffer though we didn't need to. It doesn't seem worth
1156 * guarding against this, though.
1158 if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
1160 if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
1162 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1163 BgWriterStats.m_buf_written_checkpoints++;
1164 num_written++;
1167 * We know there are at most num_to_write buffers with
1168 * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
1169 * num_written reaches num_to_write.
1171 * Note that num_written doesn't include buffers written by
1172 * other backends, or by the bgwriter cleaning scan. That
1173 * means that the estimate of how much progress we've made is
1174 * conservative, and also that this test will often fail to
1175 * trigger. But it seems worth making anyway.
1177 if (num_written >= num_to_write)
1178 break;
1181 * Perform normal bgwriter duties and sleep to throttle our
1182 * I/O rate.
1184 CheckpointWriteDelay(flags,
1185 (double) num_written / num_to_write);
1189 if (++buf_id >= NBuffers)
1190 buf_id = 0;
1193 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
1196 * Update checkpoint statistics. As noted above, this doesn't include
1197 * buffers written by other backends or bgwriter scan.
1199 CheckpointStats.ckpt_bufs_written += num_written;
1203 * BgBufferSync -- Write out some dirty buffers in the pool.
1205 * This is called periodically by the background writer process.
1207 void
1208 BgBufferSync(void)
1210 /* info obtained from freelist.c */
1211 int strategy_buf_id;
1212 uint32 strategy_passes;
1213 uint32 recent_alloc;
1216 * Information saved between calls so we can determine the strategy
1217 * point's advance rate and avoid scanning already-cleaned buffers.
1219 static bool saved_info_valid = false;
1220 static int prev_strategy_buf_id;
1221 static uint32 prev_strategy_passes;
1222 static int next_to_clean;
1223 static uint32 next_passes;
1225 /* Moving averages of allocation rate and clean-buffer density */
1226 static float smoothed_alloc = 0;
1227 static float smoothed_density = 10.0;
1229 /* Potentially these could be tunables, but for now, not */
1230 float smoothing_samples = 16;
1231 float scan_whole_pool_milliseconds = 120000.0;
1233 /* Used to compute how far we scan ahead */
1234 long strategy_delta;
1235 int bufs_to_lap;
1236 int bufs_ahead;
1237 float scans_per_alloc;
1238 int reusable_buffers_est;
1239 int upcoming_alloc_est;
1240 int min_scan_buffers;
1242 /* Variables for the scanning loop proper */
1243 int num_to_scan;
1244 int num_written;
1245 int reusable_buffers;
1248 * Find out where the freelist clock sweep currently is, and how many
1249 * buffer allocations have happened since our last call.
1251 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
1253 /* Report buffer alloc counts to pgstat */
1254 BgWriterStats.m_buf_alloc += recent_alloc;
1257 * If we're not running the LRU scan, just stop after doing the stats
1258 * stuff. We mark the saved state invalid so that we can recover sanely
1259 * if LRU scan is turned back on later.
1261 if (bgwriter_lru_maxpages <= 0)
1263 saved_info_valid = false;
1264 return;
1268 * Compute strategy_delta = how many buffers have been scanned by the
1269 * clock sweep since last time. If first time through, assume none. Then
1270 * see if we are still ahead of the clock sweep, and if so, how many
1271 * buffers we could scan before we'd catch up with it and "lap" it. Note:
1272 * weird-looking coding of xxx_passes comparisons are to avoid bogus
1273 * behavior when the passes counts wrap around.
1275 if (saved_info_valid)
1277 int32 passes_delta = strategy_passes - prev_strategy_passes;
1279 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
1280 strategy_delta += (long) passes_delta * NBuffers;
1282 Assert(strategy_delta >= 0);
1284 if ((int32) (next_passes - strategy_passes) > 0)
1286 /* we're one pass ahead of the strategy point */
1287 bufs_to_lap = strategy_buf_id - next_to_clean;
1288 #ifdef BGW_DEBUG
1289 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1290 next_passes, next_to_clean,
1291 strategy_passes, strategy_buf_id,
1292 strategy_delta, bufs_to_lap);
1293 #endif
1295 else if (next_passes == strategy_passes &&
1296 next_to_clean >= strategy_buf_id)
1298 /* on same pass, but ahead or at least not behind */
1299 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
1300 #ifdef BGW_DEBUG
1301 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1302 next_passes, next_to_clean,
1303 strategy_passes, strategy_buf_id,
1304 strategy_delta, bufs_to_lap);
1305 #endif
1307 else
1310 * We're behind, so skip forward to the strategy point and start
1311 * cleaning from there.
1313 #ifdef BGW_DEBUG
1314 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
1315 next_passes, next_to_clean,
1316 strategy_passes, strategy_buf_id,
1317 strategy_delta);
1318 #endif
1319 next_to_clean = strategy_buf_id;
1320 next_passes = strategy_passes;
1321 bufs_to_lap = NBuffers;
1324 else
1327 * Initializing at startup or after LRU scanning had been off. Always
1328 * start at the strategy point.
1330 #ifdef BGW_DEBUG
1331 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
1332 strategy_passes, strategy_buf_id);
1333 #endif
1334 strategy_delta = 0;
1335 next_to_clean = strategy_buf_id;
1336 next_passes = strategy_passes;
1337 bufs_to_lap = NBuffers;
1340 /* Update saved info for next time */
1341 prev_strategy_buf_id = strategy_buf_id;
1342 prev_strategy_passes = strategy_passes;
1343 saved_info_valid = true;
1346 * Compute how many buffers had to be scanned for each new allocation, ie,
1347 * 1/density of reusable buffers, and track a moving average of that.
1349 * If the strategy point didn't move, we don't update the density estimate
1351 if (strategy_delta > 0 && recent_alloc > 0)
1353 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1354 smoothed_density += (scans_per_alloc - smoothed_density) /
1355 smoothing_samples;
1359 * Estimate how many reusable buffers there are between the current
1360 * strategy point and where we've scanned ahead to, based on the smoothed
1361 * density estimate.
1363 bufs_ahead = NBuffers - bufs_to_lap;
1364 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
1367 * Track a moving average of recent buffer allocations. Here, rather than
1368 * a true average we want a fast-attack, slow-decline behavior: we
1369 * immediately follow any increase.
1371 if (smoothed_alloc <= (float) recent_alloc)
1372 smoothed_alloc = recent_alloc;
1373 else
1374 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
1375 smoothing_samples;
1377 /* Scale the estimate by a GUC to allow more aggressive tuning. */
1378 upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
1381 * Even in cases where there's been little or no buffer allocation
1382 * activity, we want to make a small amount of progress through the buffer
1383 * cache so that as many reusable buffers as possible are clean after an
1384 * idle period.
1386 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
1387 * the BGW will be called during the scan_whole_pool time; slice the
1388 * buffer pool into that many sections.
1390 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
1392 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
1394 #ifdef BGW_DEBUG
1395 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
1396 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
1397 #endif
1398 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
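	/*
	 * Worked example (illustrative numbers only, not part of the original
	 * file): with NBuffers = 16384, BgWriterDelay = 200 ms and the
	 * 120000 ms scan_whole_pool target, min_scan_buffers =
	 * 16384 / (120000 / 200) = ~27 buffers per round; with
	 * bgwriter_lru_multiplier = 2.0 and smoothed_alloc = 100,
	 * upcoming_alloc_est starts at 200 before the adjustment above.
	 */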
1402 * Now write out dirty reusable buffers, working forward from the
1403 * next_to_clean point, until we have lapped the strategy scan, or cleaned
1404 * enough buffers to match our estimate of the next cycle's allocation
1405 * requirements, or hit the bgwriter_lru_maxpages limit.
1408 /* Make sure we can handle the pin inside SyncOneBuffer */
1409 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1411 num_to_scan = bufs_to_lap;
1412 num_written = 0;
1413 reusable_buffers = reusable_buffers_est;
1415 /* Execute the LRU scan */
1416 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
1418 int buffer_state = SyncOneBuffer(next_to_clean, true);
1420 if (++next_to_clean >= NBuffers)
1422 next_to_clean = 0;
1423 next_passes++;
1425 num_to_scan--;
1427 if (buffer_state & BUF_WRITTEN)
1429 reusable_buffers++;
1430 if (++num_written >= bgwriter_lru_maxpages)
1432 BgWriterStats.m_maxwritten_clean++;
1433 break;
1436 else if (buffer_state & BUF_REUSABLE)
1437 reusable_buffers++;
1440 BgWriterStats.m_buf_written_clean += num_written;
1442 #ifdef BGW_DEBUG
1443 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
1444 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
1445 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
1446 bufs_to_lap - num_to_scan,
1447 num_written,
1448 reusable_buffers - reusable_buffers_est);
1449 #endif
1452 * Consider the above scan as being like a new allocation scan.
1453 * Characterize its density and update the smoothed one based on it. This
1454 * effectively halves the moving average period in cases where both the
1455 * strategy and the background writer are doing some useful scanning,
1456 * which is helpful because a long memory isn't as desirable on the
1457 * density estimates.
1459 strategy_delta = bufs_to_lap - num_to_scan;
1460 recent_alloc = reusable_buffers - reusable_buffers_est;
1461 if (strategy_delta > 0 && recent_alloc > 0)
1463 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1464 smoothed_density += (scans_per_alloc - smoothed_density) /
1465 smoothing_samples;
1467 #ifdef BGW_DEBUG
1468 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
1469 recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
1470 #endif
1475 * SyncOneBuffer -- process a single buffer during syncing.
1477 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
1478 * buffers marked recently used, as these are not replacement candidates.
1480 * Returns a bitmask containing the following flag bits:
1481 * BUF_WRITTEN: we wrote the buffer.
1482 * BUF_REUSABLE: buffer is available for replacement, ie, it has
1483 * pin count 0 and usage count 0.
1485 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
1486 * after locking it, but we don't care all that much.)
1488 * Note: caller must have done ResourceOwnerEnlargeBuffers.
1490 static int
1491 SyncOneBuffer(int buf_id, bool skip_recently_used)
1493 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1494 int result = 0;
1497 * Check whether buffer needs writing.
1499 * We can make this check without taking the buffer content lock so long
1500 * as we mark pages dirty in access methods *before* logging changes with
1501 * XLogInsert(): if someone marks the buffer dirty just after our check we
1502 * don't worry, because our checkpoint.redo points before the log record for
1503 * the upcoming changes and so we are not required to write such a dirty buffer.
1505 LockBufHdr(bufHdr);
1507 if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
1508 result |= BUF_REUSABLE;
1509 else if (skip_recently_used)
1511 /* Caller told us not to write recently-used buffers */
1512 UnlockBufHdr(bufHdr);
1513 return result;
1516 if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
1518 /* It's clean, so nothing to do */
1519 UnlockBufHdr(bufHdr);
1520 return result;
1524 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
1525 * buffer is clean by the time we've locked it.)
1527 PinBuffer_Locked(bufHdr);
1528 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
1530 FlushBuffer(bufHdr, NULL);
1532 LWLockRelease(bufHdr->content_lock);
1533 UnpinBuffer(bufHdr, true);
1535 return result | BUF_WRITTEN;
1540 * Return a palloc'd string containing buffer usage statistics.
1542 char *
1543 ShowBufferUsage(void)
1545 StringInfoData str;
1546 float hitrate;
1547 float localhitrate;
1549 initStringInfo(&str);
1551 if (ReadBufferCount == 0)
1552 hitrate = 0.0;
1553 else
1554 hitrate = (float) BufferHitCount * 100.0 / ReadBufferCount;
1556 if (ReadLocalBufferCount == 0)
1557 localhitrate = 0.0;
1558 else
1559 localhitrate = (float) LocalBufferHitCount * 100.0 / ReadLocalBufferCount;
1561 appendStringInfo(&str,
1562 "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1563 ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
1564 appendStringInfo(&str,
1565 "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1566 ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
1567 appendStringInfo(&str,
1568 "!\tDirect blocks: %10ld read, %10ld written\n",
1569 BufFileReadCount, BufFileWriteCount);
1571 return str.data;
1574 void
1575 ResetBufferUsage(void)
1577 BufferHitCount = 0;
1578 ReadBufferCount = 0;
1579 BufferFlushCount = 0;
1580 LocalBufferHitCount = 0;
1581 ReadLocalBufferCount = 0;
1582 LocalBufferFlushCount = 0;
1583 BufFileReadCount = 0;
1584 BufFileWriteCount = 0;
1588 * AtEOXact_Buffers - clean up at end of transaction.
1590 * As of PostgreSQL 8.0, buffer pins should get released by the
1591 * ResourceOwner mechanism. This routine is just a debugging
1592 * cross-check that no pins remain.
1594 void
1595 AtEOXact_Buffers(bool isCommit)
1597 #ifdef USE_ASSERT_CHECKING
1598 if (assert_enabled)
1600 int i;
1602 for (i = 0; i < NBuffers; i++)
1604 Assert(PrivateRefCount[i] == 0);
1607 #endif
1609 AtEOXact_LocalBuffers(isCommit);
1613 * InitBufferPoolBackend --- second-stage initialization of a new backend
1615 * This is called after we have acquired a PGPROC and so can safely get
1616 * LWLocks. We don't currently need to do anything at this stage ...
1617 * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
1618 * access, and thereby has to be called at the corresponding phase of
1619 * backend shutdown.
1621 void
1622 InitBufferPoolBackend(void)
1624 on_shmem_exit(AtProcExit_Buffers, 0);
1628 * Ensure we have released all shared-buffer locks and pins during backend exit
1630 static void
1631 AtProcExit_Buffers(int code, Datum arg)
1633 int i;
1635 AbortBufferIO();
1636 UnlockBuffers();
1638 for (i = 0; i < NBuffers; i++)
1640 if (PrivateRefCount[i] != 0)
1642 volatile BufferDesc *buf = &(BufferDescriptors[i]);
1645 * We don't worry about updating ResourceOwner; if we even got
1646 * here, it suggests that ResourceOwners are messed up.
1648 PrivateRefCount[i] = 1; /* make sure we release shared pin */
1649 UnpinBuffer(buf, false);
1650 Assert(PrivateRefCount[i] == 0);
1654 /* localbuf.c needs a chance too */
1655 AtProcExit_LocalBuffers();
1659 * Helper routine to issue warnings when a buffer is unexpectedly pinned
1661 void
1662 PrintBufferLeakWarning(Buffer buffer)
1664 volatile BufferDesc *buf;
1665 int32 loccount;
1667 Assert(BufferIsValid(buffer));
1668 if (BufferIsLocal(buffer))
1670 buf = &LocalBufferDescriptors[-buffer - 1];
1671 loccount = LocalRefCount[-buffer - 1];
1673 else
1675 buf = &BufferDescriptors[buffer - 1];
1676 loccount = PrivateRefCount[buffer - 1];
1679 /* theoretically we should lock the bufhdr here */
1680 elog(WARNING,
1681 "buffer refcount leak: [%03d] "
1682 "(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
1683 buffer,
1684 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1685 buf->tag.rnode.relNode,
1686 buf->tag.blockNum, buf->flags,
1687 buf->refcount, loccount);
1691 * CheckPointBuffers
1693 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
1695 * Note: temporary relations do not participate in checkpoints, so they don't
1696 * need to be flushed.
1698 void
1699 CheckPointBuffers(int flags)
1701 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
1702 CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
1703 BufferSync(flags);
1704 CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
1705 smgrsync();
1706 CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
1707 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
1712 * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
1714 void
1715 BufmgrCommit(void)
1717 /* Nothing to do in bufmgr anymore... */
1719 smgrcommit();
1723 * BufferGetBlockNumber
1724 * Returns the block number associated with a buffer.
1726 * Note:
1727 * Assumes that the buffer is valid and pinned, else the
1728 * value may be obsolete immediately...
1730 BlockNumber
1731 BufferGetBlockNumber(Buffer buffer)
1733 volatile BufferDesc *bufHdr;
1735 Assert(BufferIsPinned(buffer));
1737 if (BufferIsLocal(buffer))
1738 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1739 else
1740 bufHdr = &BufferDescriptors[buffer - 1];
1742 /* pinned, so OK to read tag without spinlock */
1743 return bufHdr->tag.blockNum;
1747 * BufferGetTag
1748 * Returns the relfilenode, fork number and block number associated with
1749 * a buffer.
1751 void
1752 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
1753 BlockNumber *blknum)
1755 volatile BufferDesc *bufHdr;
1757 /* Do the same checks as BufferGetBlockNumber. */
1758 Assert(BufferIsPinned(buffer));
1760 if (BufferIsLocal(buffer))
1761 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1762 else
1763 bufHdr = &BufferDescriptors[buffer - 1];
1765 /* pinned, so OK to read tag without spinlock */
1766 *rnode = bufHdr->tag.rnode;
1767 *forknum = bufHdr->tag.forkNum;
1768 *blknum = bufHdr->tag.blockNum;
1772 * FlushBuffer
1773 * Physically write out a shared buffer.
1775 * NOTE: this actually just passes the buffer contents to the kernel; the
1776 * real write to disk won't happen until the kernel feels like it. This
1777 * is okay from our point of view since we can redo the changes from WAL.
1778 * However, we will need to force the changes to disk via fsync before
1779 * we can checkpoint WAL.
1781 * The caller must hold a pin on the buffer and have share-locked the
1782 * buffer contents. (Note: a share-lock does not prevent updates of
1783 * hint bits in the buffer, so the page could change while the write
1784 * is in progress, but we assume that that will not invalidate the data
1785 * written.)
1787 * If the caller has an smgr reference for the buffer's relation, pass it
1788 * as the second parameter. If not, pass NULL.
1790 static void
1791 FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
1793 XLogRecPtr recptr;
1794 ErrorContextCallback errcontext;
1797 * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
1798 * false, then someone else flushed the buffer before we could, so we need
1799 * not do anything.
1801 if (!StartBufferIO(buf, false))
1802 return;
1804 /* Setup error traceback support for ereport() */
1805 errcontext.callback = buffer_write_error_callback;
1806 errcontext.arg = (void *) buf;
1807 errcontext.previous = error_context_stack;
1808 error_context_stack = &errcontext;
1810 /* Find smgr relation for buffer */
1811 if (reln == NULL)
1812 reln = smgropen(buf->tag.rnode);
1814 TRACE_POSTGRESQL_BUFFER_FLUSH_START(reln->smgr_rnode.spcNode,
1815 reln->smgr_rnode.dbNode,
1816 reln->smgr_rnode.relNode);
1819 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
1820 * rule that log updates must hit disk before any of the data-file changes
1821 * they describe do.
1823 recptr = BufferGetLSN(buf);
1824 XLogFlush(recptr);
1827 * Now it's safe to write buffer to disk. Note that no one else should
1828 * have been able to write it while we were busy with log flushing because
1829 * we have the io_in_progress lock.
1832 /* To check if block content changes while flushing. - vadim 01/17/97 */
1833 LockBufHdr(buf);
1834 buf->flags &= ~BM_JUST_DIRTIED;
1835 UnlockBufHdr(buf);
1837 smgrwrite(reln,
1838 buf->tag.forkNum,
1839 buf->tag.blockNum,
1840 (char *) BufHdrGetBlock(buf),
1841 false);
1843 BufferFlushCount++;
1845 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(reln->smgr_rnode.spcNode,
1846 reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
1849 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
1850 * end the io_in_progress state.
1852 TerminateBufferIO(buf, true, 0);
1854 /* Pop the error context stack */
1855 error_context_stack = errcontext.previous;
1859 * RelationGetNumberOfBlocks
1860 * Determines the current number of pages in the relation.
1862 BlockNumber
1863 RelationGetNumberOfBlocks(Relation relation)
1865 /* Open it at the smgr level if not already done */
1866 RelationOpenSmgr(relation);
1868 return smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
1872 * RelationTruncate
1873 * Physically truncate a relation to the specified number of blocks.
1875 * As of Postgres 8.1, this includes getting rid of any buffers for the
1876 * blocks that are to be dropped; previously, callers had to do that.
1878 void
1879 RelationTruncate(Relation rel, BlockNumber nblocks)
1881 /* Open it at the smgr level if not already done */
1882 RelationOpenSmgr(rel);
1884 /* Make sure rd_targblock isn't pointing somewhere past end */
1885 rel->rd_targblock = InvalidBlockNumber;
1887 /* Do the real work */
1888 smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
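/*
 * Illustrative sketch (hypothetical caller): cut a relation back to
 * new_nblocks pages.  RelationTruncate itself discards the buffers for
 * the dropped blocks, so the caller need not do that separately; the
 * caller is assumed to hold AccessExclusiveLock on the relation.
 */
#ifdef NOT_USED
static void
TruncateTailExample(Relation rel, BlockNumber new_nblocks)
{
	if (new_nblocks < RelationGetNumberOfBlocks(rel))
		RelationTruncate(rel, new_nblocks);
}
#endif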
1891 /* ---------------------------------------------------------------------
1892 * DropRelFileNodeBuffers
1894 * This function removes from the buffer pool all the pages of the
1895 * specified relation that have block numbers >= firstDelBlock.
1896 * (In particular, with firstDelBlock = 0, all pages are removed.)
1897 * Dirty pages are simply dropped, without bothering to write them
1898 * out first. Therefore, this is NOT rollback-able, and so should be
1899 * used only with extreme caution!
1901 * Currently, this is called only from smgr.c when the underlying file
1902 * is about to be deleted or truncated (firstDelBlock is needed for
1903 * the truncation case). The data in the affected pages would therefore
1904 * be deleted momentarily anyway, and there is no point in writing it.
1905 * It is the responsibility of higher-level code to ensure that the
1906 * deletion or truncation does not lose any data that could be needed
1907 * later. It is also the responsibility of higher-level code to ensure
1908 * that no other process could be trying to load more pages of the
1909 * relation into buffers.
1911 * XXX currently it sequentially searches the buffer pool, should be
1912 * changed to more clever ways of searching. However, this routine
1913 * is used only in code paths that aren't very performance-critical,
1914 * and we shouldn't slow down the hot paths to make it faster ...
1915 * --------------------------------------------------------------------
1917 void
1918 DropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, bool istemp,
1919 BlockNumber firstDelBlock)
1921 int i;
1923 if (istemp)
1925 DropRelFileNodeLocalBuffers(rnode, forkNum, firstDelBlock);
1926 return;
1929 for (i = 0; i < NBuffers; i++)
1931 volatile BufferDesc *bufHdr = &BufferDescriptors[i];
1933 LockBufHdr(bufHdr);
1934 if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1935 bufHdr->tag.forkNum == forkNum &&
1936 bufHdr->tag.blockNum >= firstDelBlock)
1937 InvalidateBuffer(bufHdr); /* releases spinlock */
1938 else
1939 UnlockBufHdr(bufHdr);
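/*
 * Illustrative sketch (hypothetical, simplified from what smgr.c does):
 * before deleting a relation fork, throw away every cached page of it;
 * before truncating it to nblocks pages, throw away only the pages that
 * are being cut off.
 */
#ifdef NOT_USED
static void
DropBuffersExample(RelFileNode rnode, ForkNumber forkNum, bool istemp,
				   BlockNumber nblocks, bool deleting)
{
	if (deleting)
		DropRelFileNodeBuffers(rnode, forkNum, istemp, 0);
	else
		DropRelFileNodeBuffers(rnode, forkNum, istemp, nblocks);
}
#endif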
1943 /* ---------------------------------------------------------------------
1944 * DropDatabaseBuffers
1946 * This function removes all the buffers in the buffer cache for a
1947 * particular database. Dirty pages are simply dropped, without
1948 * bothering to write them out first. This is used when we destroy a
1949 * database, to avoid trying to flush data to disk when the directory
1950 * tree no longer exists. Implementation is pretty similar to
1951 * DropRelFileNodeBuffers() which is for destroying just one relation.
1952 * --------------------------------------------------------------------
1954 void
1955 DropDatabaseBuffers(Oid dbid)
1957 int i;
1958 volatile BufferDesc *bufHdr;
1961 * We needn't consider local buffers, since by assumption the target
1962 * database isn't our own.
1965 for (i = 0; i < NBuffers; i++)
1967 bufHdr = &BufferDescriptors[i];
1968 LockBufHdr(bufHdr);
1969 if (bufHdr->tag.rnode.dbNode == dbid)
1970 InvalidateBuffer(bufHdr); /* releases spinlock */
1971 else
1972 UnlockBufHdr(bufHdr);
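/*
 * Illustrative sketch (hypothetical): discard a doomed database's buffers
 * before its directory tree is removed, so nothing later tries to flush
 * pages whose files no longer exist.
 */
#ifdef NOT_USED
static void
DropDoomedDatabaseExample(Oid db_id)
{
	DropDatabaseBuffers(db_id);
	/* ... remove the database's directory tree here ... */
}
#endif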
1976 /* -----------------------------------------------------------------
1977 * PrintBufferDescs
1979 * this function prints all the buffer descriptors, for debugging
1980 * use only.
1981 * -----------------------------------------------------------------
1983 #ifdef NOT_USED
1984 void
1985 PrintBufferDescs(void)
1987 int i;
1988 volatile BufferDesc *buf = BufferDescriptors;
1990 for (i = 0; i < NBuffers; ++i, ++buf)
1992 /* theoretically we should lock the bufhdr here */
1993 elog(LOG,
1994 "[%02d] (freeNext=%d, rel=%u/%u/%u, "
1995 "blockNum=%u, flags=0x%x, refcount=%u %d)",
1996 i, buf->freeNext,
1997 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1998 buf->tag.rnode.relNode,
1999 buf->tag.blockNum, buf->flags,
2000 buf->refcount, PrivateRefCount[i]);
2003 #endif
2005 #ifdef NOT_USED
2006 void
2007 PrintPinnedBufs(void)
2009 int i;
2010 volatile BufferDesc *buf = BufferDescriptors;
2012 for (i = 0; i < NBuffers; ++i, ++buf)
2014 if (PrivateRefCount[i] > 0)
2016 /* theoretically we should lock the bufhdr here */
2017 elog(LOG,
2018 "[%02d] (freeNext=%d, rel=%u/%u/%u, "
2019 "blockNum=%u, flags=0x%x, refcount=%u %d)",
2020 i, buf->freeNext,
2021 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
2022 buf->tag.rnode.relNode,
2023 buf->tag.blockNum, buf->flags,
2024 buf->refcount, PrivateRefCount[i]);
2028 #endif
2030 /* ---------------------------------------------------------------------
2031 * FlushRelationBuffers
2033 * This function writes all dirty pages of a relation out to disk
2034 * (or more accurately, out to kernel disk buffers), ensuring that the
2035 * kernel has an up-to-date view of the relation.
2037 * Generally, the caller should be holding AccessExclusiveLock on the
2038 * target relation to ensure that no other backend is busy dirtying
2039 * more blocks of the relation; the effects can't be expected to last
2040 * after the lock is released.
2042 * XXX currently it sequentially searches the buffer pool, should be
2043 * changed to more clever ways of searching. This routine is not
2044 * used in any performance-critical code paths, so it's not worth
2045 * adding additional overhead to normal paths to make it go faster;
2046 * but see also DropRelFileNodeBuffers.
2047 * --------------------------------------------------------------------
2049 void
2050 FlushRelationBuffers(Relation rel)
2052 int i;
2053 volatile BufferDesc *bufHdr;
2055 /* Open rel at the smgr level if not already done */
2056 RelationOpenSmgr(rel);
2058 if (rel->rd_istemp)
2060 for (i = 0; i < NLocBuffer; i++)
2062 bufHdr = &LocalBufferDescriptors[i];
2063 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2064 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2066 ErrorContextCallback errcontext;
2068 /* Setup error traceback support for ereport() */
2069 errcontext.callback = buffer_write_error_callback;
2070 errcontext.arg = (void *) bufHdr;
2071 errcontext.previous = error_context_stack;
2072 error_context_stack = &errcontext;
2074 smgrwrite(rel->rd_smgr,
2075 bufHdr->tag.forkNum,
2076 bufHdr->tag.blockNum,
2077 (char *) LocalBufHdrGetBlock(bufHdr),
2078 true);
2080 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
2082 /* Pop the error context stack */
2083 error_context_stack = errcontext.previous;
2087 return;
2090 /* Make sure we can handle the pin inside the loop */
2091 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2093 for (i = 0; i < NBuffers; i++)
2095 bufHdr = &BufferDescriptors[i];
2096 LockBufHdr(bufHdr);
2097 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2098 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2100 PinBuffer_Locked(bufHdr);
2101 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2102 FlushBuffer(bufHdr, rel->rd_smgr);
2103 LWLockRelease(bufHdr->content_lock);
2104 UnpinBuffer(bufHdr, true);
2106 else
2107 UnlockBufHdr(bufHdr);
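/*
 * Illustrative sketch (hypothetical caller; heap_open/heap_close and
 * AccessExclusiveLock are assumed to be in scope): flush a relation's
 * dirty pages while holding the lock level the comment above asks for.
 */
#ifdef NOT_USED
static void
FlushRelationExample(Oid relid)
{
	Relation	rel = heap_open(relid, AccessExclusiveLock);

	FlushRelationBuffers(rel);
	heap_close(rel, AccessExclusiveLock);
}
#endif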
2111 /* ---------------------------------------------------------------------
2112 * FlushDatabaseBuffers
2114 * This function writes all dirty pages of a database out to disk
2115 * (or more accurately, out to kernel disk buffers), ensuring that the
2116 * kernel has an up-to-date view of the database.
2118 * Generally, the caller should be holding an appropriate lock to ensure
2119 * no other backend is active in the target database; otherwise more
2120 * pages could get dirtied.
2122 * Note we don't worry about flushing any pages of temporary relations.
2123 * It's assumed these wouldn't be interesting.
2124 * --------------------------------------------------------------------
2126 void
2127 FlushDatabaseBuffers(Oid dbid)
2129 int i;
2130 volatile BufferDesc *bufHdr;
2132 /* Make sure we can handle the pin inside the loop */
2133 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2135 for (i = 0; i < NBuffers; i++)
2137 bufHdr = &BufferDescriptors[i];
2138 LockBufHdr(bufHdr);
2139 if (bufHdr->tag.rnode.dbNode == dbid &&
2140 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2142 PinBuffer_Locked(bufHdr);
2143 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2144 FlushBuffer(bufHdr, NULL);
2145 LWLockRelease(bufHdr->content_lock);
2146 UnpinBuffer(bufHdr, true);
2148 else
2149 UnlockBufHdr(bufHdr);
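/*
 * Illustrative sketch (hypothetical): give the kernel an up-to-date view
 * of a database's files, for example before copying them at the
 * filesystem level.
 */
#ifdef NOT_USED
static void
FlushSourceDatabaseExample(Oid src_dboid)
{
	FlushDatabaseBuffers(src_dboid);
	/* ... the database directory can now be copied file by file ... */
}
#endif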
2154 * ReleaseBuffer -- release the pin on a buffer
2156 void
2157 ReleaseBuffer(Buffer buffer)
2159 volatile BufferDesc *bufHdr;
2161 if (!BufferIsValid(buffer))
2162 elog(ERROR, "bad buffer id: %d", buffer);
2164 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
2166 if (BufferIsLocal(buffer))
2168 Assert(LocalRefCount[-buffer - 1] > 0);
2169 LocalRefCount[-buffer - 1]--;
2170 return;
2173 bufHdr = &BufferDescriptors[buffer - 1];
2175 Assert(PrivateRefCount[buffer - 1] > 0);
2177 if (PrivateRefCount[buffer - 1] > 1)
2178 PrivateRefCount[buffer - 1]--;
2179 else
2180 UnpinBuffer(bufHdr, false);
2184 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
2186 * This is just a shorthand for a common combination.
2188 void
2189 UnlockReleaseBuffer(Buffer buffer)
2191 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2192 ReleaseBuffer(buffer);
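/*
 * Illustrative sketch (hypothetical): the usual pairing of ReadBuffer with
 * the release functions above: pin, share-lock, read what is needed from
 * the page, then drop the lock and pin together with UnlockReleaseBuffer.
 */
#ifdef NOT_USED
static XLogRecPtr
ReadPageLSNExample(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	XLogRecPtr	lsn;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	lsn = PageGetLSN(BufferGetPage(buf));
	UnlockReleaseBuffer(buf);

	return lsn;
}
#endif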
2196 * IncrBufferRefCount
2197 * Increment the pin count on a buffer that we have *already* pinned
2198 * at least once.
2200 * This function cannot be used on a buffer we do not have pinned,
2201 * because it doesn't change the shared buffer state.
2203 void
2204 IncrBufferRefCount(Buffer buffer)
2206 Assert(BufferIsPinned(buffer));
2207 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2208 ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
2209 if (BufferIsLocal(buffer))
2210 LocalRefCount[-buffer - 1]++;
2211 else
2212 PrivateRefCount[buffer - 1]++;
2216 * SetBufferCommitInfoNeedsSave
2218 * Mark a buffer dirty when we have updated tuple commit-status bits in it.
2220 * This is essentially the same as MarkBufferDirty, except that the caller
2221 * might have only share-lock instead of exclusive-lock on the buffer's
2222 * content lock. We preserve the distinction mainly as a way of documenting
2223 * that the caller has not made a critical data change --- the status-bit
2224 * update could be redone by someone else just as easily. Therefore, no WAL
2225 * log record need be generated, whereas calls to MarkBufferDirty really ought
2226 * to be associated with a WAL-entry-creating action.
2228 void
2229 SetBufferCommitInfoNeedsSave(Buffer buffer)
2231 volatile BufferDesc *bufHdr;
2233 if (!BufferIsValid(buffer))
2234 elog(ERROR, "bad buffer id: %d", buffer);
2236 if (BufferIsLocal(buffer))
2238 MarkLocalBufferDirty(buffer);
2239 return;
2242 bufHdr = &BufferDescriptors[buffer - 1];
2244 Assert(PrivateRefCount[buffer - 1] > 0);
2245 /* here, either share or exclusive lock is OK */
2246 Assert(LWLockHeldByMe(bufHdr->content_lock));
2249 * This routine might get called many times on the same page, if we are
2250 * making the first scan after commit of an xact that added/deleted many
2251 * tuples. So, be as quick as we can if the buffer is already dirty. We
2252 * do this by not acquiring spinlock if it looks like the status bits are
2253 * already OK. (Note it is okay if someone else clears BM_JUST_DIRTIED
2254 * immediately after we look, because the buffer content update is already
2255 * done and will be reflected in the I/O.)
2257 if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
2258 (BM_DIRTY | BM_JUST_DIRTIED))
2260 LockBufHdr(bufHdr);
2261 Assert(bufHdr->refcount > 0);
2262 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
2263 VacuumCostBalance += VacuumCostPageDirty;
2264 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
2265 UnlockBufHdr(bufHdr);
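/*
 * Illustrative sketch (hypothetical): the hint-bit pattern that
 * SetBufferCommitInfoNeedsSave exists for.  Unlike MarkBufferDirty, only
 * a pin plus a share content lock is required.
 */
#ifdef NOT_USED
static void
SetHintBitExample(Buffer buffer)
{
	/* caller is assumed to hold a pin and at least a share content lock */
	/* ... update a tuple's commit-status hint bits on the page here ... */
	SetBufferCommitInfoNeedsSave(buffer);
}
#endif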
2270 * Release buffer content locks for shared buffers.
2272 * Used to clean up after errors.
2274 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
2275 * of releasing buffer content locks per se; the only thing we need to deal
2276 * with here is clearing any PIN_COUNT request that was in progress.
2278 void
2279 UnlockBuffers(void)
2281 volatile BufferDesc *buf = PinCountWaitBuf;
2283 if (buf)
2285 LockBufHdr(buf);
2288 * Don't complain if flag bit not set; it could have been reset but we
2289 * got a cancel/die interrupt before getting the signal.
2291 if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
2292 buf->wait_backend_pid == MyProcPid)
2293 buf->flags &= ~BM_PIN_COUNT_WAITER;
2295 UnlockBufHdr(buf);
2297 PinCountWaitBuf = NULL;
2302 * Acquire or release the content_lock for the buffer.
2304 void
2305 LockBuffer(Buffer buffer, int mode)
2307 volatile BufferDesc *buf;
2309 Assert(BufferIsValid(buffer));
2310 if (BufferIsLocal(buffer))
2311 return; /* local buffers need no lock */
2313 buf = &(BufferDescriptors[buffer - 1]);
2315 if (mode == BUFFER_LOCK_UNLOCK)
2316 LWLockRelease(buf->content_lock);
2317 else if (mode == BUFFER_LOCK_SHARE)
2318 LWLockAcquire(buf->content_lock, LW_SHARED);
2319 else if (mode == BUFFER_LOCK_EXCLUSIVE)
2320 LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
2321 else
2322 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
2326 * Acquire the content_lock for the buffer, but only if we don't have to wait.
2328 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
2330 bool
2331 ConditionalLockBuffer(Buffer buffer)
2333 volatile BufferDesc *buf;
2335 Assert(BufferIsValid(buffer));
2336 if (BufferIsLocal(buffer))
2337 return true; /* act as though we got it */
2339 buf = &(BufferDescriptors[buffer - 1]);
2341 return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
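/*
 * Illustrative sketch (hypothetical): try for the exclusive content lock
 * without blocking, and fall back to the blocking form if that fails.
 */
#ifdef NOT_USED
static void
LockBufferMaybeWaitExample(Buffer buffer)
{
	if (!ConditionalLockBuffer(buffer))
	{
		/* somebody else holds the lock; decide to wait after all */
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
	}

	/* ... modify the page and MarkBufferDirty() here ... */

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}
#endif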
2345 * LockBufferForCleanup - lock a buffer in preparation for deleting items
2347 * Items may be deleted from a disk page only when the caller (a) holds an
2348 * exclusive lock on the buffer and (b) has observed that no other backend
2349 * holds a pin on the buffer. If there is a pin, then the other backend
2350 * might have a pointer into the buffer (for example, a heapscan reference
2351 * to an item --- see README for more details). It's OK if a pin is added
2352 * after the cleanup starts, however; the newly-arrived backend will be
2353 * unable to look at the page until we release the exclusive lock.
2355 * To implement this protocol, a would-be deleter must pin the buffer and
2356 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
2357 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
2358 * it has successfully observed pin count = 1.
2360 void
2361 LockBufferForCleanup(Buffer buffer)
2363 volatile BufferDesc *bufHdr;
2365 Assert(BufferIsValid(buffer));
2366 Assert(PinCountWaitBuf == NULL);
2368 if (BufferIsLocal(buffer))
2370 /* There should be exactly one pin */
2371 if (LocalRefCount[-buffer - 1] != 1)
2372 elog(ERROR, "incorrect local pin count: %d",
2373 LocalRefCount[-buffer - 1]);
2374 /* Nobody else to wait for */
2375 return;
2378 /* There should be exactly one local pin */
2379 if (PrivateRefCount[buffer - 1] != 1)
2380 elog(ERROR, "incorrect local pin count: %d",
2381 PrivateRefCount[buffer - 1]);
2383 bufHdr = &BufferDescriptors[buffer - 1];
2385 for (;;)
2387 /* Try to acquire lock */
2388 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2389 LockBufHdr(bufHdr);
2390 Assert(bufHdr->refcount > 0);
2391 if (bufHdr->refcount == 1)
2393 /* Successfully acquired exclusive lock with pincount 1 */
2394 UnlockBufHdr(bufHdr);
2395 return;
2397 /* Failed, so mark myself as waiting for pincount 1 */
2398 if (bufHdr->flags & BM_PIN_COUNT_WAITER)
2400 UnlockBufHdr(bufHdr);
2401 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2402 elog(ERROR, "multiple backends attempting to wait for pincount 1");
2404 bufHdr->wait_backend_pid = MyProcPid;
2405 bufHdr->flags |= BM_PIN_COUNT_WAITER;
2406 PinCountWaitBuf = bufHdr;
2407 UnlockBufHdr(bufHdr);
2408 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2409 /* Wait to be signaled by UnpinBuffer() */
2410 ProcWaitForSignal();
2411 PinCountWaitBuf = NULL;
2412 /* Loop back and try again */
2417 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
2419 * We won't loop, but just check once to see if the pin count is OK. If
2420 * not, return FALSE with no lock held.
2422 bool
2423 ConditionalLockBufferForCleanup(Buffer buffer)
2425 volatile BufferDesc *bufHdr;
2427 Assert(BufferIsValid(buffer));
2429 if (BufferIsLocal(buffer))
2431 /* There should be exactly one pin */
2432 Assert(LocalRefCount[-buffer - 1] > 0);
2433 if (LocalRefCount[-buffer - 1] != 1)
2434 return false;
2435 /* Nobody else to wait for */
2436 return true;
2439 /* There should be exactly one local pin */
2440 Assert(PrivateRefCount[buffer - 1] > 0);
2441 if (PrivateRefCount[buffer - 1] != 1)
2442 return false;
2444 /* Try to acquire lock */
2445 if (!ConditionalLockBuffer(buffer))
2446 return false;
2448 bufHdr = &BufferDescriptors[buffer - 1];
2449 LockBufHdr(bufHdr);
2450 Assert(bufHdr->refcount > 0);
2451 if (bufHdr->refcount == 1)
2453 /* Successfully acquired exclusive lock with pincount 1 */
2454 UnlockBufHdr(bufHdr);
2455 return true;
2458 /* Failed, so release the lock */
2459 UnlockBufHdr(bufHdr);
2460 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2461 return false;
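/*
 * Illustrative sketch (hypothetical, vacuum-style): pin the buffer first,
 * then either wait for a cleanup lock or skip the page when it cannot be
 * had immediately.
 */
#ifdef NOT_USED
static void
CleanupLockExample(Relation rel, BlockNumber blkno, bool can_wait)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	if (can_wait)
		LockBufferForCleanup(buf);
	else if (!ConditionalLockBufferForCleanup(buf))
	{
		/* couldn't get exclusive lock with pincount 1; skip this page */
		ReleaseBuffer(buf);
		return;
	}

	/* ... safe to delete items from the page here ... */

	UnlockReleaseBuffer(buf);
}
#endif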
2466 * Functions for buffer I/O handling
2468 * Note: We assume that nested buffer I/O never occurs.
2469 * i.e., at most one io_in_progress lock is held per process.
2471 * Also note that these are used only for shared buffers, not local ones.
2475 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
2477 static void
2478 WaitIO(volatile BufferDesc *buf)
2481 * Changed to wait until there's no IO - Inoue 01/13/2000
2483 * Note this is *necessary* because an error abort in the process doing
2484 * I/O could release the io_in_progress_lock prematurely. See
2485 * AbortBufferIO.
2487 for (;;)
2489 BufFlags sv_flags;
2492 * It may not be necessary to acquire the spinlock to check the flag
2493 * here, but since this test is essential for correctness, we'd better
2494 * play it safe.
2496 LockBufHdr(buf);
2497 sv_flags = buf->flags;
2498 UnlockBufHdr(buf);
2499 if (!(sv_flags & BM_IO_IN_PROGRESS))
2500 break;
2501 LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
2502 LWLockRelease(buf->io_in_progress_lock);
2507 * StartBufferIO: begin I/O on this buffer
2508 * (Assumptions)
2509 * My process is executing no IO
2510 * The buffer is Pinned
2512 * In some scenarios there are race conditions in which multiple backends
2513 * could attempt the same I/O operation concurrently. If someone else
2514 has already started I/O on this buffer, then we will block on the
2515 io_in_progress lock until that backend is done.
2517 * Input operations are only attempted on buffers that are not BM_VALID,
2518 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
2519 * so we can always tell if the work is already done.
2521 * Returns TRUE if we successfully marked the buffer as I/O busy,
2522 * FALSE if someone else already did the work.
2524 static bool
2525 StartBufferIO(volatile BufferDesc *buf, bool forInput)
2527 Assert(!InProgressBuf);
2529 for (;;)
2532 * Grab the io_in_progress lock so that other processes can wait for
2533 * me to finish the I/O.
2535 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2537 LockBufHdr(buf);
2539 if (!(buf->flags & BM_IO_IN_PROGRESS))
2540 break;
2543 * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
2544 * lock isn't held is if the process doing the I/O is recovering from
2545 * an error (see AbortBufferIO). If that's the case, we must wait for
2546 that backend to get unwedged.
2548 UnlockBufHdr(buf);
2549 LWLockRelease(buf->io_in_progress_lock);
2550 WaitIO(buf);
2553 /* Once we get here, there is definitely no I/O active on this buffer */
2555 if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
2557 /* someone else already did the I/O */
2558 UnlockBufHdr(buf);
2559 LWLockRelease(buf->io_in_progress_lock);
2560 return false;
2563 buf->flags |= BM_IO_IN_PROGRESS;
2565 UnlockBufHdr(buf);
2567 InProgressBuf = buf;
2568 IsForInput = forInput;
2570 return true;
2574 * TerminateBufferIO: release a buffer we were doing I/O on
2575 * (Assumptions)
2576 * My process is executing IO for the buffer
2577 * BM_IO_IN_PROGRESS bit is set for the buffer
2578 * We hold the buffer's io_in_progress lock
2579 * The buffer is Pinned
2581 * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
2582 * buffer's BM_DIRTY flag. This is appropriate when terminating a
2583 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
2584 * marking the buffer clean if it was re-dirtied while we were writing.
2586 * set_flag_bits gets ORed into the buffer's flags. It must include
2587 * BM_IO_ERROR in a failure case. For successful completion it could
2588 * be 0, or BM_VALID if we just finished reading in the page.
2590 static void
2591 TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
2592 int set_flag_bits)
2594 Assert(buf == InProgressBuf);
2596 LockBufHdr(buf);
2598 Assert(buf->flags & BM_IO_IN_PROGRESS);
2599 buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
2600 if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
2601 buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
2602 buf->flags |= set_flag_bits;
2604 UnlockBufHdr(buf);
2606 InProgressBuf = NULL;
2608 LWLockRelease(buf->io_in_progress_lock);
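/*
 * Illustrative sketch (hypothetical, simplified): the read side of the
 * StartBufferIO/TerminateBufferIO protocol; the real logic lives in
 * ReadBuffer_common.
 */
#ifdef NOT_USED
static void
ReadProtocolExample(volatile BufferDesc *buf, SMgrRelation reln)
{
	if (!StartBufferIO(buf, true))
		return;					/* some other backend already read it in */

	smgrread(reln, buf->tag.forkNum, buf->tag.blockNum,
			 (char *) BufHdrGetBlock(buf));

	/* mark the buffer valid and release the io_in_progress lock */
	TerminateBufferIO(buf, false, BM_VALID);
}
#endif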
2612 * AbortBufferIO: Clean up any active buffer I/O after an error.
2614 * All LWLocks we might have held have been released,
2615 * but we haven't yet released buffer pins, so the buffer is still pinned.
2617 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
2618 * possible the error condition wasn't related to the I/O.
2620 void
2621 AbortBufferIO(void)
2623 volatile BufferDesc *buf = InProgressBuf;
2625 if (buf)
2628 * Since LWLockReleaseAll has already been called, we're not holding
2629 * the buffer's io_in_progress_lock. We have to re-acquire it so that
2630 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
2631 * buffer will be in a busy spin until we succeed in doing this.
2633 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2635 LockBufHdr(buf);
2636 Assert(buf->flags & BM_IO_IN_PROGRESS);
2637 if (IsForInput)
2639 Assert(!(buf->flags & BM_DIRTY));
2640 /* We'd better not think buffer is valid yet */
2641 Assert(!(buf->flags & BM_VALID));
2642 UnlockBufHdr(buf);
2644 else
2646 BufFlags sv_flags;
2648 sv_flags = buf->flags;
2649 Assert(sv_flags & BM_DIRTY);
2650 UnlockBufHdr(buf);
2651 /* Issue notice if this is not the first failure... */
2652 if (sv_flags & BM_IO_ERROR)
2654 /* Buffer is pinned, so we can read tag without spinlock */
2655 ereport(WARNING,
2656 (errcode(ERRCODE_IO_ERROR),
2657 errmsg("could not write block %u of %u/%u/%u",
2658 buf->tag.blockNum,
2659 buf->tag.rnode.spcNode,
2660 buf->tag.rnode.dbNode,
2661 buf->tag.rnode.relNode),
2662 errdetail("Multiple failures --- write error might be permanent.")));
2665 TerminateBufferIO(buf, false, BM_IO_ERROR);
2670 * Error context callback for errors occurring during buffer writes.
2672 static void
2673 buffer_write_error_callback(void *arg)
2675 volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
2677 /* Buffer is pinned, so we can read the tag without locking the spinlock */
2678 if (bufHdr != NULL)
2679 errcontext("writing block %u of relation %u/%u/%u",
2680 bufHdr->tag.blockNum,
2681 bufHdr->tag.rnode.spcNode,
2682 bufHdr->tag.rnode.dbNode,
2683 bufHdr->tag.rnode.relNode);