src/backend/storage/buffer/bufmgr.c
1 /*-------------------------------------------------------------------------
3 * bufmgr.c
4 * buffer manager interface routines
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 *-------------------------------------------------------------------------
16 * Principal entry points:
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
22 * ReleaseBuffer() -- unpin a buffer
24 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 * The disk write is delayed until buffer replacement or checkpoint.
27 * See also these files:
28 * freelist.c -- chooses victim for buffer replacement
29 * buf_table.c -- manages the buffer lookup table
31 #include "postgres.h"
33 #include <sys/file.h>
34 #include <unistd.h>
36 #include "catalog/catalog.h"
37 #include "miscadmin.h"
38 #include "pg_trace.h"
39 #include "pgstat.h"
40 #include "postmaster/bgwriter.h"
41 #include "storage/buf_internals.h"
42 #include "storage/bufmgr.h"
43 #include "storage/ipc.h"
44 #include "storage/proc.h"
45 #include "storage/smgr.h"
46 #include "utils/rel.h"
47 #include "utils/resowner.h"
50 /* Note: these two macros only work on shared buffers, not local ones! */
51 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
52 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
54 /* Note: this macro only works on local buffers, not shared ones! */
55 #define LocalBufHdrGetBlock(bufHdr) \
56 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
58 /* Bits in SyncOneBuffer's return value */
59 #define BUF_WRITTEN 0x01
60 #define BUF_REUSABLE 0x02
63 /* GUC variables */
64 bool zero_damaged_pages = false;
65 int bgwriter_lru_maxpages = 100;
66 double bgwriter_lru_multiplier = 2.0;
69 * How many buffers PrefetchBuffer callers should try to stay ahead of their
70 * ReadBuffer calls by. This is maintained by the assign hook for
71 * effective_io_concurrency. Zero means "never prefetch".
73 int target_prefetch_pages = 0;
75 /* local state for StartBufferIO and related functions */
76 static volatile BufferDesc *InProgressBuf = NULL;
77 static bool IsForInput;
79 /* local state for LockBufferForCleanup */
80 static volatile BufferDesc *PinCountWaitBuf = NULL;
83 static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
84 ForkNumber forkNum, BlockNumber blockNum,
85 ReadBufferMode mode, BufferAccessStrategy strategy,
86 bool *hit);
87 static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
88 static void PinBuffer_Locked(volatile BufferDesc *buf);
89 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
90 static void BufferSync(int flags);
91 static int SyncOneBuffer(int buf_id, bool skip_recently_used);
92 static void WaitIO(volatile BufferDesc *buf);
93 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
94 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
95 int set_flag_bits);
96 static void buffer_write_error_callback(void *arg);
97 static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
98 BlockNumber blockNum,
99 BufferAccessStrategy strategy,
100 bool *foundPtr);
101 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
102 static void AtProcExit_Buffers(int code, Datum arg);
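
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): the typical
 * caller-side pattern for the principal entry points listed in the header
 * comment above.  The function name and the BUFMGR_USAGE_EXAMPLE guard are
 * invented for illustration; the guard keeps this sketch out of any real
 * build.  A real caller would also WAL-log its change when marking the
 * buffer dirty.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_touch_first_block(Relation rel)
{
	Buffer		buf;
	Page		page;

	/* find or create a buffer holding block 0, and pin it */
	buf = ReadBuffer(rel, 0);

	/* lock the buffer contents before modifying them */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	/* ... modify "page" here (and WAL-log the change) ... */
	(void) page;

	/* tell bufmgr the contents changed; the disk write happens later */
	MarkBufferDirty(buf);

	/* release the content lock and the pin */
	UnlockReleaseBuffer(buf);
}
#endif   /* BUFMGR_USAGE_EXAMPLE */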
106 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
108 * This is named by analogy to ReadBuffer but doesn't actually allocate a
109 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
110 * block will not be delayed by the I/O. Prefetching is optional.
111 * No-op if prefetching isn't compiled in.
113 void
114 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
116 #ifdef USE_PREFETCH
117 Assert(RelationIsValid(reln));
118 Assert(BlockNumberIsValid(blockNum));
120 /* Open it at the smgr level if not already done */
121 RelationOpenSmgr(reln);
123 if (reln->rd_istemp)
125 /* see comments in ReadBufferExtended */
126 if (RELATION_IS_OTHER_TEMP(reln))
127 ereport(ERROR,
128 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
129 errmsg("cannot access temporary tables of other sessions")));
131 /* pass it off to localbuf.c */
132 LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
134 else
136 BufferTag newTag; /* identity of requested block */
137 uint32 newHash; /* hash value for newTag */
138 LWLockId newPartitionLock; /* buffer partition lock for it */
139 int buf_id;
141 /* create a tag so we can lookup the buffer */
142 INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode, forkNum, blockNum);
144 /* determine its hash code and partition lock ID */
145 newHash = BufTableHashCode(&newTag);
146 newPartitionLock = BufMappingPartitionLock(newHash);
148 /* see if the block is in the buffer pool already */
149 LWLockAcquire(newPartitionLock, LW_SHARED);
150 buf_id = BufTableLookup(&newTag, newHash);
151 LWLockRelease(newPartitionLock);
153 /* If not in buffers, initiate prefetch */
154 if (buf_id < 0)
155 smgrprefetch(reln->rd_smgr, forkNum, blockNum);
158 * If the block *is* in buffers, we do nothing. This is not really
159 * ideal: the block might be just about to be evicted, which would be
160 * stupid since we know we are going to need it soon. But the only
161 * easy answer is to bump the usage_count, which does not seem like a
162 * great solution: when the caller does ultimately touch the block,
163 * usage_count would get bumped again, resulting in too much
164 * favoritism for blocks that are involved in a prefetch sequence. A
165 * real fix would involve some additional per-buffer state, and it's
166 * not clear that there's enough of a problem to justify that.
169 #endif /* USE_PREFETCH */
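
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): how a scan
 * might use PrefetchBuffer to stay a few blocks ahead of its ReadBuffer
 * calls, as described above.  The function name, the fixed-lookahead scheme,
 * and the BUFMGR_USAGE_EXAMPLE guard are invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_scan_with_prefetch(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		BlockNumber ahead = blkno + target_prefetch_pages;

		/* hint about a block we'll want soon (no-op if prefetch disabled) */
		if (target_prefetch_pages > 0 && ahead < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, ahead);

		/* read (and, for this sketch, immediately release) the current block */
		ReleaseBuffer(ReadBuffer(rel, blkno));
	}
}
#endif   /* BUFMGR_USAGE_EXAMPLE */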
174 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
175 * fork with RBM_NORMAL mode and default strategy.
177 Buffer
178 ReadBuffer(Relation reln, BlockNumber blockNum)
180 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
184 * ReadBufferExtended -- returns a buffer containing the requested
185 * block of the requested relation. If the blknum
186 * requested is P_NEW, extend the relation file and
187 * allocate a new block. (Caller is responsible for
188 * ensuring that only one backend tries to extend a
189 * relation at the same time!)
191 * Returns: the buffer number for the buffer containing
192 * the block read. The returned buffer has been pinned.
193 * Does not return on error --- elog's instead.
195 * Assume when this function is called, that reln has been opened already.
197 * In RBM_NORMAL mode, the page is read from disk, and the page header is
198 * validated. An error is thrown if the page header is not valid.
200 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
201 * valid, the page is zeroed instead of throwing an error. This is intended
202 * for non-critical data, where the caller is prepared to repair errors.
204 * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
205 * with zeros instead of reading it from disk. Useful when the caller is
206 * going to fill the page from scratch, since this saves I/O and avoids
207 * unnecessary failure if the page-on-disk has corrupt page headers.
208 * Caution: do not use this mode to read a page that is beyond the relation's
209 * current physical EOF; that is likely to cause problems in md.c when
210 * the page is modified and written out. P_NEW is OK, though.
212 * If strategy is not NULL, a nondefault buffer access strategy is used.
213 * See buffer/README for details.
215 Buffer
216 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
217 ReadBufferMode mode, BufferAccessStrategy strategy)
219 bool hit;
220 Buffer buf;
222 /* Open it at the smgr level if not already done */
223 RelationOpenSmgr(reln);
226 * Reject attempts to read non-local temporary relations; we would be
227 * likely to get wrong data since we have no visibility into the owning
228 * session's local buffers.
230 if (RELATION_IS_OTHER_TEMP(reln))
231 ereport(ERROR,
232 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
233 errmsg("cannot access temporary tables of other sessions")));
236 * Read the buffer, and update pgstat counters to reflect a cache hit or
237 * miss.
239 pgstat_count_buffer_read(reln);
240 buf = ReadBuffer_common(reln->rd_smgr, reln->rd_istemp, forkNum, blockNum,
241 mode, strategy, &hit);
242 if (hit)
243 pgstat_count_buffer_hit(reln);
244 return buf;
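
/*
 * Illustrative sketch (hypothetical, not part of PostgreSQL): choosing a
 * ReadBufferMode as described in the comment above.  A caller that will
 * rewrite a page from scratch can pass RBM_ZERO to skip the disk read and
 * header validation; non-critical readers can use RBM_ZERO_ON_ERROR instead
 * of RBM_NORMAL to tolerate a damaged page header.  The function name and
 * guard are invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static Buffer
example_read_for_rewrite(Relation rel, BlockNumber blkno)
{
	/*
	 * The page will be overwritten entirely, so don't bother reading what is
	 * currently on disk.  Note the caution above about blocks beyond the
	 * relation's current physical EOF.
	 */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
}
#endif   /* BUFMGR_USAGE_EXAMPLE */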
249 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
250 * a relcache entry for the relation.
252 * NB: caller is assumed to know what it's doing if isTemp is true.
254 Buffer
255 ReadBufferWithoutRelcache(RelFileNode rnode, bool isTemp,
256 ForkNumber forkNum, BlockNumber blockNum,
257 ReadBufferMode mode, BufferAccessStrategy strategy)
259 bool hit;
261 SMgrRelation smgr = smgropen(rnode);
263 return ReadBuffer_common(smgr, isTemp, forkNum, blockNum, mode, strategy,
264 &hit);
269 * ReadBuffer_common -- common logic for all ReadBuffer variants
271 * *hit is set to true if the request was satisfied from shared buffer cache.
273 static Buffer
274 ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
275 BlockNumber blockNum, ReadBufferMode mode,
276 BufferAccessStrategy strategy, bool *hit)
278 volatile BufferDesc *bufHdr;
279 Block bufBlock;
280 bool found;
281 bool isExtend;
283 *hit = false;
285 /* Make sure we will have room to remember the buffer pin */
286 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
288 isExtend = (blockNum == P_NEW);
290 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
291 smgr->smgr_rnode.spcNode,
292 smgr->smgr_rnode.dbNode,
293 smgr->smgr_rnode.relNode,
294 isLocalBuf,
295 isExtend);
297 /* Substitute proper block number if caller asked for P_NEW */
298 if (isExtend)
299 blockNum = smgrnblocks(smgr, forkNum);
301 if (isLocalBuf)
303 ReadLocalBufferCount++;
304 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
305 if (found)
306 LocalBufferHitCount++;
308 else
310 ReadBufferCount++;
313 * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
314 * not currently in memory.
316 bufHdr = BufferAlloc(smgr, forkNum, blockNum, strategy, &found);
317 if (found)
318 BufferHitCount++;
321 /* At this point we do NOT hold any locks. */
323 /* if it was already in the buffer pool, we're done */
324 if (found)
326 if (!isExtend)
328 /* Just need to update stats before we exit */
329 *hit = true;
331 if (VacuumCostActive)
332 VacuumCostBalance += VacuumCostPageHit;
334 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
335 smgr->smgr_rnode.spcNode,
336 smgr->smgr_rnode.dbNode,
337 smgr->smgr_rnode.relNode,
338 isLocalBuf,
339 isExtend,
340 found);
342 return BufferDescriptorGetBuffer(bufHdr);
346 * We get here only in the corner case where we are trying to extend
347 * the relation but we found a pre-existing buffer marked BM_VALID.
348 * This can happen because mdread doesn't complain about reads beyond
349 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
350 * read a block beyond EOF could have left a "valid" zero-filled
351 * buffer. Unfortunately, we have also seen this case occurring
352 * because of buggy Linux kernels that sometimes return an
353 * lseek(SEEK_END) result that doesn't account for a recent write. In
354 * that situation, the pre-existing buffer would contain valid data
355 * that we don't want to overwrite. Since the legitimate case should
356 * always have left a zero-filled buffer, complain if not PageIsNew.
358 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
359 if (!PageIsNew((Page) bufBlock))
360 ereport(ERROR,
361 (errmsg("unexpected data beyond EOF in block %u of relation %s",
362 blockNum, relpath(smgr->smgr_rnode, forkNum)),
363 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
366 * We *must* do smgrextend before succeeding, else the page will not
367 * be reserved by the kernel, and the next P_NEW call will decide to
368 * return the same page. Clear the BM_VALID bit, do the StartBufferIO
369 * call that BufferAlloc didn't, and proceed.
371 if (isLocalBuf)
373 /* Only need to adjust flags */
374 Assert(bufHdr->flags & BM_VALID);
375 bufHdr->flags &= ~BM_VALID;
377 else
380 * Loop to handle the very small possibility that someone re-sets
381 * BM_VALID between our clearing it and StartBufferIO inspecting
382 * it.
386 LockBufHdr(bufHdr);
387 Assert(bufHdr->flags & BM_VALID);
388 bufHdr->flags &= ~BM_VALID;
389 UnlockBufHdr(bufHdr);
390 } while (!StartBufferIO(bufHdr, true));
395 * if we have gotten to this point, we have allocated a buffer for the
396 * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
397 * if it's a shared buffer.
399 * Note: if smgrextend fails, we will end up with a buffer that is
400 * allocated but not marked BM_VALID. P_NEW will still select the same
401 * block number (because the relation didn't get any longer on disk) and
402 * so future attempts to extend the relation will find the same buffer (if
403 * it's not been recycled) but come right back here to try smgrextend
404 * again.
406 Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
408 bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
410 if (isExtend)
412 /* new buffers are zero-filled */
413 MemSet((char *) bufBlock, 0, BLCKSZ);
414 smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, isLocalBuf);
416 else
419 * Read in the page, unless the caller intends to overwrite it and
420 * just wants us to allocate a buffer.
422 if (mode == RBM_ZERO)
423 MemSet((char *) bufBlock, 0, BLCKSZ);
424 else
426 smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
428 /* check for garbage data */
429 if (!PageHeaderIsValid((PageHeader) bufBlock))
431 if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
433 ereport(WARNING,
434 (errcode(ERRCODE_DATA_CORRUPTED),
435 errmsg("invalid page header in block %u of relation %s; zeroing out page",
436 blockNum,
437 relpath(smgr->smgr_rnode, forkNum))));
438 MemSet((char *) bufBlock, 0, BLCKSZ);
440 else
441 ereport(ERROR,
442 (errcode(ERRCODE_DATA_CORRUPTED),
443 errmsg("invalid page header in block %u of relation %s",
444 blockNum,
445 relpath(smgr->smgr_rnode, forkNum))));
450 if (isLocalBuf)
452 /* Only need to adjust flags */
453 bufHdr->flags |= BM_VALID;
455 else
457 /* Set BM_VALID, terminate IO, and wake up any waiters */
458 TerminateBufferIO(bufHdr, false, BM_VALID);
461 if (VacuumCostActive)
462 VacuumCostBalance += VacuumCostPageMiss;
464 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
465 smgr->smgr_rnode.spcNode,
466 smgr->smgr_rnode.dbNode,
467 smgr->smgr_rnode.relNode,
468 isLocalBuf,
469 isExtend,
470 found);
472 return BufferDescriptorGetBuffer(bufHdr);
476 * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
477 * buffer. If no buffer exists already, selects a replacement
478 * victim and evicts the old page, but does NOT read in new page.
480 * "strategy" can be a buffer replacement strategy object, or NULL for
481 * the default strategy. The selected buffer's usage_count is advanced when
482 * using the default strategy, but otherwise possibly not (see PinBuffer).
484 * The returned buffer is pinned and is already marked as holding the
485 * desired page. If it already did have the desired page, *foundPtr is
486 * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
487 * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
489 * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
490 * we keep it for simplicity in ReadBuffer.
492 * No locks are held either at entry or exit.
494 static volatile BufferDesc *
495 BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
496 BlockNumber blockNum,
497 BufferAccessStrategy strategy,
498 bool *foundPtr)
500 BufferTag newTag; /* identity of requested block */
501 uint32 newHash; /* hash value for newTag */
502 LWLockId newPartitionLock; /* buffer partition lock for it */
503 BufferTag oldTag; /* previous identity of selected buffer */
504 uint32 oldHash; /* hash value for oldTag */
505 LWLockId oldPartitionLock; /* buffer partition lock for it */
506 BufFlags oldFlags;
507 int buf_id;
508 volatile BufferDesc *buf;
509 bool valid;
511 /* create a tag so we can lookup the buffer */
512 INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
514 /* determine its hash code and partition lock ID */
515 newHash = BufTableHashCode(&newTag);
516 newPartitionLock = BufMappingPartitionLock(newHash);
518 /* see if the block is in the buffer pool already */
519 LWLockAcquire(newPartitionLock, LW_SHARED);
520 buf_id = BufTableLookup(&newTag, newHash);
521 if (buf_id >= 0)
524 * Found it. Now, pin the buffer so no one can steal it from the
525 * buffer pool, and check to see if the correct data has been loaded
526 * into the buffer.
528 buf = &BufferDescriptors[buf_id];
530 valid = PinBuffer(buf, strategy);
532 /* Can release the mapping lock as soon as we've pinned it */
533 LWLockRelease(newPartitionLock);
535 *foundPtr = TRUE;
537 if (!valid)
540 * We can only get here if (a) someone else is still reading in
541 * the page, or (b) a previous read attempt failed. We have to
542 * wait for any active read attempt to finish, and then set up our
543 * own read attempt if the page is still not BM_VALID.
544 * StartBufferIO does it all.
546 if (StartBufferIO(buf, true))
549 * If we get here, previous attempts to read the buffer must
550 * have failed ... but we shall bravely try again.
552 *foundPtr = FALSE;
556 return buf;
560 * Didn't find it in the buffer pool. We'll have to initialize a new
561 * buffer. Remember to unlock the mapping lock while doing the work.
563 LWLockRelease(newPartitionLock);
565 /* Loop here in case we have to try another victim buffer */
566 for (;;)
568 bool lock_held;
571 * Select a victim buffer. The buffer is returned with its header
572 * spinlock still held! Also (in most cases) the BufFreelistLock is
573 * still held, since it would be bad to hold the spinlock while
574 * possibly waking up other processes.
576 buf = StrategyGetBuffer(strategy, &lock_held);
578 Assert(buf->refcount == 0);
580 /* Must copy buffer flags while we still hold the spinlock */
581 oldFlags = buf->flags;
583 /* Pin the buffer and then release the buffer spinlock */
584 PinBuffer_Locked(buf);
586 /* Now it's safe to release the freelist lock */
587 if (lock_held)
588 LWLockRelease(BufFreelistLock);
591 * If the buffer was dirty, try to write it out. There is a race
592 * condition here, in that someone might dirty it after we released it
593 * above, or even while we are writing it out (since our share-lock
594 * won't prevent hint-bit updates). We will recheck the dirty bit
595 * after re-locking the buffer header.
597 if (oldFlags & BM_DIRTY)
600 * We need a share-lock on the buffer contents to write it out
601 * (else we might write invalid data, eg because someone else is
602 * compacting the page contents while we write). We must use a
603 * conditional lock acquisition here to avoid deadlock. Even
604 * though the buffer was not pinned (and therefore surely not
605 * locked) when StrategyGetBuffer returned it, someone else could
606 * have pinned and exclusive-locked it by the time we get here. If
607 * we try to get the lock unconditionally, we'd block waiting for
608 * them; if they later block waiting for us, deadlock ensues.
609 * (This has been observed to happen when two backends are both
610 * trying to split btree index pages, and the second one just
611 * happens to be trying to split the page the first one got from
612 * StrategyGetBuffer.)
614 if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
617 * If using a nondefault strategy, and writing the buffer
618 * would require a WAL flush, let the strategy decide whether
619 * to go ahead and write/reuse the buffer or to choose another
620 * victim. We need lock to inspect the page LSN, so this
621 * can't be done inside StrategyGetBuffer.
623 if (strategy != NULL &&
624 XLogNeedsFlush(BufferGetLSN(buf)) &&
625 StrategyRejectBuffer(strategy, buf))
627 /* Drop lock/pin and loop around for another buffer */
628 LWLockRelease(buf->content_lock);
629 UnpinBuffer(buf, true);
630 continue;
633 /* OK, do the I/O */
634 TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
635 smgr->smgr_rnode.spcNode,
636 smgr->smgr_rnode.dbNode,
637 smgr->smgr_rnode.relNode);
639 FlushBuffer(buf, NULL);
640 LWLockRelease(buf->content_lock);
642 TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
643 smgr->smgr_rnode.spcNode,
644 smgr->smgr_rnode.dbNode,
645 smgr->smgr_rnode.relNode);
647 else
650 * Someone else has locked the buffer, so give it up and loop
651 * back to get another one.
653 UnpinBuffer(buf, true);
654 continue;
659 * To change the association of a valid buffer, we'll need to have
660 * exclusive lock on both the old and new mapping partitions.
662 if (oldFlags & BM_TAG_VALID)
665 * Need to compute the old tag's hashcode and partition lock ID.
666 * XXX is it worth storing the hashcode in BufferDesc so we need
667 * not recompute it here? Probably not.
669 oldTag = buf->tag;
670 oldHash = BufTableHashCode(&oldTag);
671 oldPartitionLock = BufMappingPartitionLock(oldHash);
674 * Must lock the lower-numbered partition first to avoid
675 * deadlocks.
677 if (oldPartitionLock < newPartitionLock)
679 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
680 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
682 else if (oldPartitionLock > newPartitionLock)
684 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
685 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
687 else
689 /* only one partition, only one lock */
690 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
693 else
695 /* if it wasn't valid, we need only the new partition */
696 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
697 /* these just keep the compiler quiet about uninit variables */
698 oldHash = 0;
699 oldPartitionLock = 0;
703 * Try to make a hashtable entry for the buffer under its new tag.
704 * This could fail because while we were writing someone else
705 * allocated another buffer for the same block we want to read in.
706 * Note that we have not yet removed the hashtable entry for the old
707 * tag.
709 buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
711 if (buf_id >= 0)
714 * Got a collision. Someone has already done what we were about to
715 * do. We'll just handle this as if it were found in the buffer
716 * pool in the first place. First, give up the buffer we were
717 * planning to use.
719 UnpinBuffer(buf, true);
721 /* Can give up that buffer's mapping partition lock now */
722 if ((oldFlags & BM_TAG_VALID) &&
723 oldPartitionLock != newPartitionLock)
724 LWLockRelease(oldPartitionLock);
726 /* remaining code should match code at top of routine */
728 buf = &BufferDescriptors[buf_id];
730 valid = PinBuffer(buf, strategy);
732 /* Can release the mapping lock as soon as we've pinned it */
733 LWLockRelease(newPartitionLock);
735 *foundPtr = TRUE;
737 if (!valid)
740 * We can only get here if (a) someone else is still reading
741 * in the page, or (b) a previous read attempt failed. We
742 * have to wait for any active read attempt to finish, and
743 * then set up our own read attempt if the page is still not
744 * BM_VALID. StartBufferIO does it all.
746 if (StartBufferIO(buf, true))
749 * If we get here, previous attempts to read the buffer
750 * must have failed ... but we shall bravely try again.
752 *foundPtr = FALSE;
756 return buf;
760 * Need to lock the buffer header too in order to change its tag.
762 LockBufHdr(buf);
765 * Somebody could have pinned or re-dirtied the buffer while we were
766 * doing the I/O and making the new hashtable entry. If so, we can't
767 * recycle this buffer; we must undo everything we've done and start
768 * over with a new victim buffer.
770 oldFlags = buf->flags;
771 if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
772 break;
774 UnlockBufHdr(buf);
775 BufTableDelete(&newTag, newHash);
776 if ((oldFlags & BM_TAG_VALID) &&
777 oldPartitionLock != newPartitionLock)
778 LWLockRelease(oldPartitionLock);
779 LWLockRelease(newPartitionLock);
780 UnpinBuffer(buf, true);
784 * Okay, it's finally safe to rename the buffer.
786 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
787 * paranoia. We also reset the usage_count since any recency of use of
788 * the old content is no longer relevant. (The usage_count starts out at
789 * 1 so that the buffer can survive one clock-sweep pass.)
791 buf->tag = newTag;
792 buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
793 buf->flags |= BM_TAG_VALID;
794 buf->usage_count = 1;
796 UnlockBufHdr(buf);
798 if (oldFlags & BM_TAG_VALID)
800 BufTableDelete(&oldTag, oldHash);
801 if (oldPartitionLock != newPartitionLock)
802 LWLockRelease(oldPartitionLock);
805 LWLockRelease(newPartitionLock);
808 * Buffer contents are currently invalid. Try to get the io_in_progress
809 * lock. If StartBufferIO returns false, then someone else managed to
810 * read it before we did, so there's nothing left for BufferAlloc() to do.
812 if (StartBufferIO(buf, true))
813 *foundPtr = FALSE;
814 else
815 *foundPtr = TRUE;
817 return buf;
821 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
822 * freelist.
824 * The buffer header spinlock must be held at entry. We drop it before
825 * returning. (This is sane because the caller must have locked the
826 * buffer in order to be sure it should be dropped.)
828 * This is used only in contexts such as dropping a relation. We assume
829 * that no other backend could possibly be interested in using the page,
830 * so the only reason the buffer might be pinned is if someone else is
831 * trying to write it out. We have to let them finish before we can
832 * reclaim the buffer.
834 * The buffer could get reclaimed by someone else while we are waiting
835 * to acquire the necessary locks; if so, don't mess it up.
837 static void
838 InvalidateBuffer(volatile BufferDesc *buf)
840 BufferTag oldTag;
841 uint32 oldHash; /* hash value for oldTag */
842 LWLockId oldPartitionLock; /* buffer partition lock for it */
843 BufFlags oldFlags;
845 /* Save the original buffer tag before dropping the spinlock */
846 oldTag = buf->tag;
848 UnlockBufHdr(buf);
851 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
852 * worth storing the hashcode in BufferDesc so we need not recompute it
853 * here? Probably not.
855 oldHash = BufTableHashCode(&oldTag);
856 oldPartitionLock = BufMappingPartitionLock(oldHash);
858 retry:
861 * Acquire exclusive mapping lock in preparation for changing the buffer's
862 * association.
864 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
866 /* Re-lock the buffer header */
867 LockBufHdr(buf);
869 /* If it's changed while we were waiting for lock, do nothing */
870 if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
872 UnlockBufHdr(buf);
873 LWLockRelease(oldPartitionLock);
874 return;
878 * We assume the only reason for it to be pinned is that someone else is
879 * flushing the page out. Wait for them to finish. (This could be an
880 * infinite loop if the refcount is messed up... it would be nice to time
881 * out after awhile, but there seems no way to be sure how many loops may
882 * be needed. Note that if the other guy has pinned the buffer but not
883 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
884 * be busy-looping here.)
886 if (buf->refcount != 0)
888 UnlockBufHdr(buf);
889 LWLockRelease(oldPartitionLock);
890 /* safety check: should definitely not be our *own* pin */
891 if (PrivateRefCount[buf->buf_id] != 0)
892 elog(ERROR, "buffer is pinned in InvalidateBuffer");
893 WaitIO(buf);
894 goto retry;
898 * Clear out the buffer's tag and flags. We must do this to ensure that
899 * linear scans of the buffer array don't think the buffer is valid.
901 oldFlags = buf->flags;
902 CLEAR_BUFFERTAG(buf->tag);
903 buf->flags = 0;
904 buf->usage_count = 0;
906 UnlockBufHdr(buf);
909 * Remove the buffer from the lookup hashtable, if it was in there.
911 if (oldFlags & BM_TAG_VALID)
912 BufTableDelete(&oldTag, oldHash);
915 * Done with mapping lock.
917 LWLockRelease(oldPartitionLock);
920 * Insert the buffer at the head of the list of free buffers.
922 StrategyFreeBuffer(buf);
926 * MarkBufferDirty
928 * Marks buffer contents as dirty (actual write happens later).
930 * Buffer must be pinned and exclusive-locked. (If caller does not hold
931 * exclusive lock, then somebody could be in process of writing the buffer,
932 * leading to risk of bad data written to disk.)
934 void
935 MarkBufferDirty(Buffer buffer)
937 volatile BufferDesc *bufHdr;
939 if (!BufferIsValid(buffer))
940 elog(ERROR, "bad buffer id: %d", buffer);
942 if (BufferIsLocal(buffer))
944 MarkLocalBufferDirty(buffer);
945 return;
948 bufHdr = &BufferDescriptors[buffer - 1];
950 Assert(PrivateRefCount[buffer - 1] > 0);
951 /* unfortunately we can't check if the lock is held exclusively */
952 Assert(LWLockHeldByMe(bufHdr->content_lock));
954 LockBufHdr(bufHdr);
956 Assert(bufHdr->refcount > 0);
959 * If the buffer was not dirty already, do vacuum cost accounting.
961 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
962 VacuumCostBalance += VacuumCostPageDirty;
964 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
966 UnlockBufHdr(bufHdr);
970 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
972 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
973 * compared to calling the two routines separately. Now it's mainly just
974 * a convenience function. However, if the passed buffer is valid and
975 * already contains the desired block, we just return it as-is; and that
976 * does save considerable work compared to a full release and reacquire.
978 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
979 * buffer actually needs to be released. This case is the same as ReadBuffer,
980 * but can save some tests in the caller.
982 Buffer
983 ReleaseAndReadBuffer(Buffer buffer,
984 Relation relation,
985 BlockNumber blockNum)
987 ForkNumber forkNum = MAIN_FORKNUM;
988 volatile BufferDesc *bufHdr;
990 if (BufferIsValid(buffer))
992 if (BufferIsLocal(buffer))
994 Assert(LocalRefCount[-buffer - 1] > 0);
995 bufHdr = &LocalBufferDescriptors[-buffer - 1];
996 if (bufHdr->tag.blockNum == blockNum &&
997 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
998 bufHdr->tag.forkNum == forkNum)
999 return buffer;
1000 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1001 LocalRefCount[-buffer - 1]--;
1003 else
1005 Assert(PrivateRefCount[buffer - 1] > 0);
1006 bufHdr = &BufferDescriptors[buffer - 1];
1007 /* we have pin, so it's ok to examine tag without spinlock */
1008 if (bufHdr->tag.blockNum == blockNum &&
1009 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1010 bufHdr->tag.forkNum == forkNum)
1011 return buffer;
1012 UnpinBuffer(bufHdr, true);
1016 return ReadBuffer(relation, blockNum);
1020 * PinBuffer -- make buffer unavailable for replacement.
1022 * For the default access strategy, the buffer's usage_count is incremented
1023 * when we first pin it; for other strategies we just make sure the usage_count
1024 * isn't zero. (The idea of the latter is that we don't want synchronized
1025 * heap scans to inflate the count, but we need it to not be zero to discourage
1026 * other backends from stealing buffers from our ring. As long as we cycle
1027 * through the ring faster than the global clock-sweep cycles, buffers in
1028 * our ring won't be chosen as victims for replacement by other backends.)
1030 * This should be applied only to shared buffers, never local ones.
1032 * Note that ResourceOwnerEnlargeBuffers must have been done already.
1034 * Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
1035 * some callers to avoid an extra spinlock cycle.
1037 static bool
1038 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
1040 int b = buf->buf_id;
1041 bool result;
1043 if (PrivateRefCount[b] == 0)
1045 LockBufHdr(buf);
1046 buf->refcount++;
1047 if (strategy == NULL)
1049 if (buf->usage_count < BM_MAX_USAGE_COUNT)
1050 buf->usage_count++;
1052 else
1054 if (buf->usage_count == 0)
1055 buf->usage_count = 1;
1057 result = (buf->flags & BM_VALID) != 0;
1058 UnlockBufHdr(buf);
1060 else
1062 /* If we previously pinned the buffer, it must surely be valid */
1063 result = true;
1065 PrivateRefCount[b]++;
1066 Assert(PrivateRefCount[b] > 0);
1067 ResourceOwnerRememberBuffer(CurrentResourceOwner,
1068 BufferDescriptorGetBuffer(buf));
1069 return result;
1073 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1074 * The spinlock is released before return.
1076 * Currently, no callers of this function want to modify the buffer's
1077 * usage_count at all, so there's no need for a strategy parameter.
1078 * Also we don't bother with a BM_VALID test (the caller could check that for
1079 * itself).
1081 * Note: use of this routine is frequently mandatory, not just an optimization
1082 * to save a spin lock/unlock cycle, because we need to pin a buffer before
1083 * its state can change under us.
1085 static void
1086 PinBuffer_Locked(volatile BufferDesc *buf)
1088 int b = buf->buf_id;
1090 if (PrivateRefCount[b] == 0)
1091 buf->refcount++;
1092 UnlockBufHdr(buf);
1093 PrivateRefCount[b]++;
1094 Assert(PrivateRefCount[b] > 0);
1095 ResourceOwnerRememberBuffer(CurrentResourceOwner,
1096 BufferDescriptorGetBuffer(buf));
1100 * UnpinBuffer -- make buffer available for replacement.
1102 * This should be applied only to shared buffers, never local ones.
1104 * Most but not all callers want CurrentResourceOwner to be adjusted.
1105 * Those that don't should pass fixOwner = FALSE.
1107 static void
1108 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
1110 int b = buf->buf_id;
1112 if (fixOwner)
1113 ResourceOwnerForgetBuffer(CurrentResourceOwner,
1114 BufferDescriptorGetBuffer(buf));
1116 Assert(PrivateRefCount[b] > 0);
1117 PrivateRefCount[b]--;
1118 if (PrivateRefCount[b] == 0)
1120 /* I'd better not still hold any locks on the buffer */
1121 Assert(!LWLockHeldByMe(buf->content_lock));
1122 Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
1124 LockBufHdr(buf);
1126 /* Decrement the shared reference count */
1127 Assert(buf->refcount > 0);
1128 buf->refcount--;
1130 /* Support LockBufferForCleanup() */
1131 if ((buf->flags & BM_PIN_COUNT_WAITER) &&
1132 buf->refcount == 1)
1134 /* we just released the last pin other than the waiter's */
1135 int wait_backend_pid = buf->wait_backend_pid;
1137 buf->flags &= ~BM_PIN_COUNT_WAITER;
1138 UnlockBufHdr(buf);
1139 ProcSendSignal(wait_backend_pid);
1141 else
1142 UnlockBufHdr(buf);
1147 * BufferSync -- Write out all dirty buffers in the pool.
1149 * This is called at checkpoint time to write out all dirty shared buffers.
1150 * The checkpoint request flags should be passed in; currently the only one
1151 * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
1153 static void
1154 BufferSync(int flags)
1156 int buf_id;
1157 int num_to_scan;
1158 int num_to_write;
1159 int num_written;
1161 /* Make sure we can handle the pin inside SyncOneBuffer */
1162 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1165 * Loop over all buffers, and mark the ones that need to be written with
1166 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
1167 * can estimate how much work needs to be done.
1169 * This allows us to write only those pages that were dirty when the
1170 * checkpoint began, and not those that get dirtied while it proceeds.
1171 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1172 * later in this function, or by normal backends or the bgwriter cleaning
1173 * scan, the flag is cleared. Any buffer dirtied after this point won't
1174 * have the flag set.
1176 * Note that if we fail to write some buffer, we may leave buffers with
1177 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1178 * certainly need to be written for the next checkpoint attempt, too.
1180 num_to_write = 0;
1181 for (buf_id = 0; buf_id < NBuffers; buf_id++)
1183 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1186 * Header spinlock is enough to examine BM_DIRTY, see comment in
1187 * SyncOneBuffer.
1189 LockBufHdr(bufHdr);
1191 if (bufHdr->flags & BM_DIRTY)
1193 bufHdr->flags |= BM_CHECKPOINT_NEEDED;
1194 num_to_write++;
1197 UnlockBufHdr(bufHdr);
1200 if (num_to_write == 0)
1201 return; /* nothing to do */
1203 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
1206 * Loop over all buffers again, and write the ones (still) marked with
1207 * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
1208 * since we might as well dump soon-to-be-recycled buffers first.
1210 * Note that we don't read the buffer alloc count here --- that should be
1211 * left untouched till the next BgBufferSync() call.
1213 buf_id = StrategySyncStart(NULL, NULL);
1214 num_to_scan = NBuffers;
1215 num_written = 0;
1216 while (num_to_scan-- > 0)
1218 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1221 * We don't need to acquire the lock here, because we're only looking
1222 * at a single bit. It's possible that someone else writes the buffer
1223 * and clears the flag right after we check, but that doesn't matter
1224 * since SyncOneBuffer will then do nothing. However, there is a
1225 * further race condition: it's conceivable that between the time we
1226 * examine the bit here and the time SyncOneBuffer acquires lock,
1227 * someone else not only wrote the buffer but replaced it with another
1228 * page and dirtied it. In that improbable case, SyncOneBuffer will
1229 * write the buffer though we didn't need to. It doesn't seem worth
1230 * guarding against this, though.
1232 if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
1234 if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
1236 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1237 BgWriterStats.m_buf_written_checkpoints++;
1238 num_written++;
1241 * We know there are at most num_to_write buffers with
1242 * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
1243 * num_written reaches num_to_write.
1245 * Note that num_written doesn't include buffers written by
1246 * other backends, or by the bgwriter cleaning scan. That
1247 * means that the estimate of how much progress we've made is
1248 * conservative, and also that this test will often fail to
1249 * trigger. But it seems worth making anyway.
1251 if (num_written >= num_to_write)
1252 break;
1255 * Perform normal bgwriter duties and sleep to throttle our
1256 * I/O rate.
1258 CheckpointWriteDelay(flags,
1259 (double) num_written / num_to_write);
1263 if (++buf_id >= NBuffers)
1264 buf_id = 0;
1268 * Update checkpoint statistics. As noted above, this doesn't include
1269 * buffers written by other backends or bgwriter scan.
1271 CheckpointStats.ckpt_bufs_written += num_written;
1273 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
1277 * BgBufferSync -- Write out some dirty buffers in the pool.
1279 * This is called periodically by the background writer process.
1281 void
1282 BgBufferSync(void)
1284 /* info obtained from freelist.c */
1285 int strategy_buf_id;
1286 uint32 strategy_passes;
1287 uint32 recent_alloc;
1290 * Information saved between calls so we can determine the strategy
1291 * point's advance rate and avoid scanning already-cleaned buffers.
1293 static bool saved_info_valid = false;
1294 static int prev_strategy_buf_id;
1295 static uint32 prev_strategy_passes;
1296 static int next_to_clean;
1297 static uint32 next_passes;
1299 /* Moving averages of allocation rate and clean-buffer density */
1300 static float smoothed_alloc = 0;
1301 static float smoothed_density = 10.0;
1303 /* Potentially these could be tunables, but for now, not */
1304 float smoothing_samples = 16;
1305 float scan_whole_pool_milliseconds = 120000.0;
1307 /* Used to compute how far we scan ahead */
1308 long strategy_delta;
1309 int bufs_to_lap;
1310 int bufs_ahead;
1311 float scans_per_alloc;
1312 int reusable_buffers_est;
1313 int upcoming_alloc_est;
1314 int min_scan_buffers;
1316 /* Variables for the scanning loop proper */
1317 int num_to_scan;
1318 int num_written;
1319 int reusable_buffers;
1322 * Find out where the freelist clock sweep currently is, and how many
1323 * buffer allocations have happened since our last call.
1325 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
1327 /* Report buffer alloc counts to pgstat */
1328 BgWriterStats.m_buf_alloc += recent_alloc;
1331 * If we're not running the LRU scan, just stop after doing the stats
1332 * stuff. We mark the saved state invalid so that we can recover sanely
1333 * if LRU scan is turned back on later.
1335 if (bgwriter_lru_maxpages <= 0)
1337 saved_info_valid = false;
1338 return;
1342 * Compute strategy_delta = how many buffers have been scanned by the
1343 * clock sweep since last time. If first time through, assume none. Then
1344 * see if we are still ahead of the clock sweep, and if so, how many
1345 * buffers we could scan before we'd catch up with it and "lap" it. Note:
1346 * weird-looking coding of xxx_passes comparisons are to avoid bogus
1347 * behavior when the passes counts wrap around.
1349 if (saved_info_valid)
1351 int32 passes_delta = strategy_passes - prev_strategy_passes;
1353 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
1354 strategy_delta += (long) passes_delta *NBuffers;
1356 Assert(strategy_delta >= 0);
1358 if ((int32) (next_passes - strategy_passes) > 0)
1360 /* we're one pass ahead of the strategy point */
1361 bufs_to_lap = strategy_buf_id - next_to_clean;
1362 #ifdef BGW_DEBUG
1363 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1364 next_passes, next_to_clean,
1365 strategy_passes, strategy_buf_id,
1366 strategy_delta, bufs_to_lap);
1367 #endif
1369 else if (next_passes == strategy_passes &&
1370 next_to_clean >= strategy_buf_id)
1372 /* on same pass, but ahead or at least not behind */
1373 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
1374 #ifdef BGW_DEBUG
1375 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
1376 next_passes, next_to_clean,
1377 strategy_passes, strategy_buf_id,
1378 strategy_delta, bufs_to_lap);
1379 #endif
1381 else
1384 * We're behind, so skip forward to the strategy point and start
1385 * cleaning from there.
1387 #ifdef BGW_DEBUG
1388 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
1389 next_passes, next_to_clean,
1390 strategy_passes, strategy_buf_id,
1391 strategy_delta);
1392 #endif
1393 next_to_clean = strategy_buf_id;
1394 next_passes = strategy_passes;
1395 bufs_to_lap = NBuffers;
1398 else
1401 * Initializing at startup or after LRU scanning had been off. Always
1402 * start at the strategy point.
1404 #ifdef BGW_DEBUG
1405 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
1406 strategy_passes, strategy_buf_id);
1407 #endif
1408 strategy_delta = 0;
1409 next_to_clean = strategy_buf_id;
1410 next_passes = strategy_passes;
1411 bufs_to_lap = NBuffers;
1414 /* Update saved info for next time */
1415 prev_strategy_buf_id = strategy_buf_id;
1416 prev_strategy_passes = strategy_passes;
1417 saved_info_valid = true;
1420 * Compute how many buffers had to be scanned for each new allocation, ie,
1421 * 1/density of reusable buffers, and track a moving average of that.
1423 * If the strategy point didn't move, we don't update the density estimate
1425 if (strategy_delta > 0 && recent_alloc > 0)
1427 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1428 smoothed_density += (scans_per_alloc - smoothed_density) /
1429 smoothing_samples;
1433 * Estimate how many reusable buffers there are between the current
1434 * strategy point and where we've scanned ahead to, based on the smoothed
1435 * density estimate.
1437 bufs_ahead = NBuffers - bufs_to_lap;
1438 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
1441 * Track a moving average of recent buffer allocations. Here, rather than
1442 * a true average we want a fast-attack, slow-decline behavior: we
1443 * immediately follow any increase.
1445 if (smoothed_alloc <= (float) recent_alloc)
1446 smoothed_alloc = recent_alloc;
1447 else
1448 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
1449 smoothing_samples;
1451 /* Scale the estimate by a GUC to allow more aggressive tuning. */
1452 upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
1455 * Even in cases where there's been little or no buffer allocation
1456 * activity, we want to make a small amount of progress through the buffer
1457 * cache so that as many reusable buffers as possible are clean after an
1458 * idle period.
1460 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
1461 * the BGW will be called during the scan_whole_pool time; slice the
1462 * buffer pool into that many sections.
1464 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
1466 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
1468 #ifdef BGW_DEBUG
1469 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
1470 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
1471 #endif
1472 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
1476 * Now write out dirty reusable buffers, working forward from the
1477 * next_to_clean point, until we have lapped the strategy scan, or cleaned
1478 * enough buffers to match our estimate of the next cycle's allocation
1479 * requirements, or hit the bgwriter_lru_maxpages limit.
1482 /* Make sure we can handle the pin inside SyncOneBuffer */
1483 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1485 num_to_scan = bufs_to_lap;
1486 num_written = 0;
1487 reusable_buffers = reusable_buffers_est;
1489 /* Execute the LRU scan */
1490 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
1492 int buffer_state = SyncOneBuffer(next_to_clean, true);
1494 if (++next_to_clean >= NBuffers)
1496 next_to_clean = 0;
1497 next_passes++;
1499 num_to_scan--;
1501 if (buffer_state & BUF_WRITTEN)
1503 reusable_buffers++;
1504 if (++num_written >= bgwriter_lru_maxpages)
1506 BgWriterStats.m_maxwritten_clean++;
1507 break;
1510 else if (buffer_state & BUF_REUSABLE)
1511 reusable_buffers++;
1514 BgWriterStats.m_buf_written_clean += num_written;
1516 #ifdef BGW_DEBUG
1517 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
1518 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
1519 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
1520 bufs_to_lap - num_to_scan,
1521 num_written,
1522 reusable_buffers - reusable_buffers_est);
1523 #endif
1526 * Consider the above scan as being like a new allocation scan.
1527 * Characterize its density and update the smoothed one based on it. This
1528 * effectively halves the moving average period in cases where both the
1529 * strategy and the background writer are doing some useful scanning,
1530 * which is helpful because a long memory isn't as desirable on the
1531 * density estimates.
1533 strategy_delta = bufs_to_lap - num_to_scan;
1534 recent_alloc = reusable_buffers - reusable_buffers_est;
1535 if (strategy_delta > 0 && recent_alloc > 0)
1537 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
1538 smoothed_density += (scans_per_alloc - smoothed_density) /
1539 smoothing_samples;
1541 #ifdef BGW_DEBUG
1542 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
1543 recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
1544 #endif
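
/*
 * Illustrative worked example (hypothetical, not part of PostgreSQL): the
 * pacing arithmetic used by BgBufferSync above, evaluated for one assumed
 * set of inputs.  All input values and the BUFMGR_USAGE_EXAMPLE guard are
 * invented for illustration.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_bgwriter_pacing_math(void)
{
	/* assumed configuration: 16384 shared buffers, default 200ms delay */
	int			nbuffers = 16384;
	int			bgwriter_delay_ms = 200;
	double		lru_multiplier = 2.0;

	/* assumed state carried over from previous calls */
	double		smoothed_alloc = 500.0;		/* recent allocations per round */
	double		smoothed_density = 10.0;	/* buffers scanned per allocation */
	int			bufs_ahead = 4000;			/* how far we've scanned ahead */

	/* same formulas as in BgBufferSync */
	int			reusable_buffers_est = (int) (bufs_ahead / smoothed_density);	/* = 400 */
	int			upcoming_alloc_est = (int) (smoothed_alloc * lru_multiplier);	/* = 1000 */
	int			min_scan_buffers = (int) (nbuffers / (120000.0 / bgwriter_delay_ms));	/* = 27 */

	/*
	 * Since 1000 >= 27 + 400, upcoming_alloc_est is left alone.  The LRU
	 * scan then runs until it has found about 1000 - 400 = 600 more reusable
	 * buffers, i.e. it scans roughly 600 * 10 = 6000 buffers at the assumed
	 * density, unless it hits bgwriter_lru_maxpages or laps the clock sweep.
	 */
	(void) reusable_buffers_est;
	(void) upcoming_alloc_est;
	(void) min_scan_buffers;
}
#endif   /* BUFMGR_USAGE_EXAMPLE */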
1549 * SyncOneBuffer -- process a single buffer during syncing.
1551 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
1552 * buffers marked recently used, as these are not replacement candidates.
1554 * Returns a bitmask containing the following flag bits:
1555 * BUF_WRITTEN: we wrote the buffer.
1556 * BUF_REUSABLE: buffer is available for replacement, ie, it has
1557 * pin count 0 and usage count 0.
1559 * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
1560 * after locking it, but we don't care all that much.)
1562 * Note: caller must have done ResourceOwnerEnlargeBuffers.
1564 static int
1565 SyncOneBuffer(int buf_id, bool skip_recently_used)
1567 volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
1568 int result = 0;
1571 * Check whether buffer needs writing.
1573 * We can make this check without taking the buffer content lock so long
1574 * as we mark pages dirty in access methods *before* logging changes with
1575 * XLogInsert(): if someone marks the buffer dirty just after our check we
1576 * don't worry because our checkpoint.redo points before log record for
1577 * upcoming changes and so we are not required to write such dirty buffer.
1579 LockBufHdr(bufHdr);
1581 if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
1582 result |= BUF_REUSABLE;
1583 else if (skip_recently_used)
1585 /* Caller told us not to write recently-used buffers */
1586 UnlockBufHdr(bufHdr);
1587 return result;
1590 if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
1592 /* It's clean, so nothing to do */
1593 UnlockBufHdr(bufHdr);
1594 return result;
1598 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
1599 * buffer is clean by the time we've locked it.)
1601 PinBuffer_Locked(bufHdr);
1602 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
1604 FlushBuffer(bufHdr, NULL);
1606 LWLockRelease(bufHdr->content_lock);
1607 UnpinBuffer(bufHdr, true);
1609 return result | BUF_WRITTEN;
1614 * Return a palloc'd string containing buffer usage statistics.
1616 char *
1617 ShowBufferUsage(void)
1619 StringInfoData str;
1620 float hitrate;
1621 float localhitrate;
1623 initStringInfo(&str);
1625 if (ReadBufferCount == 0)
1626 hitrate = 0.0;
1627 else
1628 hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
1630 if (ReadLocalBufferCount == 0)
1631 localhitrate = 0.0;
1632 else
1633 localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
1635 appendStringInfo(&str,
1636 "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1637 ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
1638 appendStringInfo(&str,
1639 "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
1640 ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
1641 appendStringInfo(&str,
1642 "!\tDirect blocks: %10ld read, %10ld written\n",
1643 BufFileReadCount, BufFileWriteCount);
1645 return str.data;
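
/*
 * Illustrative worked example (hypothetical numbers): if a backend has made
 * 1000 ReadBuffer calls of which 900 were satisfied from shared buffers, the
 * report above shows 100 shared blocks "read" (the misses, i.e.
 * ReadBufferCount - BufferHitCount) and a hit rate of 90.00%.
 */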
1648 void
1649 ResetBufferUsage(void)
1651 BufferHitCount = 0;
1652 ReadBufferCount = 0;
1653 BufferFlushCount = 0;
1654 LocalBufferHitCount = 0;
1655 ReadLocalBufferCount = 0;
1656 LocalBufferFlushCount = 0;
1657 BufFileReadCount = 0;
1658 BufFileWriteCount = 0;
1662 * AtEOXact_Buffers - clean up at end of transaction.
1664 * As of PostgreSQL 8.0, buffer pins should get released by the
1665 * ResourceOwner mechanism. This routine is just a debugging
1666 * cross-check that no pins remain.
1668 void
1669 AtEOXact_Buffers(bool isCommit)
1671 #ifdef USE_ASSERT_CHECKING
1672 if (assert_enabled)
1674 int i;
1676 for (i = 0; i < NBuffers; i++)
1678 Assert(PrivateRefCount[i] == 0);
1681 #endif
1683 AtEOXact_LocalBuffers(isCommit);
1687 * InitBufferPoolBackend --- second-stage initialization of a new backend
1689 * This is called after we have acquired a PGPROC and so can safely get
1690 * LWLocks. We don't currently need to do anything at this stage ...
1691 * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
1692 * access, and thereby has to be called at the corresponding phase of
1693 * backend shutdown.
1695 void
1696 InitBufferPoolBackend(void)
1698 on_shmem_exit(AtProcExit_Buffers, 0);
1702 * Ensure we have released all shared-buffer locks and pins during backend exit
1704 static void
1705 AtProcExit_Buffers(int code, Datum arg)
1707 int i;
1709 AbortBufferIO();
1710 UnlockBuffers();
1712 for (i = 0; i < NBuffers; i++)
1714 if (PrivateRefCount[i] != 0)
1716 volatile BufferDesc *buf = &(BufferDescriptors[i]);
1719 * We don't worry about updating ResourceOwner; if we even got
1720 * here, it suggests that ResourceOwners are messed up.
1722 PrivateRefCount[i] = 1; /* make sure we release shared pin */
1723 UnpinBuffer(buf, false);
1724 Assert(PrivateRefCount[i] == 0);
1728 /* localbuf.c needs a chance too */
1729 AtProcExit_LocalBuffers();
1733 * Helper routine to issue warnings when a buffer is unexpectedly pinned
1735 void
1736 PrintBufferLeakWarning(Buffer buffer)
1738 volatile BufferDesc *buf;
1739 int32 loccount;
1740 char *path;
1742 Assert(BufferIsValid(buffer));
1743 if (BufferIsLocal(buffer))
1745 buf = &LocalBufferDescriptors[-buffer - 1];
1746 loccount = LocalRefCount[-buffer - 1];
1748 else
1750 buf = &BufferDescriptors[buffer - 1];
1751 loccount = PrivateRefCount[buffer - 1];
1754 /* theoretically we should lock the bufhdr here */
1755 path = relpath(buf->tag.rnode, buf->tag.forkNum);
1756 elog(WARNING,
1757 "buffer refcount leak: [%03d] "
1758 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
1759 buffer, path,
1760 buf->tag.blockNum, buf->flags,
1761 buf->refcount, loccount);
1762 pfree(path);
1766 * CheckPointBuffers
1768 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
1770 * Note: temporary relations do not participate in checkpoints, so they don't
1771 * need to be flushed.
1773 void
1774 CheckPointBuffers(int flags)
1776 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
1777 CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
1778 BufferSync(flags);
1779 CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
1780 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
1781 smgrsync();
1782 CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
1783 TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
1788 * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
1790 void
1791 BufmgrCommit(void)
1793 /* Nothing to do in bufmgr anymore... */
1797 * BufferGetBlockNumber
1798 * Returns the block number associated with a buffer.
1800 * Note:
1801 * Assumes that the buffer is valid and pinned, else the
1802 * value may be obsolete immediately...
1804 BlockNumber
1805 BufferGetBlockNumber(Buffer buffer)
1807 volatile BufferDesc *bufHdr;
1809 Assert(BufferIsPinned(buffer));
1811 if (BufferIsLocal(buffer))
1812 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1813 else
1814 bufHdr = &BufferDescriptors[buffer - 1];
1816 /* pinned, so OK to read tag without spinlock */
1817 return bufHdr->tag.blockNum;
1821 * BufferGetTag
1822 * Returns the relfilenode, fork number and block number associated with
1823 * a buffer.
1825 void
1826 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
1827 BlockNumber *blknum)
1829 volatile BufferDesc *bufHdr;
1831 /* Do the same checks as BufferGetBlockNumber. */
1832 Assert(BufferIsPinned(buffer));
1834 if (BufferIsLocal(buffer))
1835 bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1836 else
1837 bufHdr = &BufferDescriptors[buffer - 1];
1839 /* pinned, so OK to read tag without spinlock */
1840 *rnode = bufHdr->tag.rnode;
1841 *forknum = bufHdr->tag.forkNum;
1842 *blknum = bufHdr->tag.blockNum;
1846 * FlushBuffer
1847 * Physically write out a shared buffer.
1849 * NOTE: this actually just passes the buffer contents to the kernel; the
1850 * real write to disk won't happen until the kernel feels like it. This
1851 * is okay from our point of view since we can redo the changes from WAL.
1852 * However, we will need to force the changes to disk via fsync before
1853 * we can checkpoint WAL.
1855 * The caller must hold a pin on the buffer and have share-locked the
1856 * buffer contents. (Note: a share-lock does not prevent updates of
1857 * hint bits in the buffer, so the page could change while the write
1858 * is in progress, but we assume that that will not invalidate the data
1859 * written.)
1861 * If the caller has an smgr reference for the buffer's relation, pass it
1862 * as the second parameter. If not, pass NULL.
1864 static void
1865 FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
1867 XLogRecPtr recptr;
1868 ErrorContextCallback errcontext;
1871 * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
1872 * false, then someone else flushed the buffer before we could, so we need
1873 * not do anything.
1875 if (!StartBufferIO(buf, false))
1876 return;
1878 /* Setup error traceback support for ereport() */
1879 errcontext.callback = buffer_write_error_callback;
1880 errcontext.arg = (void *) buf;
1881 errcontext.previous = error_context_stack;
1882 error_context_stack = &errcontext;
1884 /* Find smgr relation for buffer */
1885 if (reln == NULL)
1886 reln = smgropen(buf->tag.rnode);
1888 TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
1889 buf->tag.blockNum,
1890 reln->smgr_rnode.spcNode,
1891 reln->smgr_rnode.dbNode,
1892 reln->smgr_rnode.relNode);
1895 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
1896 * rule that log updates must hit disk before any of the data-file changes
1897 * they describe do.
1899 recptr = BufferGetLSN(buf);
1900 XLogFlush(recptr);
1903 * Now it's safe to write buffer to disk. Note that no one else should
1904 * have been able to write it while we were busy with log flushing because
1905 * we have the io_in_progress lock.
1908 /* To check if block content changes while flushing. - vadim 01/17/97 */
1909 LockBufHdr(buf);
1910 buf->flags &= ~BM_JUST_DIRTIED;
1911 UnlockBufHdr(buf);
1913 smgrwrite(reln,
1914 buf->tag.forkNum,
1915 buf->tag.blockNum,
1916 (char *) BufHdrGetBlock(buf),
1917 false);
1919 BufferFlushCount++;
1922 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
1923 * end the io_in_progress state.
1925 TerminateBufferIO(buf, true, 0);
1927 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
1928 buf->tag.blockNum,
1929 reln->smgr_rnode.spcNode,
1930 reln->smgr_rnode.dbNode,
1931 reln->smgr_rnode.relNode);
1933 /* Pop the error context stack */
1934 error_context_stack = errcontext.previous;
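/*
 * Illustrative sketch (an assumption, not taken from this file; see the
 * heapam WAL-logging code for the real thing): the LSN consulted above is
 * put on the page by whoever dirtied it, roughly like
 *
 *		MarkBufferDirty(buffer);
 *		recptr = XLogInsert(rmid, info, rdata);
 *		PageSetLSN(page, recptr);
 *		PageSetTLI(page, ThisTimeLineID);
 *
 * where rmid/info/rdata stand for the caller's WAL record description.
 * That convention is what makes XLogFlush(BufferGetLSN(buf)) sufficient to
 * force the describing WAL record to disk before the data page goes out.
 */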
1938 * RelationGetNumberOfBlocks
1939 * Determines the current number of pages in the relation.
1941 BlockNumber
1942 RelationGetNumberOfBlocks(Relation relation)
1944 /* Open it at the smgr level if not already done */
1945 RelationOpenSmgr(relation);
1947 return smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
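/*
 * Illustrative sketch (not from this file): a simple sequential pass over a
 * relation is typically driven by this function, e.g.
 *
 *		BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
 *		BlockNumber blkno;
 *
 *		for (blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			Buffer		buf = ReadBuffer(rel, blkno);
 *
 *			... process BufferGetPage(buf) under a content lock ...
 *			ReleaseBuffer(buf);
 *		}
 *
 * "rel" is a hypothetical Relation; note the block count can become stale
 * as soon as it is returned if other backends are extending the relation.
 */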
1950 /* ---------------------------------------------------------------------
1951 * DropRelFileNodeBuffers
1953 * This function removes from the buffer pool all the pages of the
1954 * specified relation that have block numbers >= firstDelBlock.
1955 * (In particular, with firstDelBlock = 0, all pages are removed.)
1956 * Dirty pages are simply dropped, without bothering to write them
1957 * out first. Therefore, this is NOT rollback-able, and so should be
1958 * used only with extreme caution!
1960 * Currently, this is called only from smgr.c when the underlying file
1961 * is about to be deleted or truncated (firstDelBlock is needed for
1962 * the truncation case). The data in the affected pages would therefore
1963 * be deleted momentarily anyway, and there is no point in writing it.
1964 * It is the responsibility of higher-level code to ensure that the
1965 * deletion or truncation does not lose any data that could be needed
1966 * later. It is also the responsibility of higher-level code to ensure
1967 * that no other process could be trying to load more pages of the
1968 * relation into buffers.
1970 * XXX currently it sequentially searches the buffer pool, should be
1971 * changed to more clever ways of searching. However, this routine
1972 * is used only in code paths that aren't very performance-critical,
1973 * and we shouldn't slow down the hot paths to make it faster ...
1974 * --------------------------------------------------------------------
1976 void
1977 DropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, bool istemp,
1978 BlockNumber firstDelBlock)
1980 int i;
1982 if (istemp)
1984 DropRelFileNodeLocalBuffers(rnode, forkNum, firstDelBlock);
1985 return;
1988 for (i = 0; i < NBuffers; i++)
1990 volatile BufferDesc *bufHdr = &BufferDescriptors[i];
1992 LockBufHdr(bufHdr);
1993 if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1994 bufHdr->tag.forkNum == forkNum &&
1995 bufHdr->tag.blockNum >= firstDelBlock)
1996 InvalidateBuffer(bufHdr); /* releases spinlock */
1997 else
1998 UnlockBufHdr(bufHdr);
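/*
 * Illustrative sketch (hedged; smgr.c holds the authoritative callers): the
 * truncation path is expected to discard the doomed buffers first and only
 * then shorten the physical file, roughly
 *
 *		DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks);
 *		... then truncate the fork to nblocks at the md/file level ...
 *
 * while outright deletion passes firstDelBlock = 0 so that every page of
 * the fork is invalidated before the file disappears.
 */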
2002 /* ---------------------------------------------------------------------
2003 * DropDatabaseBuffers
2005 * This function removes all the buffers in the buffer cache for a
2006 * particular database. Dirty pages are simply dropped, without
2007 * bothering to write them out first. This is used when we destroy a
2008 * database, to avoid trying to flush data to disk when the directory
2009 * tree no longer exists. Implementation is pretty similar to
2010 * DropRelFileNodeBuffers() which is for destroying just one relation.
2011 * --------------------------------------------------------------------
2013 void
2014 DropDatabaseBuffers(Oid dbid)
2016 int i;
2017 volatile BufferDesc *bufHdr;
2020 * We needn't consider local buffers, since by assumption the target
2021 * database isn't our own.
2024 for (i = 0; i < NBuffers; i++)
2026 bufHdr = &BufferDescriptors[i];
2027 LockBufHdr(bufHdr);
2028 if (bufHdr->tag.rnode.dbNode == dbid)
2029 InvalidateBuffer(bufHdr); /* releases spinlock */
2030 else
2031 UnlockBufHdr(bufHdr);
2035 /* -----------------------------------------------------------------
2036 * PrintBufferDescs
2038 * this function prints all the buffer descriptors, for debugging
2039 * use only.
2040 * -----------------------------------------------------------------
2042 #ifdef NOT_USED
2043 void
2044 PrintBufferDescs(void)
2046 int i;
2047 volatile BufferDesc *buf = BufferDescriptors;
2049 for (i = 0; i < NBuffers; ++i, ++buf)
2051 /* theoretically we should lock the bufhdr here */
2052 elog(LOG,
2053 "[%02d] (freeNext=%d, rel=%s, "
2054 "blockNum=%u, flags=0x%x, refcount=%u %d)",
2055 i, buf->freeNext,
2056 relpath(buf->tag.rnode, buf->tag.forkNum),
2057 buf->tag.blockNum, buf->flags,
2058 buf->refcount, PrivateRefCount[i]);
2061 #endif
2063 #ifdef NOT_USED
2064 void
2065 PrintPinnedBufs(void)
2067 int i;
2068 volatile BufferDesc *buf = BufferDescriptors;
2070 for (i = 0; i < NBuffers; ++i, ++buf)
2072 if (PrivateRefCount[i] > 0)
2074 /* theoretically we should lock the bufhdr here */
2075 elog(LOG,
2076 "[%02d] (freeNext=%d, rel=%s, "
2077 "blockNum=%u, flags=0x%x, refcount=%u %d)",
2078 i, buf->freeNext,
2079 relpath(buf->tag.rnode, buf->tag.forkNum),
2080 buf->tag.blockNum, buf->flags,
2081 buf->refcount, PrivateRefCount[i]);
2085 #endif
2087 /* ---------------------------------------------------------------------
2088 * FlushRelationBuffers
2090 * This function writes all dirty pages of a relation out to disk
2091 * (or more accurately, out to kernel disk buffers), ensuring that the
2092 * kernel has an up-to-date view of the relation.
2094 * Generally, the caller should be holding AccessExclusiveLock on the
2095 * target relation to ensure that no other backend is busy dirtying
2096 * more blocks of the relation; the effects can't be expected to last
2097 * after the lock is released.
2099 * XXX currently it sequentially searches the buffer pool, should be
2100 * changed to more clever ways of searching. This routine is not
2101 * used in any performance-critical code paths, so it's not worth
2102 * adding additional overhead to normal paths to make it go faster;
2103 * but see also DropRelFileNodeBuffers.
2104 * --------------------------------------------------------------------
2106 void
2107 FlushRelationBuffers(Relation rel)
2109 int i;
2110 volatile BufferDesc *bufHdr;
2112 /* Open rel at the smgr level if not already done */
2113 RelationOpenSmgr(rel);
2115 if (rel->rd_istemp)
2117 for (i = 0; i < NLocBuffer; i++)
2119 bufHdr = &LocalBufferDescriptors[i];
2120 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2121 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2123 ErrorContextCallback errcontext;
2125 /* Setup error traceback support for ereport() */
2126 errcontext.callback = buffer_write_error_callback;
2127 errcontext.arg = (void *) bufHdr;
2128 errcontext.previous = error_context_stack;
2129 error_context_stack = &errcontext;
2131 smgrwrite(rel->rd_smgr,
2132 bufHdr->tag.forkNum,
2133 bufHdr->tag.blockNum,
2134 (char *) LocalBufHdrGetBlock(bufHdr),
2135 true);
2137 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
2139 /* Pop the error context stack */
2140 error_context_stack = errcontext.previous;
2144 return;
2147 /* Make sure we can handle the pin inside the loop */
2148 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2150 for (i = 0; i < NBuffers; i++)
2152 bufHdr = &BufferDescriptors[i];
2153 LockBufHdr(bufHdr);
2154 if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
2155 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2157 PinBuffer_Locked(bufHdr);
2158 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2159 FlushBuffer(bufHdr, rel->rd_smgr);
2160 LWLockRelease(bufHdr->content_lock);
2161 UnpinBuffer(bufHdr, true);
2163 else
2164 UnlockBufHdr(bufHdr);
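/*
 * Illustrative sketch (not from this file): a caller that needs the kernel
 * to have an up-to-date view of a relation, e.g. before copying its files
 * elsewhere, would typically do something like
 *
 *		Relation	rel = heap_open(relid, AccessExclusiveLock);
 *
 *		FlushRelationBuffers(rel);
 *		... read or copy the underlying files via smgr ...
 *		heap_close(rel, AccessExclusiveLock);
 *
 * The AccessExclusiveLock is what keeps other backends from dirtying more
 * blocks during and after the flush; "relid" is a hypothetical OID.
 */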
2168 /* ---------------------------------------------------------------------
2169 * FlushDatabaseBuffers
2171 * This function writes all dirty pages of a database out to disk
2172 * (or more accurately, out to kernel disk buffers), ensuring that the
2173 * kernel has an up-to-date view of the database.
2175 * Generally, the caller should be holding an appropriate lock to ensure
2176 * no other backend is active in the target database; otherwise more
2177 * pages could get dirtied.
2179 * Note we don't worry about flushing any pages of temporary relations.
2180 * It's assumed these wouldn't be interesting.
2181 * --------------------------------------------------------------------
2183 void
2184 FlushDatabaseBuffers(Oid dbid)
2186 int i;
2187 volatile BufferDesc *bufHdr;
2189 /* Make sure we can handle the pin inside the loop */
2190 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2192 for (i = 0; i < NBuffers; i++)
2194 bufHdr = &BufferDescriptors[i];
2195 LockBufHdr(bufHdr);
2196 if (bufHdr->tag.rnode.dbNode == dbid &&
2197 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
2199 PinBuffer_Locked(bufHdr);
2200 LWLockAcquire(bufHdr->content_lock, LW_SHARED);
2201 FlushBuffer(bufHdr, NULL);
2202 LWLockRelease(bufHdr->content_lock);
2203 UnpinBuffer(bufHdr, true);
2205 else
2206 UnlockBufHdr(bufHdr);
2211 * ReleaseBuffer -- release the pin on a buffer
2213 void
2214 ReleaseBuffer(Buffer buffer)
2216 volatile BufferDesc *bufHdr;
2218 if (!BufferIsValid(buffer))
2219 elog(ERROR, "bad buffer id: %d", buffer);
2221 ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
2223 if (BufferIsLocal(buffer))
2225 Assert(LocalRefCount[-buffer - 1] > 0);
2226 LocalRefCount[-buffer - 1]--;
2227 return;
2230 bufHdr = &BufferDescriptors[buffer - 1];
2232 Assert(PrivateRefCount[buffer - 1] > 0);
2234 if (PrivateRefCount[buffer - 1] > 1)
2235 PrivateRefCount[buffer - 1]--;
2236 else
2237 UnpinBuffer(bufHdr, false);
2241 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
2243 * This is just a shorthand for a common combination.
2245 void
2246 UnlockReleaseBuffer(Buffer buffer)
2248 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2249 ReleaseBuffer(buffer);
2253 * IncrBufferRefCount
2254 * Increment the pin count on a buffer that we have *already* pinned
2255 * at least once.
2257 * This function cannot be used on a buffer we do not have pinned,
2258 * because it doesn't change the shared buffer state.
2260 void
2261 IncrBufferRefCount(Buffer buffer)
2263 Assert(BufferIsPinned(buffer));
2264 ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2265 ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
2266 if (BufferIsLocal(buffer))
2267 LocalRefCount[-buffer - 1]++;
2268 else
2269 PrivateRefCount[buffer - 1]++;
2273 * SetBufferCommitInfoNeedsSave
2275 * Mark a buffer dirty when we have updated tuple commit-status bits in it.
2277 * This is essentially the same as MarkBufferDirty, except that the caller
2278 * might have only share-lock instead of exclusive-lock on the buffer's
2279 * content lock. We preserve the distinction mainly as a way of documenting
2280 * that the caller has not made a critical data change --- the status-bit
2281 * update could be redone by someone else just as easily. Therefore, no WAL
2282 * log record need be generated, whereas calls to MarkBufferDirty really ought
2283 * to be associated with a WAL-entry-creating action.
2285 void
2286 SetBufferCommitInfoNeedsSave(Buffer buffer)
2288 volatile BufferDesc *bufHdr;
2290 if (!BufferIsValid(buffer))
2291 elog(ERROR, "bad buffer id: %d", buffer);
2293 if (BufferIsLocal(buffer))
2295 MarkLocalBufferDirty(buffer);
2296 return;
2299 bufHdr = &BufferDescriptors[buffer - 1];
2301 Assert(PrivateRefCount[buffer - 1] > 0);
2302 /* here, either share or exclusive lock is OK */
2303 Assert(LWLockHeldByMe(bufHdr->content_lock));
2306 * This routine might get called many times on the same page, if we are
2307 * making the first scan after commit of an xact that added/deleted many
2308 * tuples. So, be as quick as we can if the buffer is already dirty. We
2309 * do this by not acquiring spinlock if it looks like the status bits are
2310 * already OK. (Note it is okay if someone else clears BM_JUST_DIRTIED
2311 * immediately after we look, because the buffer content update is already
2312 * done and will be reflected in the I/O.)
2314 if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
2315 (BM_DIRTY | BM_JUST_DIRTIED))
2317 LockBufHdr(bufHdr);
2318 Assert(bufHdr->refcount > 0);
2319 if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
2320 VacuumCostBalance += VacuumCostPageDirty;
2321 bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
2322 UnlockBufHdr(bufHdr);
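/*
 * Illustrative sketch (hedged; the tuple-visibility code is the canonical
 * user): once a visibility check has proven that a tuple's inserting
 * transaction committed, the caller can record that as a hint bit while
 * holding only a pin and a share content lock:
 *
 *		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 *		SetBufferCommitInfoNeedsSave(buffer);
 *
 * No WAL record is written for this; if the page version carrying the hint
 * bit is lost, the bit is simply recomputed the next time the tuple is
 * examined.
 */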
2327 * Release buffer content locks for shared buffers.
2329 * Used to clean up after errors.
2331 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
2332 * of releasing buffer content locks per se; the only thing we need to deal
2333 * with here is clearing any PIN_COUNT request that was in progress.
2335 void
2336 UnlockBuffers(void)
2338 volatile BufferDesc *buf = PinCountWaitBuf;
2340 if (buf)
2342 LockBufHdr(buf);
2345 * Don't complain if flag bit not set; it could have been reset but we
2346 * got a cancel/die interrupt before getting the signal.
2348 if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
2349 buf->wait_backend_pid == MyProcPid)
2350 buf->flags &= ~BM_PIN_COUNT_WAITER;
2352 UnlockBufHdr(buf);
2354 PinCountWaitBuf = NULL;
2359 * Acquire or release the content_lock for the buffer.
2361 void
2362 LockBuffer(Buffer buffer, int mode)
2364 volatile BufferDesc *buf;
2366 Assert(BufferIsValid(buffer));
2367 if (BufferIsLocal(buffer))
2368 return; /* local buffers need no lock */
2370 buf = &(BufferDescriptors[buffer - 1]);
2372 if (mode == BUFFER_LOCK_UNLOCK)
2373 LWLockRelease(buf->content_lock);
2374 else if (mode == BUFFER_LOCK_SHARE)
2375 LWLockAcquire(buf->content_lock, LW_SHARED);
2376 else if (mode == BUFFER_LOCK_EXCLUSIVE)
2377 LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
2378 else
2379 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
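/*
 * Illustrative sketch (not from this file): the usual access pattern built
 * from these primitives is pin, lock, inspect or modify, then unlock and
 * unpin:
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *		Page		page;
 *
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		page = BufferGetPage(buf);
 *		... examine the page; take BUFFER_LOCK_EXCLUSIVE instead to modify ...
 *		UnlockReleaseBuffer(buf);
 *
 * "rel" and "blkno" are hypothetical; ReadBuffer, BufferGetPage and
 * UnlockReleaseBuffer are the real bufmgr entry points.
 */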
2383 * Acquire the content_lock for the buffer, but only if we don't have to wait.
2385 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
2387 bool
2388 ConditionalLockBuffer(Buffer buffer)
2390 volatile BufferDesc *buf;
2392 Assert(BufferIsValid(buffer));
2393 if (BufferIsLocal(buffer))
2394 return true; /* act as though we got it */
2396 buf = &(BufferDescriptors[buffer - 1]);
2398 return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
2402 * LockBufferForCleanup - lock a buffer in preparation for deleting items
2404 * Items may be deleted from a disk page only when the caller (a) holds an
2405 * exclusive lock on the buffer and (b) has observed that no other backend
2406 * holds a pin on the buffer. If there is a pin, then the other backend
2407 * might have a pointer into the buffer (for example, a heapscan reference
2408 * to an item --- see README for more details). It's OK if a pin is added
2409 * after the cleanup starts, however; the newly-arrived backend will be
2410 * unable to look at the page until we release the exclusive lock.
2412 * To implement this protocol, a would-be deleter must pin the buffer and
2413 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
2414 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
2415 * it has successfully observed pin count = 1.
2417 void
2418 LockBufferForCleanup(Buffer buffer)
2420 volatile BufferDesc *bufHdr;
2422 Assert(BufferIsValid(buffer));
2423 Assert(PinCountWaitBuf == NULL);
2425 if (BufferIsLocal(buffer))
2427 /* There should be exactly one pin */
2428 if (LocalRefCount[-buffer - 1] != 1)
2429 elog(ERROR, "incorrect local pin count: %d",
2430 LocalRefCount[-buffer - 1]);
2431 /* Nobody else to wait for */
2432 return;
2435 /* There should be exactly one local pin */
2436 if (PrivateRefCount[buffer - 1] != 1)
2437 elog(ERROR, "incorrect local pin count: %d",
2438 PrivateRefCount[buffer - 1]);
2440 bufHdr = &BufferDescriptors[buffer - 1];
2442 for (;;)
2444 /* Try to acquire lock */
2445 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2446 LockBufHdr(bufHdr);
2447 Assert(bufHdr->refcount > 0);
2448 if (bufHdr->refcount == 1)
2450 /* Successfully acquired exclusive lock with pincount 1 */
2451 UnlockBufHdr(bufHdr);
2452 return;
2454 /* Failed, so mark myself as waiting for pincount 1 */
2455 if (bufHdr->flags & BM_PIN_COUNT_WAITER)
2457 UnlockBufHdr(bufHdr);
2458 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2459 elog(ERROR, "multiple backends attempting to wait for pincount 1");
2461 bufHdr->wait_backend_pid = MyProcPid;
2462 bufHdr->flags |= BM_PIN_COUNT_WAITER;
2463 PinCountWaitBuf = bufHdr;
2464 UnlockBufHdr(bufHdr);
2465 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2466 /* Wait to be signaled by UnpinBuffer() */
2467 ProcWaitForSignal();
2468 PinCountWaitBuf = NULL;
2469 /* Loop back and try again */
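/*
 * Illustrative sketch (hedged; VACUUM is the typical caller): the would-be
 * deleter pins the page first, then upgrades to a cleanup lock before
 * removing items:
 *
 *		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *											 RBM_NORMAL, vac_strategy);
 *
 *		LockBufferForCleanup(buf);
 *		... prune or remove dead items from the page ...
 *		UnlockReleaseBuffer(buf);
 *
 * "rel", "blkno" and "vac_strategy" are hypothetical; the call sequence is
 * what the protocol described above requires.
 */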
2474 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
2476 * We won't loop, but just check once to see if the pin count is OK. If
2477 * not, return FALSE with no lock held.
2479 bool
2480 ConditionalLockBufferForCleanup(Buffer buffer)
2482 volatile BufferDesc *bufHdr;
2484 Assert(BufferIsValid(buffer));
2486 if (BufferIsLocal(buffer))
2488 /* There should be exactly one pin */
2489 Assert(LocalRefCount[-buffer - 1] > 0);
2490 if (LocalRefCount[-buffer - 1] != 1)
2491 return false;
2492 /* Nobody else to wait for */
2493 return true;
2496 /* There should be exactly one local pin */
2497 Assert(PrivateRefCount[buffer - 1] > 0);
2498 if (PrivateRefCount[buffer - 1] != 1)
2499 return false;
2501 /* Try to acquire lock */
2502 if (!ConditionalLockBuffer(buffer))
2503 return false;
2505 bufHdr = &BufferDescriptors[buffer - 1];
2506 LockBufHdr(bufHdr);
2507 Assert(bufHdr->refcount > 0);
2508 if (bufHdr->refcount == 1)
2510 /* Successfully acquired exclusive lock with pincount 1 */
2511 UnlockBufHdr(bufHdr);
2512 return true;
2515 /* Failed, so release the lock */
2516 UnlockBufHdr(bufHdr);
2517 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2518 return false;
2523 * Functions for buffer I/O handling
2525 * Note: We assume that nested buffer I/O never occurs.
2526 * i.e., at most one io_in_progress lock is held per proc.

2528 * Also note that these are used only for shared buffers, not local ones.
2532 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
2534 static void
2535 WaitIO(volatile BufferDesc *buf)
2538 * Changed to wait until there's no IO - Inoue 01/13/2000
2540 * Note this is *necessary* because an error abort in the process doing
2541 * I/O could release the io_in_progress_lock prematurely. See
2542 * AbortBufferIO.
2544 for (;;)
2546 BufFlags sv_flags;
2549 * It may not be necessary to acquire the spinlock to check the flag
2550 * here, but since this test is essential for correctness, we'd better
2551 * play it safe.
2553 LockBufHdr(buf);
2554 sv_flags = buf->flags;
2555 UnlockBufHdr(buf);
2556 if (!(sv_flags & BM_IO_IN_PROGRESS))
2557 break;
2558 LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
2559 LWLockRelease(buf->io_in_progress_lock);
2564 * StartBufferIO: begin I/O on this buffer
2565 * (Assumptions)
2566 * My process is executing no IO
2567 * The buffer is Pinned
2569 * In some scenarios there are race conditions in which multiple backends
2570 * could attempt the same I/O operation concurrently. If someone else
2571 * has already started I/O on this buffer then we will block on the
2572 * io_in_progress lock until he's done.
2574 * Input operations are only attempted on buffers that are not BM_VALID,
2575 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
2576 * so we can always tell if the work is already done.
2578 * Returns TRUE if we successfully marked the buffer as I/O busy,
2579 * FALSE if someone else already did the work.
2581 static bool
2582 StartBufferIO(volatile BufferDesc *buf, bool forInput)
2584 Assert(!InProgressBuf);
2586 for (;;)
2589 * Grab the io_in_progress lock so that other processes can wait for
2590 * me to finish the I/O.
2592 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2594 LockBufHdr(buf);
2596 if (!(buf->flags & BM_IO_IN_PROGRESS))
2597 break;
2600 * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
2601 * lock isn't held is if the process doing the I/O is recovering from
2602 * an error (see AbortBufferIO). If that's the case, we must wait for
2603 * him to get unwedged.
2605 UnlockBufHdr(buf);
2606 LWLockRelease(buf->io_in_progress_lock);
2607 WaitIO(buf);
2610 /* Once we get here, there is definitely no I/O active on this buffer */
2612 if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
2614 /* someone else already did the I/O */
2615 UnlockBufHdr(buf);
2616 LWLockRelease(buf->io_in_progress_lock);
2617 return false;
2620 buf->flags |= BM_IO_IN_PROGRESS;
2622 UnlockBufHdr(buf);
2624 InProgressBuf = buf;
2625 IsForInput = forInput;
2627 return true;
2631 * TerminateBufferIO: release a buffer we were doing I/O on
2632 * (Assumptions)
2633 * My process is executing IO for the buffer
2634 * BM_IO_IN_PROGRESS bit is set for the buffer
2635 * We hold the buffer's io_in_progress lock
2636 * The buffer is Pinned
2638 * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
2639 * buffer's BM_DIRTY flag. This is appropriate when terminating a
2640 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
2641 * marking the buffer clean if it was re-dirtied while we were writing.
2643 * set_flag_bits gets ORed into the buffer's flags. It must include
2644 * BM_IO_ERROR in a failure case. For successful completion it could
2645 * be 0, or BM_VALID if we just finished reading in the page.
2647 static void
2648 TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
2649 int set_flag_bits)
2651 Assert(buf == InProgressBuf);
2653 LockBufHdr(buf);
2655 Assert(buf->flags & BM_IO_IN_PROGRESS);
2656 buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
2657 if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
2658 buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
2659 buf->flags |= set_flag_bits;
2661 UnlockBufHdr(buf);
2663 InProgressBuf = NULL;
2665 LWLockRelease(buf->io_in_progress_lock);
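/*
 * Illustrative sketch (this mirrors what FlushBuffer above already does): a
 * write-out of a shared buffer brackets the physical I/O with these two
 * routines,
 *
 *		if (!StartBufferIO(buf, false))
 *			return;			(someone else already flushed it)
 *		smgrwrite(...);
 *		TerminateBufferIO(buf, true, 0);
 *
 * while a read-in starts with StartBufferIO(buf, true) and finishes with
 * TerminateBufferIO(buf, false, BM_VALID).
 */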
2669 * AbortBufferIO: Clean up any active buffer I/O after an error.
2671 * All LWLocks we might have held have been released,
2672 * but we haven't yet released buffer pins, so the buffer is still pinned.
2674 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
2675 * possible the error condition wasn't related to the I/O.
2677 void
2678 AbortBufferIO(void)
2680 volatile BufferDesc *buf = InProgressBuf;
2682 if (buf)
2685 * Since LWLockReleaseAll has already been called, we're not holding
2686 * the buffer's io_in_progress_lock. We have to re-acquire it so that
2687 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
2688 * buffer will be in a busy spin until we succeed in doing this.
2690 LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
2692 LockBufHdr(buf);
2693 Assert(buf->flags & BM_IO_IN_PROGRESS);
2694 if (IsForInput)
2696 Assert(!(buf->flags & BM_DIRTY));
2697 /* We'd better not think buffer is valid yet */
2698 Assert(!(buf->flags & BM_VALID));
2699 UnlockBufHdr(buf);
2701 else
2703 BufFlags sv_flags;
2705 sv_flags = buf->flags;
2706 Assert(sv_flags & BM_DIRTY);
2707 UnlockBufHdr(buf);
2708 /* Issue notice if this is not the first failure... */
2709 if (sv_flags & BM_IO_ERROR)
2711 /* Buffer is pinned, so we can read tag without spinlock */
2712 char *path = relpath(buf->tag.rnode, buf->tag.forkNum);
2714 ereport(WARNING,
2715 (errcode(ERRCODE_IO_ERROR),
2716 errmsg("could not write block %u of %s",
2717 buf->tag.blockNum, path),
2718 errdetail("Multiple failures --- write error might be permanent.")));
2719 pfree(path);
2722 TerminateBufferIO(buf, false, BM_IO_ERROR);
2727 * Error context callback for errors occurring during buffer writes.
2729 static void
2730 buffer_write_error_callback(void *arg)
2732 volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
2734 /* Buffer is pinned, so we can read the tag without locking the spinlock */
2735 if (bufHdr != NULL)
2737 char *path = relpath(bufHdr->tag.rnode, bufHdr->tag.forkNum);
2739 errcontext("writing block %u of relation %s",
2740 bufHdr->tag.blockNum, path);
2741 pfree(path);