/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "storage/buf_internals.h"
#include "storage/bufmgr.h"

/*
 * The shared freelist control information.
 */
typedef struct
{
	/* Clock sweep hand: index of next buffer to consider grabbing */
	int			nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer;		/* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses;		/* Complete cycles of the clock sweep */
	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[1];		/* VARIABLE SIZE ARRAY */
} BufferAccessStrategyData;

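/*
 * Illustrative note (not part of the original file): because buffers[] is a
 * variable-size trailing array, a ring of N slots lives in one allocation,
 * e.g.
 *
 *		palloc0(offsetof(BufferAccessStrategyData, buffers) +
 *				N * sizeof(Buffer));
 *
 * which is exactly how GetAccessStrategy() below sizes the object.
 */
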
/* Prototypes for internal functions */
static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
static void AddBufferToRing(BufferAccessStrategy strategy,
				volatile BufferDesc *buf);

/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.  If
 *	*lock_held is set on exit, we have returned with the BufFreelistLock
 *	still held, as well; the caller must release that lock once the spinlock
 *	is dropped.  We do it that way because releasing the BufFreelistLock
 *	might awaken other processes, and it would be bad to do the associated
 *	kernel calls while holding the buffer header spinlock.
 */
volatile BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
{
	volatile BufferDesc *buf;
	int			trycounter;

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need the BufFreelistLock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy);
		if (buf != NULL)
		{
			*lock_held = false;
			return buf;
		}
	}

	/* Nope, so lock the freelist */
	*lock_held = true;
	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	StrategyControl->numBufferAllocs++;

	/*
	 * Try to get a buffer from the freelist.  Note that the freeNext fields
	 * are considered to be protected by the BufFreelistLock not the
	 * individual buffer spinlocks, so it's OK to manipulate them without
	 * holding the spinlock.
	 */
	while (StrategyControl->firstFreeBuffer >= 0)
	{
		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

		/* Unconditionally remove buffer from freelist */
		StrategyControl->firstFreeBuffer = buf->freeNext;
		buf->freeNext = FREENEXT_NOT_IN_LIST;

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot
		 * use it; discard it and retry.  (This can only happen if VACUUM put
		 * a valid buffer in the freelist and then someone else used it
		 * before we got to it.  It's probably impossible altogether as of
		 * 8.3, but we'd better check anyway.)
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0 && buf->usage_count == 0)
		{
			if (strategy != NULL)
				AddBufferToRing(strategy, buf);
			return buf;
		}
		UnlockBufHdr(buf);
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

		if (++StrategyControl->nextVictimBuffer >= NBuffers)
		{
			StrategyControl->nextVictimBuffer = 0;
			StrategyControl->completePasses++;
		}

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot
		 * use it; decrement the usage_count (unless pinned) and keep
		 * scanning.
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0)
		{
			if (buf->usage_count > 0)
			{
				buf->usage_count--;
				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state
			 * changes, so all the buffers are pinned (or were when we looked
			 * at them).  We could hope that someone will free one
			 * eventually, but it's probably better to fail than to risk
			 * getting stuck in an infinite loop.
			 */
			UnlockBufHdr(buf);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf);
	}

	/* not reached */
	return NULL;
}

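/*
 * Caller-pattern sketch (an assumption for illustration, not part of the
 * original file; the real caller is BufferAlloc() in bufmgr.c).  It shows
 * the locking contract described above:
 *
 *		bool	lock_held;
 *		volatile BufferDesc *buf;
 *
 *		buf = StrategyGetBuffer(strategy, &lock_held);
 *		... claim the victim while its header spinlock is still held ...
 *		UnlockBufHdr(buf);
 *		if (lock_held)
 *			LWLockRelease(BufFreelistLock);
 *
 * Note the ordering: the header spinlock is dropped first, so any process
 * wakeups caused by releasing the lwlock happen without a spinlock held.
 */
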
/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(volatile BufferDesc *buf)
{
	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	LWLockRelease(BufFreelistLock);
}

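/*
 * Illustrative note (not part of the original file): buffers are pushed at
 * the head, so the freelist behaves as a LIFO stack; lastFreeBuffer only
 * needs updating when the list was previously empty (freeNext < 0).
 */
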
/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
	int			result;

	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
	result = StrategyControl->nextVictimBuffer;
	if (complete_passes)
		*complete_passes = StrategyControl->completePasses;
	if (num_buf_alloc)
	{
		*num_buf_alloc = StrategyControl->numBufferAllocs;
		StrategyControl->numBufferAllocs = 0;
	}
	LWLockRelease(BufFreelistLock);
	return result;
}

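#ifdef NOT_USED
/*
 * Sketch (an assumption, not part of the original file): how a
 * bgwriter-style caller could turn these counters into "how far did the
 * clock hand move since the last reading".  The prev_* values would be
 * static state in the real caller; all names here are hypothetical.
 */
static void
example_read_strategy_counters(uint32 *prev_passes, int *prev_hand)
{
	uint32		passes;
	uint32		recent_alloc;
	int			hand = StrategySyncStart(&passes, &recent_alloc);
	long		hand_movement;

	/* completePasses supplies the high-order bits of the hand position */
	hand_movement = (long) (passes - *prev_passes) * NBuffers +
		(hand - *prev_hand);

	*prev_passes = passes;
	*prev_hand = hand;

	/* a real caller would feed these into its write-rate estimates */
	(void) hand_movement;
	(void) recent_alloc;
}
#endif
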
/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
	Size		size = 0;

	/* size of lookup hash table ... see comment in StrategyInitialize */
	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

	/* size of the shared replacement strategy control block */
	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

	return size;
}

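/*
 * Worked example (not part of the original file): with shared_buffers set
 * to 128MB and the default 8kB BLCKSZ, NBuffers is 16384; since
 * NUM_BUFFER_PARTITIONS is 16, the lookup table is sized for 16400
 * entries, plus one MAXALIGN'd BufferStrategyControl block.
 */
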
/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
	bool		found;

	/*
	 * Initialize the shared buffer lookup hashtable.
	 *
	 * Since we can't tolerate running out of lookup table entries, we must
	 * be sure to specify an adequate table size here.  The maximum
	 * steady-state usage is of course NBuffers entries, but BufferAlloc()
	 * tries to insert a new entry before deleting the old.  In principle
	 * this could be happening in each partition concurrently, so we could
	 * need as many as NBuffers + NUM_BUFFER_PARTITIONS entries.
	 */
	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

	/*
	 * Get or create the shared strategy control block
	 */
	StrategyControl = (BufferStrategyControl *)
		ShmemInitStruct("Buffer Strategy Status",
						sizeof(BufferStrategyControl),
						&found);

	if (!found)
	{
		/*
		 * Only done once, usually in postmaster
		 */
		Assert(init);

		/*
		 * Grab the whole linked list of free buffers for our strategy. We
		 * assume it was previously set up by InitBufferPool().
		 */
		StrategyControl->firstFreeBuffer = 0;
		StrategyControl->lastFreeBuffer = NBuffers - 1;

		/* Initialize the clock sweep pointer */
		StrategyControl->nextVictimBuffer = 0;

		/* Clear statistics */
		StrategyControl->completePasses = 0;
		StrategyControl->numBufferAllocs = 0;
	}
	else
		Assert(!init);
}

/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */

/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	BufferAccessStrategy strategy;
	int			ring_size;

	/*
	 * Select ring size to use.  See buffer/README for rationales. (Currently
	 * all cases are the same size, but keep this code structure for
	 * flexibility.)
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			ring_size = 256 * 1024 / BLCKSZ;
			break;
		case BAS_VACUUM:
			ring_size = 256 * 1024 / BLCKSZ;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}

	/* Make sure ring isn't an undue fraction of shared buffers */
	ring_size = Min(NBuffers / 8, ring_size);

	/* Allocate the object and initialize all elements to zeroes */
	strategy = (BufferAccessStrategy)
		palloc0(offsetof(BufferAccessStrategyData, buffers) +
				ring_size * sizeof(Buffer));

	/* Set fields that don't start out zero */
	strategy->btype = btype;
	strategy->ring_size = ring_size;

	return strategy;
}

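#ifdef NOT_USED
/*
 * Usage sketch (an assumption, not part of the original file): a bulk
 * sequential read might drive the ring like this.  With the default 8kB
 * BLCKSZ, the ring_size chosen above works out to 256kB / 8kB = 32
 * buffers, possibly clamped to NBuffers / 8.  "rel" and "blocknum" are
 * hypothetical.
 */
static void
example_bulkread(Relation rel, BlockNumber blocknum)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	buf = ReadBufferWithStrategy(rel, blocknum, strategy);
	/* ... inspect the page ... */
	ReleaseBuffer(buf);
	FreeAccessStrategy(strategy);
}
#endif
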
/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" strategy */
	if (strategy != NULL)
		pfree(strategy);
}

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static volatile BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy)
{
	volatile BufferDesc *buf;
	Buffer		bufnum;

	/* Advance to next ring slot */
	if (++strategy->current >= strategy->ring_size)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  He will then fill this
	 * slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
	{
		strategy->current_was_in_ring = false;
		return NULL;
	}

	/*
	 * If the buffer is pinned we cannot use it under any circumstances.
	 *
	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
	 * since our own previous usage of the ring element would have left it
	 * there, but it might've been decremented by clock sweep since then). A
	 * higher usage_count indicates someone else has touched the buffer, so
	 * we shouldn't re-use it.
	 */
	buf = &BufferDescriptors[bufnum - 1];
	LockBufHdr(buf);
	if (buf->refcount == 0 && buf->usage_count <= 1)
	{
		strategy->current_was_in_ring = true;
		return buf;
	}
	UnlockBufHdr(buf);

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  He'll then replace this ring element via AddBufferToRing.
	 */
	strategy->current_was_in_ring = false;
	return NULL;
}

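/*
 * Illustrative note (not part of the original file): Buffer numbers are
 * 1-based while BufferDescriptors[] is 0-based, hence the "bufnum - 1"
 * above; BufferDescriptorGetBuffer() performs the inverse mapping.
 */
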
/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}

/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
	/* We only do this in bulkread mode */
	if (strategy->btype != BAS_BULKREAD)
		return false;

	/* Don't muck with behavior of normal buffer-replacement strategy */
	if (!strategy->current_was_in_ring ||
		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
		return false;

	/*
	 * Remove the dirty buffer from the ring; necessary to prevent infinite
	 * loop if all ring members are dirty.
	 */
	strategy->buffers[strategy->current] = InvalidBuffer;

	return true;
}
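
/*
 * Caller-side sketch (an assumption, not part of the original file; the
 * real logic lives in BufferAlloc() in bufmgr.c).  When the selected
 * victim is dirty and writing it would first force a WAL flush, a
 * bulkread caller hands it back and asks for another:
 *
 *		if (strategy != NULL &&
 *			XLogNeedsFlush(BufferGetLSN(buf)) &&
 *			StrategyRejectBuffer(strategy, buf))
 *		{
 *			... unlock buf and loop back to StrategyGetBuffer ...
 *		}
 */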