Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
[PostgreSQL.git] / src / backend / access / transam / xlogutils.c
blob38b819da1f1ce4a368723cffdba6e61f49f88042
1 /*-------------------------------------------------------------------------
3 * xlogutils.c
5 * PostgreSQL transaction log manager utility routines
7 * This file contains support routines that are used by XLOG replay functions.
8 * None of this code is used during normal system operation.
11 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
14 * $PostgreSQL$
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
20 #include "access/xlogutils.h"
21 #include "storage/bufmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/hsearch.h"
24 #include "utils/rel.h"
28 * During XLOG replay, we may see XLOG records for incremental updates of
29 * pages that no longer exist, because their relation was later dropped or
30 * truncated. (Note: this is only possible when full_page_writes = OFF,
31 * since when it's ON, the first reference we see to a page should always
32 * be a full-page rewrite not an incremental update.) Rather than simply
33 * ignoring such records, we make a note of the referenced page, and then
34 * complain if we don't actually see a drop or truncate covering the page
35 * later in replay.
37 typedef struct xl_invalid_page_key
39 RelFileNode node; /* the relation */
40 ForkNumber forkno; /* the fork number */
41 BlockNumber blkno; /* the page */
42 } xl_invalid_page_key;
44 typedef struct xl_invalid_page
46 xl_invalid_page_key key; /* hash key ... must be first */
47 bool present; /* page existed but contained zeroes */
48 } xl_invalid_page;
50 static HTAB *invalid_page_tab = NULL;
53 /* Log a reference to an invalid page */
54 static void
55 log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
56 bool present)
58 xl_invalid_page_key key;
59 xl_invalid_page *hentry;
60 bool found;
63 * Log references to invalid pages at DEBUG1 level. This allows some
64 * tracing of the cause (note the elog context mechanism will tell us
65 * something about the XLOG record that generated the reference).
67 if (present)
68 elog(DEBUG1, "page %u of relation %u/%u/%u/%u is uninitialized",
69 blkno, node.spcNode, node.dbNode, node.relNode, forkno);
70 else
71 elog(DEBUG1, "page %u of relation %u/%u/%u/%u does not exist",
72 blkno, node.spcNode, node.dbNode, node.relNode, forkno);
74 if (invalid_page_tab == NULL)
76 /* create hash table when first needed */
77 HASHCTL ctl;
79 memset(&ctl, 0, sizeof(ctl));
80 ctl.keysize = sizeof(xl_invalid_page_key);
81 ctl.entrysize = sizeof(xl_invalid_page);
82 ctl.hash = tag_hash;
84 invalid_page_tab = hash_create("XLOG invalid-page table",
85 100,
86 &ctl,
87 HASH_ELEM | HASH_FUNCTION);
90 /* we currently assume xl_invalid_page_key contains no padding */
91 key.node = node;
92 key.forkno = forkno;
93 key.blkno = blkno;
94 hentry = (xl_invalid_page *)
95 hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
97 if (!found)
99 /* hash_search already filled in the key */
100 hentry->present = present;
102 else
104 /* repeat reference ... leave "present" as it was */
108 /* Forget any invalid pages >= minblkno, because they've been dropped */
109 static void
110 forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
112 HASH_SEQ_STATUS status;
113 xl_invalid_page *hentry;
115 if (invalid_page_tab == NULL)
116 return; /* nothing to do */
118 hash_seq_init(&status, invalid_page_tab);
120 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
122 if (RelFileNodeEquals(hentry->key.node, node) &&
123 hentry->key.forkno == forkno &&
124 hentry->key.blkno >= minblkno)
126 elog(DEBUG2, "page %u of relation %u/%u/%u/%u has been dropped",
127 hentry->key.blkno, hentry->key.node.spcNode,
128 hentry->key.node.dbNode, hentry->key.node.relNode, forkno);
130 if (hash_search(invalid_page_tab,
131 (void *) &hentry->key,
132 HASH_REMOVE, NULL) == NULL)
133 elog(ERROR, "hash table corrupted");
138 /* Forget any invalid pages in a whole database */
139 static void
140 forget_invalid_pages_db(Oid dbid)
142 HASH_SEQ_STATUS status;
143 xl_invalid_page *hentry;
145 if (invalid_page_tab == NULL)
146 return; /* nothing to do */
148 hash_seq_init(&status, invalid_page_tab);
150 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
152 if (hentry->key.node.dbNode == dbid)
154 elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
155 hentry->key.blkno, hentry->key.node.spcNode,
156 hentry->key.node.dbNode, hentry->key.node.relNode);
158 if (hash_search(invalid_page_tab,
159 (void *) &hentry->key,
160 HASH_REMOVE, NULL) == NULL)
161 elog(ERROR, "hash table corrupted");
166 /* Complain about any remaining invalid-page entries */
167 void
168 XLogCheckInvalidPages(void)
170 HASH_SEQ_STATUS status;
171 xl_invalid_page *hentry;
172 bool foundone = false;
174 if (invalid_page_tab == NULL)
175 return; /* nothing to do */
177 hash_seq_init(&status, invalid_page_tab);
180 * Our strategy is to emit WARNING messages for all remaining entries and
181 * only PANIC after we've dumped all the available info.
183 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
185 if (hentry->present)
186 elog(WARNING, "page %u of relation %u/%u/%u was uninitialized",
187 hentry->key.blkno, hentry->key.node.spcNode,
188 hentry->key.node.dbNode, hentry->key.node.relNode);
189 else
190 elog(WARNING, "page %u of relation %u/%u/%u did not exist",
191 hentry->key.blkno, hentry->key.node.spcNode,
192 hentry->key.node.dbNode, hentry->key.node.relNode);
193 foundone = true;
196 if (foundone)
197 elog(PANIC, "WAL contains references to invalid pages");
199 hash_destroy(invalid_page_tab);
200 invalid_page_tab = NULL;
204 * XLogReadBufferExtended
205 * A shorthand of XLogReadBufferExtended(), for reading from the main
206 * fork.
208 * For historical reasons, instead of a ReadBufferMode argument, this only
209 * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
211 Buffer
212 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
214 return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
215 init ? RBM_ZERO : RBM_NORMAL);
219 * XLogReadBuffer
220 * Read a page during XLOG replay
222 * This is functionally comparable to ReadBuffer followed by
223 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
224 * and locked buffer. (Getting the lock is not really necessary, since we
225 * expect that this is only used during single-process XLOG replay, but
226 * some subroutines such as MarkBufferDirty will complain if we don't.)
228 * There's some differences in the behavior wrt. the "mode" argument,
229 * compared to ReadBufferExtended:
231 * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
232 * return InvalidBuffer. In this case the caller should silently skip the
233 * update on this page. (In this situation, we expect that the page was later
234 * dropped or truncated. If we don't see evidence of that later in the WAL
235 * sequence, we'll complain at the end of WAL replay.)
237 * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
238 * relation is extended with all-zeroes pages up to the given block number.
240 Buffer
241 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
242 BlockNumber blkno, ReadBufferMode mode)
244 BlockNumber lastblock;
245 Buffer buffer;
246 SMgrRelation smgr;
248 Assert(blkno != P_NEW);
250 /* Open the relation at smgr level */
251 smgr = smgropen(rnode);
254 * Create the target file if it doesn't already exist. This lets us cope
255 * if the replay sequence contains writes to a relation that is later
256 * deleted. (The original coding of this routine would instead suppress
257 * the writes, but that seems like it risks losing valuable data if the
258 * filesystem loses an inode during a crash. Better to write the data
259 * until we are actually told to delete the file.)
261 smgrcreate(smgr, forknum, false, true);
263 lastblock = smgrnblocks(smgr, forknum);
265 if (blkno < lastblock)
267 /* page exists in file */
268 buffer = ReadBufferWithoutRelcache(rnode, false, forknum, blkno,
269 mode, NULL);
271 else
273 /* hm, page doesn't exist in file */
274 if (mode == RBM_NORMAL)
276 log_invalid_page(rnode, forknum, blkno, false);
277 return InvalidBuffer;
279 /* OK to extend the file */
280 /* we do this in recovery only - no rel-extension lock needed */
281 Assert(InRecovery);
282 buffer = InvalidBuffer;
283 while (blkno >= lastblock)
285 if (buffer != InvalidBuffer)
286 ReleaseBuffer(buffer);
287 buffer = ReadBufferWithoutRelcache(rnode, false, forknum,
288 P_NEW, mode, NULL);
289 lastblock++;
291 Assert(BufferGetBlockNumber(buffer) == blkno);
294 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
296 if (mode == RBM_NORMAL)
298 /* check that page has been initialized */
299 Page page = (Page) BufferGetPage(buffer);
301 if (PageIsNew(page))
303 UnlockReleaseBuffer(buffer);
304 log_invalid_page(rnode, forknum, blkno, true);
305 return InvalidBuffer;
309 return buffer;
314 * Struct actually returned by XLogFakeRelcacheEntry, though the declared
315 * return type is Relation.
317 typedef struct
319 RelationData reldata; /* Note: this must be first */
320 FormData_pg_class pgc;
321 } FakeRelCacheEntryData;
323 typedef FakeRelCacheEntryData *FakeRelCacheEntry;
326 * Create a fake relation cache entry for a physical relation
328 * It's often convenient to use the same functions in XLOG replay as in the
329 * main codepath, but those functions typically work with a relcache entry.
330 * We don't have a working relation cache during XLOG replay, but this
331 * function can be used to create a fake relcache entry instead. Only the
332 * fields related to physical storage, like rd_rel, are initialized, so the
333 * fake entry is only usable in low-level operations like ReadBuffer().
335 * Caller must free the returned entry with FreeFakeRelcacheEntry().
337 Relation
338 CreateFakeRelcacheEntry(RelFileNode rnode)
340 FakeRelCacheEntry fakeentry;
341 Relation rel;
343 /* Allocate the Relation struct and all related space in one block. */
344 fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
345 rel = (Relation) fakeentry;
347 rel->rd_rel = &fakeentry->pgc;
348 rel->rd_node = rnode;
350 /* We don't know the name of the relation; use relfilenode instead */
351 sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
354 * We set up the lockRelId in case anything tries to lock the dummy
355 * relation. Note that this is fairly bogus since relNode may be
356 * different from the relation's OID. It shouldn't really matter
357 * though, since we are presumably running by ourselves and can't have
358 * any lock conflicts ...
360 rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
361 rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
363 rel->rd_targblock = InvalidBlockNumber;
364 rel->rd_fsm_nblocks_cache = InvalidBlockNumber;
365 rel->rd_smgr = NULL;
367 return rel;
371 * Free a fake relation cache entry.
373 void
374 FreeFakeRelcacheEntry(Relation fakerel)
376 pfree(fakerel);
380 * Drop a relation during XLOG replay
382 * This is called when the relation is about to be deleted; we need to remove
383 * any open "invalid-page" records for the relation.
385 void
386 XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
388 forget_invalid_pages(rnode, forknum, 0);
392 * Drop a whole database during XLOG replay
394 * As above, but for DROP DATABASE instead of dropping a single rel
396 void
397 XLogDropDatabase(Oid dbid)
400 * This is unnecessarily heavy-handed, as it will close SMgrRelation
401 * objects for other databases as well. DROP DATABASE occurs seldom
402 * enough that it's not worth introducing a variant of smgrclose for
403 * just this purpose. XXX: Or should we rather leave the smgr entries
404 * dangling?
406 smgrcloseall();
408 forget_invalid_pages_db(dbid);
412 * Truncate a relation during XLOG replay
414 * We need to clean up any open "invalid-page" records for the dropped pages.
416 void
417 XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
418 BlockNumber nblocks)
420 forget_invalid_pages(rnode, forkNum, nblocks);