Introduce the concept of relation forks. An smgr relation can now consist
[PostgreSQL.git] / src / backend / access / transam / xlogutils.c
blobbb2c40bfaaec9d2e93b94e6bc159ae3219724b59
1 /*-------------------------------------------------------------------------
3 * xlogutils.c
5 * PostgreSQL transaction log manager utility routines
7 * This file contains support routines that are used by XLOG replay functions.
8 * None of this code is used during normal system operation.
11 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
14 * $PostgreSQL$
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
20 #include "access/xlogutils.h"
21 #include "storage/bufmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/hsearch.h"
24 #include "utils/rel.h"
28 * During XLOG replay, we may see XLOG records for incremental updates of
29 * pages that no longer exist, because their relation was later dropped or
30 * truncated. (Note: this is only possible when full_page_writes = OFF,
31 * since when it's ON, the first reference we see to a page should always
32 * be a full-page rewrite not an incremental update.) Rather than simply
33 * ignoring such records, we make a note of the referenced page, and then
34 * complain if we don't actually see a drop or truncate covering the page
35 * later in replay.
37 typedef struct xl_invalid_page_key
39 RelFileNode node; /* the relation */
40 ForkNumber forkno; /* the fork number */
41 BlockNumber blkno; /* the page */
42 } xl_invalid_page_key;
44 typedef struct xl_invalid_page
46 xl_invalid_page_key key; /* hash key ... must be first */
47 bool present; /* page existed but contained zeroes */
48 } xl_invalid_page;
50 static HTAB *invalid_page_tab = NULL;
53 /* Log a reference to an invalid page */
54 static void
55 log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
56 bool present)
58 xl_invalid_page_key key;
59 xl_invalid_page *hentry;
60 bool found;
63 * Log references to invalid pages at DEBUG1 level. This allows some
64 * tracing of the cause (note the elog context mechanism will tell us
65 * something about the XLOG record that generated the reference).
67 if (present)
68 elog(DEBUG1, "page %u of relation %u/%u/%u/%u is uninitialized",
69 blkno, node.spcNode, node.dbNode, node.relNode, forkno);
70 else
71 elog(DEBUG1, "page %u of relation %u/%u/%u/%u does not exist",
72 blkno, node.spcNode, node.dbNode, node.relNode, forkno);
74 if (invalid_page_tab == NULL)
76 /* create hash table when first needed */
77 HASHCTL ctl;
79 memset(&ctl, 0, sizeof(ctl));
80 ctl.keysize = sizeof(xl_invalid_page_key);
81 ctl.entrysize = sizeof(xl_invalid_page);
82 ctl.hash = tag_hash;
84 invalid_page_tab = hash_create("XLOG invalid-page table",
85 100,
86 &ctl,
87 HASH_ELEM | HASH_FUNCTION);
90 /* we currently assume xl_invalid_page_key contains no padding */
91 key.node = node;
92 key.forkno = forkno;
93 key.blkno = blkno;
94 hentry = (xl_invalid_page *)
95 hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
97 if (!found)
99 /* hash_search already filled in the key */
100 hentry->present = present;
102 else
104 /* repeat reference ... leave "present" as it was */
108 /* Forget any invalid pages >= minblkno, because they've been dropped */
109 static void
110 forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
112 HASH_SEQ_STATUS status;
113 xl_invalid_page *hentry;
115 if (invalid_page_tab == NULL)
116 return; /* nothing to do */
118 hash_seq_init(&status, invalid_page_tab);
120 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
122 if (RelFileNodeEquals(hentry->key.node, node) &&
123 hentry->key.forkno == forkno &&
124 hentry->key.blkno >= minblkno)
126 elog(DEBUG2, "page %u of relation %u/%u/%u/%u has been dropped",
127 hentry->key.blkno, hentry->key.node.spcNode,
128 hentry->key.node.dbNode, hentry->key.node.relNode, forkno);
130 if (hash_search(invalid_page_tab,
131 (void *) &hentry->key,
132 HASH_REMOVE, NULL) == NULL)
133 elog(ERROR, "hash table corrupted");
138 /* Forget any invalid pages in a whole database */
139 static void
140 forget_invalid_pages_db(Oid dbid)
142 HASH_SEQ_STATUS status;
143 xl_invalid_page *hentry;
145 if (invalid_page_tab == NULL)
146 return; /* nothing to do */
148 hash_seq_init(&status, invalid_page_tab);
150 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
152 if (hentry->key.node.dbNode == dbid)
154 elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
155 hentry->key.blkno, hentry->key.node.spcNode,
156 hentry->key.node.dbNode, hentry->key.node.relNode);
158 if (hash_search(invalid_page_tab,
159 (void *) &hentry->key,
160 HASH_REMOVE, NULL) == NULL)
161 elog(ERROR, "hash table corrupted");
166 /* Complain about any remaining invalid-page entries */
167 void
168 XLogCheckInvalidPages(void)
170 HASH_SEQ_STATUS status;
171 xl_invalid_page *hentry;
172 bool foundone = false;
174 if (invalid_page_tab == NULL)
175 return; /* nothing to do */
177 hash_seq_init(&status, invalid_page_tab);
180 * Our strategy is to emit WARNING messages for all remaining entries and
181 * only PANIC after we've dumped all the available info.
183 while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
185 if (hentry->present)
186 elog(WARNING, "page %u of relation %u/%u/%u was uninitialized",
187 hentry->key.blkno, hentry->key.node.spcNode,
188 hentry->key.node.dbNode, hentry->key.node.relNode);
189 else
190 elog(WARNING, "page %u of relation %u/%u/%u did not exist",
191 hentry->key.blkno, hentry->key.node.spcNode,
192 hentry->key.node.dbNode, hentry->key.node.relNode);
193 foundone = true;
196 if (foundone)
197 elog(PANIC, "WAL contains references to invalid pages");
199 hash_destroy(invalid_page_tab);
200 invalid_page_tab = NULL;
205 * XLogReadBuffer
206 * Read a page during XLOG replay
208 * This is functionally comparable to ReadBuffer followed by
209 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
210 * and locked buffer. (Getting the lock is not really necessary, since we
211 * expect that this is only used during single-process XLOG replay, but
212 * some subroutines such as MarkBufferDirty will complain if we don't.)
214 * If "init" is true then the caller intends to rewrite the page fully
215 * using the info in the XLOG record. In this case we will extend the
216 * relation if needed to make the page exist, and we will not complain about
217 * the page being "new" (all zeroes); in fact, we usually will supply a
218 * zeroed buffer without reading the page at all, so as to avoid unnecessary
219 * failure if the page is present on disk but has corrupt headers.
221 * If "init" is false then the caller needs the page to be valid already.
222 * If the page doesn't exist or contains zeroes, we return InvalidBuffer.
223 * In this case the caller should silently skip the update on this page.
224 * (In this situation, we expect that the page was later dropped or truncated.
225 * If we don't see evidence of that later in the WAL sequence, we'll complain
226 * at the end of WAL replay.)
228 Buffer
229 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
231 return XLogReadBufferWithFork(rnode, MAIN_FORKNUM, blkno, init);
235 * XLogReadBufferWithFork
236 * Like XLogReadBuffer, but for reading other relation forks than
237 * the main one.
239 Buffer
240 XLogReadBufferWithFork(RelFileNode rnode, ForkNumber forknum,
241 BlockNumber blkno, bool init)
243 BlockNumber lastblock;
244 Buffer buffer;
245 SMgrRelation smgr;
247 Assert(blkno != P_NEW);
249 /* Open the relation at smgr level */
250 smgr = smgropen(rnode);
253 * Create the target file if it doesn't already exist. This lets us cope
254 * if the replay sequence contains writes to a relation that is later
255 * deleted. (The original coding of this routine would instead suppress
256 * the writes, but that seems like it risks losing valuable data if the
257 * filesystem loses an inode during a crash. Better to write the data
258 * until we are actually told to delete the file.)
260 smgrcreate(smgr, forknum, false, true);
262 lastblock = smgrnblocks(smgr, forknum);
264 if (blkno < lastblock)
266 /* page exists in file */
267 buffer = ReadBufferWithoutRelcache(rnode, false, forknum, blkno, init);
269 else
271 /* hm, page doesn't exist in file */
272 if (!init)
274 log_invalid_page(rnode, forknum, blkno, false);
275 return InvalidBuffer;
277 /* OK to extend the file */
278 /* we do this in recovery only - no rel-extension lock needed */
279 Assert(InRecovery);
280 buffer = InvalidBuffer;
281 while (blkno >= lastblock)
283 if (buffer != InvalidBuffer)
284 ReleaseBuffer(buffer);
285 buffer = ReadBufferWithoutRelcache(rnode, false, forknum,
286 P_NEW, false);
287 lastblock++;
289 Assert(BufferGetBlockNumber(buffer) == blkno);
292 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
294 if (!init)
296 /* check that page has been initialized */
297 Page page = (Page) BufferGetPage(buffer);
299 if (PageIsNew(page))
301 UnlockReleaseBuffer(buffer);
302 log_invalid_page(rnode, forknum, blkno, true);
303 return InvalidBuffer;
307 return buffer;
312 * Struct actually returned by XLogFakeRelcacheEntry, though the declared
313 * return type is Relation.
315 typedef struct
317 RelationData reldata; /* Note: this must be first */
318 FormData_pg_class pgc;
319 } FakeRelCacheEntryData;
321 typedef FakeRelCacheEntryData *FakeRelCacheEntry;
324 * Create a fake relation cache entry for a physical relation
326 * It's often convenient to use the same functions in XLOG replay as in the
327 * main codepath, but those functions typically work with a relcache entry.
328 * We don't have a working relation cache during XLOG replay, but this
329 * function can be used to create a fake relcache entry instead. Only the
330 * fields related to physical storage, like rd_rel, are initialized, so the
331 * fake entry is only usable in low-level operations like ReadBuffer().
333 * Caller must free the returned entry with FreeFakeRelcacheEntry().
335 Relation
336 CreateFakeRelcacheEntry(RelFileNode rnode)
338 FakeRelCacheEntry fakeentry;
339 Relation rel;
341 /* Allocate the Relation struct and all related space in one block. */
342 fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
343 rel = (Relation) fakeentry;
345 rel->rd_rel = &fakeentry->pgc;
346 rel->rd_node = rnode;
348 /* We don't know the name of the relation; use relfilenode instead */
349 sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
352 * We set up the lockRelId in case anything tries to lock the dummy
353 * relation. Note that this is fairly bogus since relNode may be
354 * different from the relation's OID. It shouldn't really matter
355 * though, since we are presumably running by ourselves and can't have
356 * any lock conflicts ...
358 rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
359 rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
361 rel->rd_targblock = InvalidBlockNumber;
362 rel->rd_smgr = NULL;
364 return rel;
368 * Free a fake relation cache entry.
370 void
371 FreeFakeRelcacheEntry(Relation fakerel)
373 pfree(fakerel);
377 * Drop a relation during XLOG replay
379 * This is called when the relation is about to be deleted; we need to remove
380 * any open "invalid-page" records for the relation.
382 void
383 XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
385 forget_invalid_pages(rnode, forknum, 0);
389 * Drop a whole database during XLOG replay
391 * As above, but for DROP DATABASE instead of dropping a single rel
393 void
394 XLogDropDatabase(Oid dbid)
397 * This is unnecessarily heavy-handed, as it will close SMgrRelation
398 * objects for other databases as well. DROP DATABASE occurs seldom
399 * enough that it's not worth introducing a variant of smgrclose for
400 * just this purpose. XXX: Or should we rather leave the smgr entries
401 * dangling?
403 smgrcloseall();
405 forget_invalid_pages_db(dbid);
409 * Truncate a relation during XLOG replay
411 * We need to clean up any open "invalid-page" records for the dropped pages.
413 void
414 XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
415 BlockNumber nblocks)
417 forget_invalid_pages(rnode, forkNum, nblocks);