1 /*-------------------------------------------------------------------------
5 * PostgreSQL transaction log manager utility routines
7 * This file contains support routines that are used by XLOG replay functions.
8 * None of this code is used during normal system operation.
11 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
16 *-------------------------------------------------------------------------
20 #include "access/xlogutils.h"
21 #include "storage/bufmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/hsearch.h"
24 #include "utils/rel.h"
/*
 * During XLOG replay, we may see XLOG records for incremental updates of
 * pages that no longer exist, because their relation was later dropped or
 * truncated.  (Note: this is only possible when full_page_writes = OFF,
 * since when it's ON, the first reference we see to a page should always
 * be a full-page rewrite, not an incremental update.)  Rather than simply
 * ignoring such records, we make a note of the referenced page, and then
 * complain if we don't actually see a drop or truncate covering the page
 * later in replay.
 */
37 typedef struct xl_invalid_page_key
39 RelFileNode node
; /* the relation */
40 ForkNumber forkno
; /* the fork number */
41 BlockNumber blkno
; /* the page */
42 } xl_invalid_page_key
;
44 typedef struct xl_invalid_page
46 xl_invalid_page_key key
; /* hash key ... must be first */
47 bool present
; /* page existed but contained zeroes */
50 static HTAB
*invalid_page_tab
= NULL
;
53 /* Log a reference to an invalid page */
55 log_invalid_page(RelFileNode node
, ForkNumber forkno
, BlockNumber blkno
,
58 xl_invalid_page_key key
;
59 xl_invalid_page
*hentry
;
63 * Log references to invalid pages at DEBUG1 level. This allows some
64 * tracing of the cause (note the elog context mechanism will tell us
65 * something about the XLOG record that generated the reference).
68 elog(DEBUG1
, "page %u of relation %u/%u/%u/%u is uninitialized",
69 blkno
, node
.spcNode
, node
.dbNode
, node
.relNode
, forkno
);
71 elog(DEBUG1
, "page %u of relation %u/%u/%u/%u does not exist",
72 blkno
, node
.spcNode
, node
.dbNode
, node
.relNode
, forkno
);
74 if (invalid_page_tab
== NULL
)
76 /* create hash table when first needed */
79 memset(&ctl
, 0, sizeof(ctl
));
80 ctl
.keysize
= sizeof(xl_invalid_page_key
);
81 ctl
.entrysize
= sizeof(xl_invalid_page
);
84 invalid_page_tab
= hash_create("XLOG invalid-page table",
87 HASH_ELEM
| HASH_FUNCTION
);
90 /* we currently assume xl_invalid_page_key contains no padding */
94 hentry
= (xl_invalid_page
*)
95 hash_search(invalid_page_tab
, (void *) &key
, HASH_ENTER
, &found
);
99 /* hash_search already filled in the key */
100 hentry
->present
= present
;
104 /* repeat reference ... leave "present" as it was */
108 /* Forget any invalid pages >= minblkno, because they've been dropped */
110 forget_invalid_pages(RelFileNode node
, ForkNumber forkno
, BlockNumber minblkno
)
112 HASH_SEQ_STATUS status
;
113 xl_invalid_page
*hentry
;
115 if (invalid_page_tab
== NULL
)
116 return; /* nothing to do */
118 hash_seq_init(&status
, invalid_page_tab
);
120 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
122 if (RelFileNodeEquals(hentry
->key
.node
, node
) &&
123 hentry
->key
.forkno
== forkno
&&
124 hentry
->key
.blkno
>= minblkno
)
126 elog(DEBUG2
, "page %u of relation %u/%u/%u/%u has been dropped",
127 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
128 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
, forkno
);
130 if (hash_search(invalid_page_tab
,
131 (void *) &hentry
->key
,
132 HASH_REMOVE
, NULL
) == NULL
)
133 elog(ERROR
, "hash table corrupted");
138 /* Forget any invalid pages in a whole database */
140 forget_invalid_pages_db(Oid dbid
)
142 HASH_SEQ_STATUS status
;
143 xl_invalid_page
*hentry
;
145 if (invalid_page_tab
== NULL
)
146 return; /* nothing to do */
148 hash_seq_init(&status
, invalid_page_tab
);
150 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
152 if (hentry
->key
.node
.dbNode
== dbid
)
154 elog(DEBUG2
, "page %u of relation %u/%u/%u has been dropped",
155 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
156 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
158 if (hash_search(invalid_page_tab
,
159 (void *) &hentry
->key
,
160 HASH_REMOVE
, NULL
) == NULL
)
161 elog(ERROR
, "hash table corrupted");
166 /* Complain about any remaining invalid-page entries */
168 XLogCheckInvalidPages(void)
170 HASH_SEQ_STATUS status
;
171 xl_invalid_page
*hentry
;
172 bool foundone
= false;
174 if (invalid_page_tab
== NULL
)
175 return; /* nothing to do */
177 hash_seq_init(&status
, invalid_page_tab
);
180 * Our strategy is to emit WARNING messages for all remaining entries and
181 * only PANIC after we've dumped all the available info.
183 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
186 elog(WARNING
, "page %u of relation %u/%u/%u was uninitialized",
187 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
188 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
190 elog(WARNING
, "page %u of relation %u/%u/%u did not exist",
191 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
192 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
197 elog(PANIC
, "WAL contains references to invalid pages");
199 hash_destroy(invalid_page_tab
);
200 invalid_page_tab
= NULL
;
206 * Read a page during XLOG replay
208 * This is functionally comparable to ReadBuffer followed by
209 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
210 * and locked buffer. (Getting the lock is not really necessary, since we
211 * expect that this is only used during single-process XLOG replay, but
212 * some subroutines such as MarkBufferDirty will complain if we don't.)
214 * If "init" is true then the caller intends to rewrite the page fully
215 * using the info in the XLOG record. In this case we will extend the
216 * relation if needed to make the page exist, and we will not complain about
217 * the page being "new" (all zeroes); in fact, we usually will supply a
218 * zeroed buffer without reading the page at all, so as to avoid unnecessary
219 * failure if the page is present on disk but has corrupt headers.
221 * If "init" is false then the caller needs the page to be valid already.
222 * If the page doesn't exist or contains zeroes, we return InvalidBuffer.
223 * In this case the caller should silently skip the update on this page.
224 * (In this situation, we expect that the page was later dropped or truncated.
225 * If we don't see evidence of that later in the WAL sequence, we'll complain
226 * at the end of WAL replay.)
229 XLogReadBuffer(RelFileNode rnode
, BlockNumber blkno
, bool init
)
231 return XLogReadBufferWithFork(rnode
, MAIN_FORKNUM
, blkno
, init
);
235 * XLogReadBufferWithFork
236 * Like XLogReadBuffer, but for reading other relation forks than
240 XLogReadBufferWithFork(RelFileNode rnode
, ForkNumber forknum
,
241 BlockNumber blkno
, bool init
)
243 BlockNumber lastblock
;
247 Assert(blkno
!= P_NEW
);
249 /* Open the relation at smgr level */
250 smgr
= smgropen(rnode
);
253 * Create the target file if it doesn't already exist. This lets us cope
254 * if the replay sequence contains writes to a relation that is later
255 * deleted. (The original coding of this routine would instead suppress
256 * the writes, but that seems like it risks losing valuable data if the
257 * filesystem loses an inode during a crash. Better to write the data
258 * until we are actually told to delete the file.)
260 smgrcreate(smgr
, forknum
, false, true);
262 lastblock
= smgrnblocks(smgr
, forknum
);
264 if (blkno
< lastblock
)
266 /* page exists in file */
267 buffer
= ReadBufferWithoutRelcache(rnode
, false, forknum
, blkno
, init
);
271 /* hm, page doesn't exist in file */
274 log_invalid_page(rnode
, forknum
, blkno
, false);
275 return InvalidBuffer
;
277 /* OK to extend the file */
278 /* we do this in recovery only - no rel-extension lock needed */
280 buffer
= InvalidBuffer
;
281 while (blkno
>= lastblock
)
283 if (buffer
!= InvalidBuffer
)
284 ReleaseBuffer(buffer
);
285 buffer
= ReadBufferWithoutRelcache(rnode
, false, forknum
,
289 Assert(BufferGetBlockNumber(buffer
) == blkno
);
292 LockBuffer(buffer
, BUFFER_LOCK_EXCLUSIVE
);
296 /* check that page has been initialized */
297 Page page
= (Page
) BufferGetPage(buffer
);
301 UnlockReleaseBuffer(buffer
);
302 log_invalid_page(rnode
, forknum
, blkno
, true);
303 return InvalidBuffer
;
312 * Struct actually returned by XLogFakeRelcacheEntry, though the declared
313 * return type is Relation.
317 RelationData reldata
; /* Note: this must be first */
318 FormData_pg_class pgc
;
319 } FakeRelCacheEntryData
;
321 typedef FakeRelCacheEntryData
*FakeRelCacheEntry
;
324 * Create a fake relation cache entry for a physical relation
326 * It's often convenient to use the same functions in XLOG replay as in the
327 * main codepath, but those functions typically work with a relcache entry.
328 * We don't have a working relation cache during XLOG replay, but this
329 * function can be used to create a fake relcache entry instead. Only the
330 * fields related to physical storage, like rd_rel, are initialized, so the
331 * fake entry is only usable in low-level operations like ReadBuffer().
333 * Caller must free the returned entry with FreeFakeRelcacheEntry().
336 CreateFakeRelcacheEntry(RelFileNode rnode
)
338 FakeRelCacheEntry fakeentry
;
341 /* Allocate the Relation struct and all related space in one block. */
342 fakeentry
= palloc0(sizeof(FakeRelCacheEntryData
));
343 rel
= (Relation
) fakeentry
;
345 rel
->rd_rel
= &fakeentry
->pgc
;
346 rel
->rd_node
= rnode
;
348 /* We don't know the name of the relation; use relfilenode instead */
349 sprintf(RelationGetRelationName(rel
), "%u", rnode
.relNode
);
352 * We set up the lockRelId in case anything tries to lock the dummy
353 * relation. Note that this is fairly bogus since relNode may be
354 * different from the relation's OID. It shouldn't really matter
355 * though, since we are presumably running by ourselves and can't have
356 * any lock conflicts ...
358 rel
->rd_lockInfo
.lockRelId
.dbId
= rnode
.dbNode
;
359 rel
->rd_lockInfo
.lockRelId
.relId
= rnode
.relNode
;
361 rel
->rd_targblock
= InvalidBlockNumber
;
368 * Free a fake relation cache entry.
371 FreeFakeRelcacheEntry(Relation fakerel
)
377 * Drop a relation during XLOG replay
379 * This is called when the relation is about to be deleted; we need to remove
380 * any open "invalid-page" records for the relation.
383 XLogDropRelation(RelFileNode rnode
, ForkNumber forknum
)
385 forget_invalid_pages(rnode
, forknum
, 0);
389 * Drop a whole database during XLOG replay
391 * As above, but for DROP DATABASE instead of dropping a single rel
394 XLogDropDatabase(Oid dbid
)
397 * This is unnecessarily heavy-handed, as it will close SMgrRelation
398 * objects for other databases as well. DROP DATABASE occurs seldom
399 * enough that it's not worth introducing a variant of smgrclose for
400 * just this purpose. XXX: Or should we rather leave the smgr entries
405 forget_invalid_pages_db(dbid
);
409 * Truncate a relation during XLOG replay
411 * We need to clean up any open "invalid-page" records for the dropped pages.
414 XLogTruncateRelation(RelFileNode rnode
, ForkNumber forkNum
,
417 forget_invalid_pages(rnode
, forkNum
, nblocks
);