/*-------------------------------------------------------------------------
 *
 * PostgreSQL transaction log manager utility routines
 *
 * This file contains support routines that are used by XLOG replay functions.
 * None of this code is used during normal system operation.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
20 #include "access/xlogutils.h"
21 #include "storage/bufmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/hsearch.h"
24 #include "utils/rel.h"
28 * During XLOG replay, we may see XLOG records for incremental updates of
29 * pages that no longer exist, because their relation was later dropped or
30 * truncated. (Note: this is only possible when full_page_writes = OFF,
31 * since when it's ON, the first reference we see to a page should always
32 * be a full-page rewrite not an incremental update.) Rather than simply
33 * ignoring such records, we make a note of the referenced page, and then
34 * complain if we don't actually see a drop or truncate covering the page
37 typedef struct xl_invalid_page_key
39 RelFileNode node
; /* the relation */
40 ForkNumber forkno
; /* the fork number */
41 BlockNumber blkno
; /* the page */
42 } xl_invalid_page_key
;
44 typedef struct xl_invalid_page
46 xl_invalid_page_key key
; /* hash key ... must be first */
47 bool present
; /* page existed but contained zeroes */
50 static HTAB
*invalid_page_tab
= NULL
;
53 /* Log a reference to an invalid page */
55 log_invalid_page(RelFileNode node
, ForkNumber forkno
, BlockNumber blkno
,
58 xl_invalid_page_key key
;
59 xl_invalid_page
*hentry
;
63 * Log references to invalid pages at DEBUG1 level. This allows some
64 * tracing of the cause (note the elog context mechanism will tell us
65 * something about the XLOG record that generated the reference).
68 elog(DEBUG1
, "page %u of relation %u/%u/%u/%u is uninitialized",
69 blkno
, node
.spcNode
, node
.dbNode
, node
.relNode
, forkno
);
71 elog(DEBUG1
, "page %u of relation %u/%u/%u/%u does not exist",
72 blkno
, node
.spcNode
, node
.dbNode
, node
.relNode
, forkno
);
74 if (invalid_page_tab
== NULL
)
76 /* create hash table when first needed */
79 memset(&ctl
, 0, sizeof(ctl
));
80 ctl
.keysize
= sizeof(xl_invalid_page_key
);
81 ctl
.entrysize
= sizeof(xl_invalid_page
);
84 invalid_page_tab
= hash_create("XLOG invalid-page table",
87 HASH_ELEM
| HASH_FUNCTION
);
90 /* we currently assume xl_invalid_page_key contains no padding */
94 hentry
= (xl_invalid_page
*)
95 hash_search(invalid_page_tab
, (void *) &key
, HASH_ENTER
, &found
);
99 /* hash_search already filled in the key */
100 hentry
->present
= present
;
104 /* repeat reference ... leave "present" as it was */
108 /* Forget any invalid pages >= minblkno, because they've been dropped */
110 forget_invalid_pages(RelFileNode node
, ForkNumber forkno
, BlockNumber minblkno
)
112 HASH_SEQ_STATUS status
;
113 xl_invalid_page
*hentry
;
115 if (invalid_page_tab
== NULL
)
116 return; /* nothing to do */
118 hash_seq_init(&status
, invalid_page_tab
);
120 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
122 if (RelFileNodeEquals(hentry
->key
.node
, node
) &&
123 hentry
->key
.forkno
== forkno
&&
124 hentry
->key
.blkno
>= minblkno
)
126 elog(DEBUG2
, "page %u of relation %u/%u/%u/%u has been dropped",
127 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
128 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
, forkno
);
130 if (hash_search(invalid_page_tab
,
131 (void *) &hentry
->key
,
132 HASH_REMOVE
, NULL
) == NULL
)
133 elog(ERROR
, "hash table corrupted");
138 /* Forget any invalid pages in a whole database */
140 forget_invalid_pages_db(Oid dbid
)
142 HASH_SEQ_STATUS status
;
143 xl_invalid_page
*hentry
;
145 if (invalid_page_tab
== NULL
)
146 return; /* nothing to do */
148 hash_seq_init(&status
, invalid_page_tab
);
150 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
152 if (hentry
->key
.node
.dbNode
== dbid
)
154 elog(DEBUG2
, "page %u of relation %u/%u/%u has been dropped",
155 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
156 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
158 if (hash_search(invalid_page_tab
,
159 (void *) &hentry
->key
,
160 HASH_REMOVE
, NULL
) == NULL
)
161 elog(ERROR
, "hash table corrupted");
166 /* Complain about any remaining invalid-page entries */
168 XLogCheckInvalidPages(void)
170 HASH_SEQ_STATUS status
;
171 xl_invalid_page
*hentry
;
172 bool foundone
= false;
174 if (invalid_page_tab
== NULL
)
175 return; /* nothing to do */
177 hash_seq_init(&status
, invalid_page_tab
);
180 * Our strategy is to emit WARNING messages for all remaining entries and
181 * only PANIC after we've dumped all the available info.
183 while ((hentry
= (xl_invalid_page
*) hash_seq_search(&status
)) != NULL
)
186 elog(WARNING
, "page %u of relation %u/%u/%u was uninitialized",
187 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
188 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
190 elog(WARNING
, "page %u of relation %u/%u/%u did not exist",
191 hentry
->key
.blkno
, hentry
->key
.node
.spcNode
,
192 hentry
->key
.node
.dbNode
, hentry
->key
.node
.relNode
);
197 elog(PANIC
, "WAL contains references to invalid pages");
199 hash_destroy(invalid_page_tab
);
200 invalid_page_tab
= NULL
;
204 * XLogReadBufferExtended
205 * A shorthand of XLogReadBufferExtended(), for reading from the main
208 * For historical reasons, instead of a ReadBufferMode argument, this only
209 * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
212 XLogReadBuffer(RelFileNode rnode
, BlockNumber blkno
, bool init
)
214 return XLogReadBufferExtended(rnode
, MAIN_FORKNUM
, blkno
,
215 init
? RBM_ZERO
: RBM_NORMAL
);
220 * Read a page during XLOG replay
222 * This is functionally comparable to ReadBuffer followed by
223 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
224 * and locked buffer. (Getting the lock is not really necessary, since we
225 * expect that this is only used during single-process XLOG replay, but
226 * some subroutines such as MarkBufferDirty will complain if we don't.)
228 * There's some differences in the behavior wrt. the "mode" argument,
229 * compared to ReadBufferExtended:
231 * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
232 * return InvalidBuffer. In this case the caller should silently skip the
233 * update on this page. (In this situation, we expect that the page was later
234 * dropped or truncated. If we don't see evidence of that later in the WAL
235 * sequence, we'll complain at the end of WAL replay.)
237 * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
238 * relation is extended with all-zeroes pages up to the given block number.
241 XLogReadBufferExtended(RelFileNode rnode
, ForkNumber forknum
,
242 BlockNumber blkno
, ReadBufferMode mode
)
244 BlockNumber lastblock
;
248 Assert(blkno
!= P_NEW
);
250 /* Open the relation at smgr level */
251 smgr
= smgropen(rnode
);
254 * Create the target file if it doesn't already exist. This lets us cope
255 * if the replay sequence contains writes to a relation that is later
256 * deleted. (The original coding of this routine would instead suppress
257 * the writes, but that seems like it risks losing valuable data if the
258 * filesystem loses an inode during a crash. Better to write the data
259 * until we are actually told to delete the file.)
261 smgrcreate(smgr
, forknum
, false, true);
263 lastblock
= smgrnblocks(smgr
, forknum
);
265 if (blkno
< lastblock
)
267 /* page exists in file */
268 buffer
= ReadBufferWithoutRelcache(rnode
, false, forknum
, blkno
,
273 /* hm, page doesn't exist in file */
274 if (mode
== RBM_NORMAL
)
276 log_invalid_page(rnode
, forknum
, blkno
, false);
277 return InvalidBuffer
;
279 /* OK to extend the file */
280 /* we do this in recovery only - no rel-extension lock needed */
282 buffer
= InvalidBuffer
;
283 while (blkno
>= lastblock
)
285 if (buffer
!= InvalidBuffer
)
286 ReleaseBuffer(buffer
);
287 buffer
= ReadBufferWithoutRelcache(rnode
, false, forknum
,
291 Assert(BufferGetBlockNumber(buffer
) == blkno
);
294 LockBuffer(buffer
, BUFFER_LOCK_EXCLUSIVE
);
296 if (mode
== RBM_NORMAL
)
298 /* check that page has been initialized */
299 Page page
= (Page
) BufferGetPage(buffer
);
303 UnlockReleaseBuffer(buffer
);
304 log_invalid_page(rnode
, forknum
, blkno
, true);
305 return InvalidBuffer
;
314 * Struct actually returned by XLogFakeRelcacheEntry, though the declared
315 * return type is Relation.
319 RelationData reldata
; /* Note: this must be first */
320 FormData_pg_class pgc
;
321 } FakeRelCacheEntryData
;
323 typedef FakeRelCacheEntryData
*FakeRelCacheEntry
;
326 * Create a fake relation cache entry for a physical relation
328 * It's often convenient to use the same functions in XLOG replay as in the
329 * main codepath, but those functions typically work with a relcache entry.
330 * We don't have a working relation cache during XLOG replay, but this
331 * function can be used to create a fake relcache entry instead. Only the
332 * fields related to physical storage, like rd_rel, are initialized, so the
333 * fake entry is only usable in low-level operations like ReadBuffer().
335 * Caller must free the returned entry with FreeFakeRelcacheEntry().
338 CreateFakeRelcacheEntry(RelFileNode rnode
)
340 FakeRelCacheEntry fakeentry
;
343 /* Allocate the Relation struct and all related space in one block. */
344 fakeentry
= palloc0(sizeof(FakeRelCacheEntryData
));
345 rel
= (Relation
) fakeentry
;
347 rel
->rd_rel
= &fakeentry
->pgc
;
348 rel
->rd_node
= rnode
;
350 /* We don't know the name of the relation; use relfilenode instead */
351 sprintf(RelationGetRelationName(rel
), "%u", rnode
.relNode
);
354 * We set up the lockRelId in case anything tries to lock the dummy
355 * relation. Note that this is fairly bogus since relNode may be
356 * different from the relation's OID. It shouldn't really matter
357 * though, since we are presumably running by ourselves and can't have
358 * any lock conflicts ...
360 rel
->rd_lockInfo
.lockRelId
.dbId
= rnode
.dbNode
;
361 rel
->rd_lockInfo
.lockRelId
.relId
= rnode
.relNode
;
363 rel
->rd_targblock
= InvalidBlockNumber
;
364 rel
->rd_fsm_nblocks_cache
= InvalidBlockNumber
;
371 * Free a fake relation cache entry.
374 FreeFakeRelcacheEntry(Relation fakerel
)
380 * Drop a relation during XLOG replay
382 * This is called when the relation is about to be deleted; we need to remove
383 * any open "invalid-page" records for the relation.
386 XLogDropRelation(RelFileNode rnode
, ForkNumber forknum
)
388 forget_invalid_pages(rnode
, forknum
, 0);
392 * Drop a whole database during XLOG replay
394 * As above, but for DROP DATABASE instead of dropping a single rel
397 XLogDropDatabase(Oid dbid
)
400 * This is unnecessarily heavy-handed, as it will close SMgrRelation
401 * objects for other databases as well. DROP DATABASE occurs seldom
402 * enough that it's not worth introducing a variant of smgrclose for
403 * just this purpose. XXX: Or should we rather leave the smgr entries
408 forget_invalid_pages_db(dbid
);
412 * Truncate a relation during XLOG replay
414 * We need to clean up any open "invalid-page" records for the dropped pages.
417 XLogTruncateRelation(RelFileNode rnode
, ForkNumber forkNum
,
420 forget_invalid_pages(rnode
, forkNum
, nblocks
);