src/backend/access/transam/xlogutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlogutils.c
   4  *
   5  * PostgreSQL transaction log manager utility routines
   6  *
   7  * This file contains support routines that are used by XLOG replay functions.
   8  * None of this code is used during normal system operation.
   9  *
  10  *
  11  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  12  * Portions Copyright (c) 1994, Regents of the University of California
  13  *
  14  * $PostgreSQL$
  15  *
  16  *-------------------------------------------------------------------------
  17  */
  18 #include "postgres.h"
  19
  20 #include "access/xlogutils.h"
  21 #include "storage/bufmgr.h"
  22 #include "storage/smgr.h"
  23 #include "utils/hsearch.h"
  24 #include "utils/rel.h"
  25
  26
  27 /*
  28  * During XLOG replay, we may see XLOG records for incremental updates of
  29  * pages that no longer exist, because their relation was later dropped or
  30  * truncated.  (Note: this is only possible when full_page_writes = OFF,
  31  * since when it's ON, the first reference we see to a page should always
  32  * be a full-page rewrite not an incremental update.)  Rather than simply
  33  * ignoring such records, we make a note of the referenced page, and then
  34  * complain if we don't actually see a drop or truncate covering the page
  35  * later in replay.
  36  */
  37 typedef struct xl_invalid_page_key
  38 {
  39         RelFileNode node;                       /* the relation */
  40         ForkNumber      forkno;                 /* the fork number */
  41         BlockNumber blkno;                      /* the page */
  42 } xl_invalid_page_key;
  43
  44 typedef struct xl_invalid_page
  45 {
  46         xl_invalid_page_key key;        /* hash key ... must be first */
  47         bool            present;                /* page existed but contained zeroes */
  48 } xl_invalid_page;
  49
/*
 * Table of invalid-page references seen so far.  It is created by
 * log_invalid_page() the first time it is needed, and destroyed (with this
 * pointer reset to NULL) by XLogCheckInvalidPages().  NULL means no table
 * currently exists.
 */
static HTAB *invalid_page_tab = NULL;
  51
  52
  53 /* Log a reference to an invalid page */
  54 static void
  55 log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
  56                                  bool present)
  57 {
  58         xl_invalid_page_key key;
  59         xl_invalid_page *hentry;
  60         bool            found;
  61
  62         /*
  63          * Log references to invalid pages at DEBUG1 level.  This allows some
  64          * tracing of the cause (note the elog context mechanism will tell us
  65          * something about the XLOG record that generated the reference).
  66          */
  67         if (present)
  68                 elog(DEBUG1, "page %u of relation %u/%u/%u/%u is uninitialized",
  69                          blkno, node.spcNode, node.dbNode, node.relNode, forkno);
  70         else
  71                 elog(DEBUG1, "page %u of relation %u/%u/%u/%u does not exist",
  72                          blkno, node.spcNode, node.dbNode, node.relNode, forkno);
  73
  74         if (invalid_page_tab == NULL)
  75         {
  76                 /* create hash table when first needed */
  77                 HASHCTL         ctl;
  78
  79                 memset(&ctl, 0, sizeof(ctl));
  80                 ctl.keysize = sizeof(xl_invalid_page_key);
  81                 ctl.entrysize = sizeof(xl_invalid_page);
  82                 ctl.hash = tag_hash;
  83
  84                 invalid_page_tab = hash_create("XLOG invalid-page table",
  85                                                                            100,
  86                                                                            &ctl,
  87                                                                            HASH_ELEM | HASH_FUNCTION);
  88         }
  89
  90         /* we currently assume xl_invalid_page_key contains no padding */
  91         key.node = node;
  92         key.forkno = forkno;
  93         key.blkno = blkno;
  94         hentry = (xl_invalid_page *)
  95                 hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
  96
  97         if (!found)
  98         {
  99                 /* hash_search already filled in the key */
 100                 hentry->present = present;
 101         }
 102         else
 103         {
 104                 /* repeat reference ... leave "present" as it was */
 105         }
 106 }
 107
 108 /* Forget any invalid pages >= minblkno, because they've been dropped */
 109 static void
 110 forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
 111 {
 112         HASH_SEQ_STATUS status;
 113         xl_invalid_page *hentry;
 114
 115         if (invalid_page_tab == NULL)
 116                 return;                                 /* nothing to do */
 117
 118         hash_seq_init(&status, invalid_page_tab);
 119
 120         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 121         {
 122                 if (RelFileNodeEquals(hentry->key.node, node) &&
 123                         hentry->key.forkno == forkno &&
 124                         hentry->key.blkno >= minblkno)
 125                 {
 126                         elog(DEBUG2, "page %u of relation %u/%u/%u/%u has been dropped",
 127                                  hentry->key.blkno, hentry->key.node.spcNode,
 128                                  hentry->key.node.dbNode, hentry->key.node.relNode, forkno);
 129
 130                         if (hash_search(invalid_page_tab,
 131                                                         (void *) &hentry->key,
 132                                                         HASH_REMOVE, NULL) == NULL)
 133                                 elog(ERROR, "hash table corrupted");
 134                 }
 135         }
 136 }
 137
 138 /* Forget any invalid pages in a whole database */
 139 static void
 140 forget_invalid_pages_db(Oid dbid)
 141 {
 142         HASH_SEQ_STATUS status;
 143         xl_invalid_page *hentry;
 144
 145         if (invalid_page_tab == NULL)
 146                 return;                                 /* nothing to do */
 147
 148         hash_seq_init(&status, invalid_page_tab);
 149
 150         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 151         {
 152                 if (hentry->key.node.dbNode == dbid)
 153                 {
 154                         elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
 155                                  hentry->key.blkno, hentry->key.node.spcNode,
 156                                  hentry->key.node.dbNode, hentry->key.node.relNode);
 157
 158                         if (hash_search(invalid_page_tab,
 159                                                         (void *) &hentry->key,
 160                                                         HASH_REMOVE, NULL) == NULL)
 161                                 elog(ERROR, "hash table corrupted");
 162                 }
 163         }
 164 }
 165
 166 /* Complain about any remaining invalid-page entries */
 167 void
 168 XLogCheckInvalidPages(void)
 169 {
 170         HASH_SEQ_STATUS status;
 171         xl_invalid_page *hentry;
 172         bool            foundone = false;
 173
 174         if (invalid_page_tab == NULL)
 175                 return;                                 /* nothing to do */
 176
 177         hash_seq_init(&status, invalid_page_tab);
 178
 179         /*
 180          * Our strategy is to emit WARNING messages for all remaining entries and
 181          * only PANIC after we've dumped all the available info.
 182          */
 183         while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
 184         {
 185                 if (hentry->present)
 186                         elog(WARNING, "page %u of relation %u/%u/%u was uninitialized",
 187                                  hentry->key.blkno, hentry->key.node.spcNode,
 188                                  hentry->key.node.dbNode, hentry->key.node.relNode);
 189                 else
 190                         elog(WARNING, "page %u of relation %u/%u/%u did not exist",
 191                                  hentry->key.blkno, hentry->key.node.spcNode,
 192                                  hentry->key.node.dbNode, hentry->key.node.relNode);
 193                 foundone = true;
 194         }
 195
 196         if (foundone)
 197                 elog(PANIC, "WAL contains references to invalid pages");
 198
 199         hash_destroy(invalid_page_tab);
 200         invalid_page_tab = NULL;
 201 }
 202
 203
 204 /*
 205  * XLogReadBuffer
 206  *              Read a page during XLOG replay
 207  *
 208  * This is functionally comparable to ReadBuffer followed by
 209  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
 210  * and locked buffer.  (Getting the lock is not really necessary, since we
 211  * expect that this is only used during single-process XLOG replay, but
 212  * some subroutines such as MarkBufferDirty will complain if we don't.)
 213  *
 214  * If "init" is true then the caller intends to rewrite the page fully
 215  * using the info in the XLOG record.  In this case we will extend the
 216  * relation if needed to make the page exist, and we will not complain about
 217  * the page being "new" (all zeroes); in fact, we usually will supply a
 218  * zeroed buffer without reading the page at all, so as to avoid unnecessary
 219  * failure if the page is present on disk but has corrupt headers.
 220  *
 221  * If "init" is false then the caller needs the page to be valid already.
 222  * If the page doesn't exist or contains zeroes, we return InvalidBuffer.
 223  * In this case the caller should silently skip the update on this page.
 224  * (In this situation, we expect that the page was later dropped or truncated.
 225  * If we don't see evidence of that later in the WAL sequence, we'll complain
 226  * at the end of WAL replay.)
 227  */
 228 Buffer
 229 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
 230 {
 231         return XLogReadBufferWithFork(rnode, MAIN_FORKNUM, blkno, init);
 232 }
 233
 234 /*
 235  * XLogReadBufferWithFork
 236  *              Like XLogReadBuffer, but for reading other relation forks than
 237  *              the main one.
 238  */
 239 Buffer
 240 XLogReadBufferWithFork(RelFileNode rnode, ForkNumber forknum,
 241                                            BlockNumber blkno, bool init)
 242 {
 243         BlockNumber lastblock;
 244         Buffer          buffer;
 245         SMgrRelation smgr;
 246
 247         Assert(blkno != P_NEW);
 248
 249         /* Open the relation at smgr level */
 250         smgr = smgropen(rnode);
 251
 252         /*
 253          * Create the target file if it doesn't already exist.  This lets us cope
 254          * if the replay sequence contains writes to a relation that is later
 255          * deleted.  (The original coding of this routine would instead suppress
 256          * the writes, but that seems like it risks losing valuable data if the
 257          * filesystem loses an inode during a crash.  Better to write the data
 258          * until we are actually told to delete the file.)
 259          */
 260         smgrcreate(smgr, forknum, false, true);
 261
 262         lastblock = smgrnblocks(smgr, forknum);
 263
 264         if (blkno < lastblock)
 265         {
 266                 /* page exists in file */
 267                 buffer = ReadBufferWithoutRelcache(rnode, false, forknum, blkno, init);
 268         }
 269         else
 270         {
 271                 /* hm, page doesn't exist in file */
 272                 if (!init)
 273                 {
 274                         log_invalid_page(rnode, forknum, blkno, false);
 275                         return InvalidBuffer;
 276                 }
 277                 /* OK to extend the file */
 278                 /* we do this in recovery only - no rel-extension lock needed */
 279                 Assert(InRecovery);
 280                 buffer = InvalidBuffer;
 281                 while (blkno >= lastblock)
 282                 {
 283                         if (buffer != InvalidBuffer)
 284                                 ReleaseBuffer(buffer);
 285                         buffer = ReadBufferWithoutRelcache(rnode, false, forknum,
 286                                                                                            P_NEW, false);
 287                         lastblock++;
 288                 }
 289                 Assert(BufferGetBlockNumber(buffer) == blkno);
 290         }
 291
 292         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 293
 294         if (!init)
 295         {
 296                 /* check that page has been initialized */
 297                 Page            page = (Page) BufferGetPage(buffer);
 298
 299                 if (PageIsNew(page))
 300                 {
 301                         UnlockReleaseBuffer(buffer);
 302                         log_invalid_page(rnode, forknum, blkno, true);
 303                         return InvalidBuffer;
 304                 }
 305         }
 306
 307         return buffer;
 308 }
 309
 310
 311 /*
 312  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
 313  * return type is Relation.
 314  */
 315 typedef struct
 316 {
 317         RelationData            reldata;        /* Note: this must be first */
 318         FormData_pg_class       pgc;
 319 } FakeRelCacheEntryData;
 320
 321 typedef FakeRelCacheEntryData *FakeRelCacheEntry;
 322
 323 /*
 324  * Create a fake relation cache entry for a physical relation
 325  *
 326  * It's often convenient to use the same functions in XLOG replay as in the
 327  * main codepath, but those functions typically work with a relcache entry.
 328  * We don't have a working relation cache during XLOG replay, but this
 329  * function can be used to create a fake relcache entry instead. Only the
 330  * fields related to physical storage, like rd_rel, are initialized, so the
 331  * fake entry is only usable in low-level operations like ReadBuffer().
 332  *
 333  * Caller must free the returned entry with FreeFakeRelcacheEntry().
 334  */
 335 Relation
 336 CreateFakeRelcacheEntry(RelFileNode rnode)
 337 {
 338         FakeRelCacheEntry fakeentry;
 339         Relation rel;
 340
 341         /* Allocate the Relation struct and all related space in one block. */
 342         fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
 343         rel = (Relation) fakeentry;
 344
 345         rel->rd_rel = &fakeentry->pgc;
 346         rel->rd_node = rnode;
 347
 348         /* We don't know the name of the relation; use relfilenode instead */
 349         sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
 350
 351         /*
 352          * We set up the lockRelId in case anything tries to lock the dummy
 353          * relation.  Note that this is fairly bogus since relNode may be
 354          * different from the relation's OID.  It shouldn't really matter
 355          * though, since we are presumably running by ourselves and can't have
 356          * any lock conflicts ...
 357          */
 358         rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
 359         rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
 360
 361         rel->rd_targblock = InvalidBlockNumber;
 362         rel->rd_smgr = NULL;
 363
 364         return rel;
 365 }
 366
 367 /*
 368  * Free a fake relation cache entry.
 369  */
 370 void
 371 FreeFakeRelcacheEntry(Relation fakerel)
 372 {
 373         pfree(fakerel);
 374 }
 375
 376 /*
 377  * Drop a relation during XLOG replay
 378  *
 379  * This is called when the relation is about to be deleted; we need to remove
 380  * any open "invalid-page" records for the relation.
 381  */
 382 void
 383 XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
 384 {
 385         forget_invalid_pages(rnode, forknum, 0);
 386 }
 387
 388 /*
 389  * Drop a whole database during XLOG replay
 390  *
 391  * As above, but for DROP DATABASE instead of dropping a single rel
 392  */
 393 void
 394 XLogDropDatabase(Oid dbid)
 395 {
 396         /*
 397          * This is unnecessarily heavy-handed, as it will close SMgrRelation
 398          * objects for other databases as well. DROP DATABASE occurs seldom
 399          * enough that it's not worth introducing a variant of smgrclose for
 400          * just this purpose. XXX: Or should we rather leave the smgr entries
 401          * dangling?
 402          */
 403         smgrcloseall();
 404
 405         forget_invalid_pages_db(dbid);
 406 }
 407
 408 /*
 409  * Truncate a relation during XLOG replay
 410  *
 411  * We need to clean up any open "invalid-page" records for the dropped pages.
 412  */
 413 void
 414 XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 415                                          BlockNumber nblocks)
 416 {
 417         forget_invalid_pages(rnode, forkNum, nblocks);
 418 }