src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <sys/wait.h>
  23 #include <unistd.h>
  24
  25 #include "access/clog.h"
  26 #include "access/multixact.h"
  27 #include "access/subtrans.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogutils.h"
  34 #include "catalog/catversion.h"
  35 #include "catalog/pg_control.h"
  36 #include "catalog/pg_type.h"
  37 #include "funcapi.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "storage/bufmgr.h"
  42 #include "storage/fd.h"
  43 #include "storage/ipc.h"
  44 #include "storage/pmsignal.h"
  45 #include "storage/procarray.h"
  46 #include "storage/smgr.h"
  47 #include "storage/spin.h"
  48 #include "utils/builtins.h"
  49 #include "utils/guc.h"
  50 #include "utils/ps_status.h"
  51
  52
  53 /* Names of the backup-label and recovery-command files (all relative to $PGDATA) */
  54 #define BACKUP_LABEL_FILE               "backup_label"
  55 #define BACKUP_LABEL_OLD                "backup_label.old"      /* NOTE(review): presumably the label file after it has been used and renamed -- confirm */
  56 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  57 #define RECOVERY_COMMAND_DONE   "recovery.done" /* NOTE(review): presumably recovery.conf renamed once recovery is over -- confirm */
  58
  59
  60 /* User-settable parameters; the initializers below are only the compiled-in starting values */
  61 int                     CheckPointSegments = 3; /* also determines XLOGfileslop */
  62 int                     XLOGbuffers = 8;
  63 int                     XLogArchiveTimeout = 0;
  64 bool            XLogArchiveMode = false;
  65 char       *XLogArchiveCommand = NULL;
  66 bool            fullPageWrites = true;  /* read by XLogInsert when deciding on full-page writes */
  67 bool            log_checkpoints = false;
  68 int             sync_method = DEFAULT_SYNC_METHOD;      /* presumably one of the SYNC_METHOD_xxx codes in sync_method_options -- confirm */
  69
  70 #ifdef WAL_DEBUG
  71 bool            XLOG_DEBUG = false;     /* exists only in builds that define WAL_DEBUG */
  72 #endif
  73
  74 /*
  75  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  76  * When we are done with an old XLOG segment file, we will recycle it as a
  77  * future XLOG segment as long as there aren't already XLOGfileslop future
  78  * segments; else we'll delete it.  This could be made a separate GUC
  79  * variable, but at present I think it's sufficient to hardwire it as
  80  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  81  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  82  * of them; the +1 allows boundary cases to happen without wasting a
  83  * delete/create-segment cycle.  (As a macro, it always tracks CheckPointSegments.)
  84  */
  85 #define XLOGfileslop    (2*CheckPointSegments + 1)
  86
  87 /*
  88  * GUC support: sync-method names paired with their SYNC_METHOD_xxx codes.
  89  */
  90 const struct config_enum_entry sync_method_options[] = {
  91         {"fsync", SYNC_METHOD_FSYNC, false},    /* the only entry that is not conditionally compiled */
  92 #ifdef HAVE_FSYNC_WRITETHROUGH
  93         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  94 #endif
  95 #ifdef HAVE_FDATASYNC
  96         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  97 #endif
  98 #ifdef OPEN_SYNC_FLAG
  99         {"open_sync", SYNC_METHOD_OPEN, false},
 100 #endif
 101 #ifdef OPEN_DATASYNC_FLAG
 102         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 103 #endif
 104         {NULL, 0, false}        /* NULL-name entry; presumably the end-of-list marker -- confirm */
 105 };
 106
 107 /*
 108  * Statistics for the current checkpoint are collected in this global struct.
 109  * Because only the background writer or a stand-alone backend can perform
 110  * checkpoints, this will be unused in normal backends.
 111  */
 112 CheckpointStatsData CheckpointStats;
 113
 114 /*
 115  * ThisTimeLineID will be the same in all backends --- it identifies the
 116  * current WAL timeline for the database system (but see the recovery notes below).
 117  */
 118 TimeLineID      ThisTimeLineID = 0;
 119
 120 /* Are we doing recovery from XLOG? */
 121 bool            InRecovery = false;
 122
 123 /* Are we recovering using offline XLOG archives? */
 124 static bool InArchiveRecovery = false;
 125
 126 /* Was the last xlog file restored from archive (true), or found locally (false)? */
 127 static bool restoredFromArchive = false;
 128
 129 /* options taken from recovery.conf */
 130 static char *recoveryRestoreCommand = NULL;
 131 static bool recoveryTarget = false;
 132 static bool recoveryTargetExact = false;
 133 static bool recoveryTargetInclusive = true;
 134 static bool recoveryLogRestartpoints = false;
 135 static TransactionId recoveryTargetXid; /* no initializer; presumably meaningful only when a target is set -- confirm */
 136 static TimestampTz recoveryTargetTime;  /* no initializer; presumably meaningful only when a target is set -- confirm */
 137 static TimestampTz recoveryLastXTime = 0;       /* NOTE(review): looks like running state, not a recovery.conf option -- confirm */
 138
 139 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 140 static TransactionId recoveryStopXid;
 141 static TimestampTz recoveryStopTime;
 142 static bool recoveryStopAfter;  /* NOTE(review): presumably "stopped after, not before, that record" -- confirm */
 143
 144 /*
 145  * During normal operation, the only timeline we care about is ThisTimeLineID.
 146  * During recovery, however, things are more complicated.  To simplify life
 147  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 148  * scan through the WAL history (that is, it is the line that was active when
 149  * the currently-scanned WAL record was generated).  We also need these
 150  * timeline values:
 151  *
 152  * recoveryTargetTLI: the desired timeline that we want to end in.
 153  *
 154  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 155  * its known parents, newest first (so recoveryTargetTLI is always the
 156  * first list member).  Only these TLIs are expected to be seen in the WAL
 157  * segments we read, and indeed only these TLIs will be considered as
 158  * candidate WAL files to open at all.
 159  *
 160  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 161  * (This is not necessarily the same as ThisTimeLineID, because we could
 162  * be scanning data that was copied from an ancestor timeline when the current
 163  * file was created.)  During a sequential scan we do not allow this value
 164  * to decrease.
 165  */
 166 static TimeLineID recoveryTargetTLI;    /* timeline we want to end in */
 167 static List *expectedTLIs;      /* integer list, newest TLI first */
 168 static TimeLineID curFileTLI;   /* TLI from the current input WAL file's name */
 169
 170 /*
 171  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 172  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 173  * end+1 of the last record, and is reset when we end a top-level transaction,
 174  * or start a new one; so it can be used to tell if the current transaction has
 175  * created any XLOG records.
 176  */
 177 static XLogRecPtr ProcLastRecPtr = {0, 0};      /* private to this file */
 178
 179 XLogRecPtr      XactLastRecEnd = {0, 0};        /* not static: visible to other files */
 180
 181 /*
 182  * RedoRecPtr is this backend's local copy of the REDO record pointer
 183  * (which is almost but not quite the same as a pointer to the most recent
 184  * CHECKPOINT record).  We update this from the shared-memory copy,
 185  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 186  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 187  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 188  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 189  * InitXLOGAccess.
 190  */
 191 static XLogRecPtr RedoRecPtr;   /* may lag the shared copy; XLogInsert refreshes it under WALInsertLock */
 192
 193 /*----------
 194  * Shared-memory data structures for XLOG control
 195  *
 196  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 197  * the log up to (all records before that point must be written or fsynced).
 198  * LogwrtResult indicates the byte positions we have already written/fsynced.
 199  * These structs are identical but are declared separately to indicate their
 200  * slightly different functions.
 201  *
 202  * We do a lot of pushups to minimize the amount of access to lockable
 203  * shared memory values.  There are actually three shared-memory copies of
 204  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 205  *              XLogCtl->LogwrtResult is protected by info_lck
 206  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 207  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 208  * One must hold the associated lock to read or write any of these, but
 209  * of course no lock is needed to read/write the unshared LogwrtResult.
 210  *
 211  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 212  * right", since both are updated by a write or flush operation before
 213  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 214  * is that it can be examined/modified by code that already holds WALWriteLock
 215  * without needing to grab info_lck as well.
 216  *
 217  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 218  * but is updated when convenient.  Again, it exists for the convenience of
 219  * code that is already holding WALInsertLock but not the other locks.
 220  *
 221  * The unshared LogwrtResult may lag behind any or all of these, and again
 222  * is updated when convenient.
 223  *
 224  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 225  * (protected by info_lck), but we don't need to cache any copies of it.
 226  *
 227  * Note that this all works because the request and result positions can only
 228  * advance forward, never back up, and so we can easily determine which of two
 229  * values is "more up to date".
 230  *
 231  * info_lck is only held long enough to read/update the protected variables,
 232  * so it's a plain spinlock.  The other locks are held longer (potentially
 233  * over I/O operations), so we use LWLocks for them.  These locks are:
 234  *
 235  * WALInsertLock: must be held to insert a record into the WAL buffers.
 236  *
 237  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 238  * XLogFlush).
 239  *
 240  * ControlFileLock: must be held to read/update control file or create
 241  * new log file.
 242  *
 243  * CheckpointLock: must be held to do a checkpoint (ensures only one
 244  * checkpointer at a time; currently, with all checkpoints done by the
 245  * bgwriter, this is just pro forma).
 246  *
 247  *----------
 248  */
 249
 250 typedef struct XLogwrtRqst              /* positions the log must be written/fsynced up to */
 251 {
 252         XLogRecPtr      Write;                  /* request: last byte + 1 to write out */
 253         XLogRecPtr      Flush;                  /* request: last byte + 1 to flush */
 254 } XLogwrtRqst;
 255
 256 typedef struct XLogwrtResult    /* positions already written/fsynced; same layout as XLogwrtRqst */
 257 {
 258         XLogRecPtr      Write;                  /* result: last byte + 1 written out */
 259         XLogRecPtr      Flush;                  /* result: last byte + 1 flushed */
 260 } XLogwrtResult;
 261
 262 /*
 263  * Shared state data for XLogInsert (the Insert member of XLogCtlData; protected by WALInsertLock).
 264  */
 265 typedef struct XLogCtlInsert
 266 {
 267         XLogwrtResult LogwrtResult; /* a recent, possibly lagging, value of LogwrtResult */
 268         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 269         int                     curridx;                /* current block index in cache (see PrevBufIdx/NextBufIdx) */
 270         XLogPageHeader currpage;        /* points to header of block in cache; INSERT_FREESPACE treats it as the block start */
 271         char       *currpos;            /* current insertion point in cache, inside currpage's block */
 272         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; backends copy it into their local RedoRecPtr */
 273         bool            forcePageWrites;        /* forcing full-page writes for PITR? (ORed with fullPageWrites in XLogInsert) */
 274 } XLogCtlInsert;
 275
 276 /*
 277  * Shared state data for XLogWrite/XLogFlush (the Write member of XLogCtlData; protected by WALWriteLock).
 278  */
 279 typedef struct XLogCtlWrite
 280 {
 281         XLogwrtResult LogwrtResult; /* current ("always right") value of LogwrtResult */
 282         int                     curridx;                /* cache index of next block to write */
 283         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
 284 } XLogCtlWrite;
 285
 286 /*
 287  * Total shared-memory state for XLOG; this file reaches it through the XLogCtl pointer.
 288  */
 289 typedef struct XLogCtlData
 290 {
 291         /* Protected by WALInsertLock: */
 292         XLogCtlInsert Insert;
 293
 294         /* Protected by info_lck: */
 295         XLogwrtRqst LogwrtRqst;
 296         XLogwrtResult LogwrtResult;
 297         uint32          ckptXidEpoch;   /* epoch that goes with ckptXid (latest checkpoint) */
 298         TransactionId ckptXid;          /* nextXID of latest checkpoint */
 299         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
 300
 301         /* Protected by WALWriteLock: */
 302         XLogCtlWrite Write;
 303
 304         /*
 305          * These values do not change after startup, although the pointed-to pages
 306          * and xlblocks values certainly do.  Permission to read/write the pages
 307          * and xlblocks values depends on WALInsertLock and WALWriteLock.
 308          */
 309         char       *pages;                      /* buffers for unwritten XLOG pages */
 310         XLogRecPtr *xlblocks;           /* per buffer: position of its page's 1st byte + XLOG_BLCKSZ */
 311         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 312         TimeLineID      ThisTimeLineID; /* NOTE(review): presumably the shared copy of the global ThisTimeLineID -- confirm */
 313
 314         slock_t         info_lck;               /* spinlock for the "Protected by info_lck" fields above */
 315 } XLogCtlData;
 316
 317 static XLogCtlData *XLogCtl = NULL;     /* starts out NULL; presumably pointed at the shared struct during startup -- confirm */
 318
 319 /*
 320  * We maintain an image of pg_control in shared memory (per the locking notes, guarded by ControlFileLock).
 321  */
 322 static ControlFileData *ControlFile = NULL;
 323
 324 /*
 325  * Helper macros for XLogInsert bookkeeping.  Callers usually hold their own
 326  * local copies of XLogCtl->Insert and/or XLogCtl->Insert.curridx already,
 327  * so those are taken as arguments rather than being re-fetched via XLogCtl.
 328  */
 329
 330 /* Bytes still unused in the current xlog page buffer: page end minus insert position */
 331 #define INSERT_FREESPACE(Insert)  \
 332         (((char *) (Insert)->currpage + XLOG_BLCKSZ) - (Insert)->currpos)
 333
 334 /* Fill recptr with the XLogRecPtr of the current insertion point */
 335 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 336         ( \
 337           (recptr).xlogid = (XLogCtl->xlblocks + (curridx))->xlogid, \
 338           (recptr).xrecoff = \
 339                 (XLogCtl->xlblocks + (curridx))->xrecoff - INSERT_FREESPACE(Insert) \
 340         )
 341
 342 #define PrevBufIdx(idx)         \
 343                 (((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)
 344
 345 #define NextBufIdx(idx)         \
 346                 (((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
 347
 348 /*
 349  * Private, possibly out-of-date copy of shared LogwrtResult.
 350  * See the discussion of the LogwrtResult copies above.
 351  */
 352 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 353
 354 /*
 355  * openLogFile is -1 or a kernel FD for an open log file segment.
 356  * When it's open, openLogOff is the current seek offset in the file.
 357  * openLogId/openLogSeg identify the segment.  These variables are only
 358  * used to write the XLOG, and so will normally refer to the active segment.
 359  */
 360 static int      openLogFile = -1;
 361 static uint32 openLogId = 0;
 362 static uint32 openLogSeg = 0;
 363 static uint32 openLogOff = 0;
 364
 365 /*
 366  * These variables are used similarly to the ones above, but for reading
 367  * the XLOG.  Note, however, that readOff generally represents the offset
 368  * of the page just read, not the seek position of the FD itself, which
 369  * will be just past that page.
 370  */
 371 static int      readFile = -1;
 372 static uint32 readId = 0;
 373 static uint32 readSeg = 0;
 374 static uint32 readOff = 0;      /* offset of the page just read, not the FD's seek position */
 375
 376 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 377 static char *readBuf = NULL;
 378
 379 /* Buffer for current ReadRecord result (expandable) */
 380 static char *readRecordBuf = NULL;
 381 static uint32 readRecordBufSize = 0;    /* presumably the allocated size of readRecordBuf -- confirm */
 382
 383 /* State information for XLOG reading */
 384 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 385 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 386 static XLogRecord *nextRecord = NULL;   /* NOTE(review): presumably the record following the last one read, if already available -- confirm */
 387 static TimeLineID lastPageTLI = 0;      /* NOTE(review): presumably the TLI seen on the previously read page -- confirm */
 388
 389 static bool InRedo = false;     /* NOTE(review): presumably true while WAL records are being replayed -- confirm at use sites */
 390
 391
 392 static void XLogArchiveNotify(const char *xlog);
 393 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 394 static bool XLogArchiveCheckDone(const char *xlog);
 395 static bool XLogArchiveIsBusy(const char *xlog);
 396 static void XLogArchiveCleanup(const char *xlog);
 397 static void readRecoveryCommandFile(void);
 398 static void exitArchiveRecovery(TimeLineID endTLI,
 399                                         uint32 endLogId, uint32 endLogSeg);
 400 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 401 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 402
 403 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 404                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 405 static bool AdvanceXLInsertBuffer(bool new_segment);
 406 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 407 static int XLogFileInit(uint32 log, uint32 seg,
 408                          bool *use_existent, bool use_lock);
 409 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 410                                            bool find_free, int *max_advance,
 411                                            bool use_lock);
 412 static int      XLogFileOpen(uint32 log, uint32 seg);
 413 static int      XLogFileRead(uint32 log, uint32 seg, int emode);
 414 static void XLogFileClose(void);
 415 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 416                                         const char *recovername, off_t expectedSize);
 417 static void PreallocXlogFiles(XLogRecPtr endptr);
 418 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 419 static void ValidateXLOGDirectoryStructure(void);
 420 static void CleanupBackupHistory(void);
 421 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 422 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 423 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 424 static List *readTimeLineHistory(TimeLineID targetTLI);
 425 static bool existsTimeLineHistory(TimeLineID probeTLI);
 426 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 427 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 428                                          TimeLineID endTLI,
 429                                          uint32 endLogId, uint32 endLogSeg);
 430 static void WriteControlFile(void);
 431 static void ReadControlFile(void);
 432 static char *str_time(pg_time_t tnow);
 433 #ifdef WAL_DEBUG
 434 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 435 #endif
 436 static void issue_xlog_fsync(void);
 437 static void pg_start_backup_callback(int code, Datum arg);
 438 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 439                                   XLogRecPtr *minRecoveryLoc);
 440 static void rm_redo_error_callback(void *arg);
 441 static int get_sync_bit(int method);
 442
 443
 444 /*
 445  * Insert an XLOG record having the specified RMID and info bytes,
 446  * with the body of the record being the data chunk(s) described by
 447  * the rdata chain (see xlog.h for notes about rdata).
 448  *
 449  * Returns XLOG pointer to end of record (beginning of next record).
 450  * This can be used as LSN for data pages affected by the logged action.
 451  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 452  * before the data page can be written out.  This implements the basic
 453  * WAL rule "write the log before the data".)
 454  *
 455  * NB: this routine feels free to scribble on the XLogRecData structs,
 456  * though not on the data they reference.  This is OK since the XLogRecData
 457  * structs are always just temporaries in the calling code.
 458  */
 459 XLogRecPtr
 460 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 461 {
 462         XLogCtlInsert *Insert = &XLogCtl->Insert;
 463         XLogRecord *record;
 464         XLogContRecord *contrecord;
 465         XLogRecPtr      RecPtr;
 466         XLogRecPtr      WriteRqst;
 467         uint32          freespace;
 468         int                     curridx;
 469         XLogRecData *rdt;
 470         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 471         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 472         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 473         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 474         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 475         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 476         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 477         pg_crc32        rdata_crc;
 478         uint32          len,
 479                                 write_len;
 480         unsigned        i;
 481         bool            updrqst;
 482         bool            doPageWrites;
 483         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 484
 485         /* info's high bits are reserved for use by me */
 486         if (info & XLR_INFO_MASK)
 487                 elog(PANIC, "invalid xlog info mask %02X", info);
 488
 489         /*
 490          * In bootstrap mode, we don't actually log anything but XLOG resources;
 491          * return a phony record pointer.
 492          */
 493         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 494         {
 495                 RecPtr.xlogid = 0;
 496                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 497                 return RecPtr;
 498         }
 499
 500         /*
 501          * Here we scan the rdata chain, determine which buffers must be backed
 502          * up, and compute the CRC values for the data.  Note that the record
 503          * header isn't added into the CRC initially since we don't know the final
 504          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 505          * the whole record in the order "rdata, then backup blocks, then record
 506          * header".
 507          *
 508          * We may have to loop back to here if a race condition is detected below.
 509          * We could prevent the race by doing all this work while holding the
 510          * insert lock, but it seems better to avoid doing CRC calculations while
 511          * holding the lock.  This means we have to be careful about modifying the
 512          * rdata chain until we know we aren't going to loop back again.  The only
 513          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 514          * chain items we have decided we will have to back up the whole buffer
 515          * for.  This is OK because we will certainly decide the same thing again
 516          * for those items if we do it over; doing it here saves an extra pass
 517          * over the chain later.
 518          */
 519 begin:;
 520         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 521         {
 522                 dtbuf[i] = InvalidBuffer;
 523                 dtbuf_bkp[i] = false;
 524         }
 525
 526         /*
 527          * Decide if we need to do full-page writes in this XLOG record: true if
 528          * full_page_writes is on or we have a PITR request for it.  Since we
 529          * don't yet have the insert lock, forcePageWrites could change under us,
 530          * but we'll recheck it once we have the lock.
 531          */
 532         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 533
 534         INIT_CRC32(rdata_crc);
 535         len = 0;
 536         for (rdt = rdata;;)
 537         {
 538                 if (rdt->buffer == InvalidBuffer)
 539                 {
 540                         /* Simple data, just include it */
 541                         len += rdt->len;
 542                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 543                 }
 544                 else
 545                 {
 546                         /* Find info for buffer */
 547                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 548                         {
 549                                 if (rdt->buffer == dtbuf[i])
 550                                 {
 551                                         /* Buffer already referenced by earlier chain item */
 552                                         if (dtbuf_bkp[i])
 553                                                 rdt->data = NULL;
 554                                         else if (rdt->data)
 555                                         {
 556                                                 len += rdt->len;
 557                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 558                                         }
 559                                         break;
 560                                 }
 561                                 if (dtbuf[i] == InvalidBuffer)
 562                                 {
 563                                         /* OK, put it in this slot */
 564                                         dtbuf[i] = rdt->buffer;
 565                                         if (XLogCheckBuffer(rdt, doPageWrites,
 566                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 567                                         {
 568                                                 dtbuf_bkp[i] = true;
 569                                                 rdt->data = NULL;
 570                                         }
 571                                         else if (rdt->data)
 572                                         {
 573                                                 len += rdt->len;
 574                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 575                                         }
 576                                         break;
 577                                 }
 578                         }
 579                         if (i >= XLR_MAX_BKP_BLOCKS)
 580                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 581                                          XLR_MAX_BKP_BLOCKS);
 582                 }
 583                 /* Break out of loop when rdt points to last chain item */
 584                 if (rdt->next == NULL)
 585                         break;
 586                 rdt = rdt->next;
 587         }
 588
 589         /*
 590          * Now add the backup block headers and data into the CRC
 591          */
 592         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 593         {
 594                 if (dtbuf_bkp[i])
 595                 {
 596                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 597                         char       *page;
 598
 599                         COMP_CRC32(rdata_crc,
 600                                            (char *) bkpb,
 601                                            sizeof(BkpBlock));
 602                         page = (char *) BufferGetBlock(dtbuf[i]);
 603                         if (bkpb->hole_length == 0)
 604                         {
 605                                 COMP_CRC32(rdata_crc,
 606                                                    page,
 607                                                    BLCKSZ);
 608                         }
 609                         else
 610                         {
 611                                 /* must skip the hole */
 612                                 COMP_CRC32(rdata_crc,
 613                                                    page,
 614                                                    bkpb->hole_offset);
 615                                 COMP_CRC32(rdata_crc,
 616                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 617                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 618                         }
 619                 }
 620         }
 621
 622         /*
 623          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 624          * error checking in ReadRecord.  This means that all callers of
 625          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 626          * make an exception for XLOG SWITCH records because we don't want them to
 627          * ever cross a segment boundary.
 628          */
 629         if (len == 0 && !isLogSwitch)
 630                 elog(PANIC, "invalid xlog record length %u", len);
 631
 632         START_CRIT_SECTION();
 633
 634         /* Now wait to get insert lock */
 635         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 636
 637         /*
 638          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 639          * back and recompute everything.  This can only happen just after a
 640          * checkpoint, so it's better to be slow in this case and fast otherwise.
 641          *
 642          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 643          * affect the contents of the XLOG record, so we'll update our local copy
 644          * but not force a recomputation.
 645          */
 646         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 647         {
 648                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 649                 RedoRecPtr = Insert->RedoRecPtr;
 650
 651                 if (doPageWrites)
 652                 {
 653                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 654                         {
 655                                 if (dtbuf[i] == InvalidBuffer)
 656                                         continue;
 657                                 if (dtbuf_bkp[i] == false &&
 658                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 659                                 {
 660                                         /*
 661                                          * Oops, this buffer now needs to be backed up, but we
 662                                          * didn't think so above.  Start over.
 663                                          */
 664                                         LWLockRelease(WALInsertLock);
 665                                         END_CRIT_SECTION();
 666                                         goto begin;
 667                                 }
 668                         }
 669                 }
 670         }
 671
 672         /*
 673          * Also check to see if forcePageWrites was just turned on; if we weren't
 674          * already doing full-page writes then go back and recompute. (If it was
 675          * just turned off, we could recompute the record without full pages, but
 676          * we choose not to bother.)
 677          */
 678         if (Insert->forcePageWrites && !doPageWrites)
 679         {
 680                 /* Oops, must redo it with full-page data */
 681                 LWLockRelease(WALInsertLock);
 682                 END_CRIT_SECTION();
 683                 goto begin;
 684         }
 685
 686         /*
 687          * Make additional rdata chain entries for the backup blocks, so that we
 688          * don't need to special-case them in the write loop.  Note that we have
 689          * now irrevocably changed the input rdata chain.  At the exit of this
 690          * loop, write_len includes the backup block data.
 691          *
 692          * Also set the appropriate info bits to show which buffers were backed
 693          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 694          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 695          */
 696         write_len = len;
 697         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 698         {
 699                 BkpBlock   *bkpb;
 700                 char       *page;
 701
 702                 if (!dtbuf_bkp[i])
 703                         continue;
 704
 705                 info |= XLR_SET_BKP_BLOCK(i);
 706
 707                 bkpb = &(dtbuf_xlg[i]);
 708                 page = (char *) BufferGetBlock(dtbuf[i]);
 709
 710                 rdt->next = &(dtbuf_rdt1[i]);
 711                 rdt = rdt->next;
 712
 713                 rdt->data = (char *) bkpb;
 714                 rdt->len = sizeof(BkpBlock);
 715                 write_len += sizeof(BkpBlock);
 716
 717                 rdt->next = &(dtbuf_rdt2[i]);
 718                 rdt = rdt->next;
 719
 720                 if (bkpb->hole_length == 0)
 721                 {
 722                         rdt->data = page;
 723                         rdt->len = BLCKSZ;
 724                         write_len += BLCKSZ;
 725                         rdt->next = NULL;
 726                 }
 727                 else
 728                 {
 729                         /* must skip the hole */
 730                         rdt->data = page;
 731                         rdt->len = bkpb->hole_offset;
 732                         write_len += bkpb->hole_offset;
 733
 734                         rdt->next = &(dtbuf_rdt3[i]);
 735                         rdt = rdt->next;
 736
 737                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 738                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 739                         write_len += rdt->len;
 740                         rdt->next = NULL;
 741                 }
 742         }
 743
 744         /*
 745          * If we backed up any full blocks and online backup is not in progress,
 746          * mark the backup blocks as removable.  This allows the WAL archiver to
 747          * know whether it is safe to compress archived WAL data by transforming
 748          * full-block records into the non-full-block format.
 749          *
 750          * Note: we could just set the flag whenever !forcePageWrites, but
 751          * defining it like this leaves the info bit free for some potential other
 752          * use in records without any backup blocks.
 753          */
 754         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 755                 info |= XLR_BKP_REMOVABLE;
 756
 757         /*
 758          * If there isn't enough space on the current XLOG page for a record
 759          * header, advance to the next page (leaving the unused space as zeroes).
 760          */
 761         updrqst = false;
 762         freespace = INSERT_FREESPACE(Insert);
 763         if (freespace < SizeOfXLogRecord)
 764         {
 765                 updrqst = AdvanceXLInsertBuffer(false);
 766                 freespace = INSERT_FREESPACE(Insert);
 767         }
 768
 769         /* Compute record's XLOG location */
 770         curridx = Insert->curridx;
 771         INSERT_RECPTR(RecPtr, Insert, curridx);
 772
 773         /*
 774          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 775          * segment, we need not insert it (and don't want to because we'd like
 776          * consecutive switch requests to be no-ops).  Instead, make sure
 777          * everything is written and flushed through the end of the prior segment,
 778          * and return the prior segment's end address.
 779          */
 780         if (isLogSwitch &&
 781                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 782         {
 783                 /* We can release insert lock immediately */
 784                 LWLockRelease(WALInsertLock);
 785
 786                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 787                 if (RecPtr.xrecoff == 0)
 788                 {
 789                         /* crossing a logid boundary */
 790                         RecPtr.xlogid -= 1;
 791                         RecPtr.xrecoff = XLogFileSize;
 792                 }
 793
 794                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 795                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 796                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 797                 {
 798                         XLogwrtRqst FlushRqst;
 799
 800                         FlushRqst.Write = RecPtr;
 801                         FlushRqst.Flush = RecPtr;
 802                         XLogWrite(FlushRqst, false, false);
 803                 }
 804                 LWLockRelease(WALWriteLock);
 805
 806                 END_CRIT_SECTION();
 807
 808                 return RecPtr;
 809         }
 810
 811         /* Insert record header */
 812
 813         record = (XLogRecord *) Insert->currpos;
 814         record->xl_prev = Insert->PrevRecord;
 815         record->xl_xid = GetCurrentTransactionIdIfAny();
 816         record->xl_tot_len = SizeOfXLogRecord + write_len;
 817         record->xl_len = len;           /* doesn't include backup blocks */
 818         record->xl_info = info;
 819         record->xl_rmid = rmid;
 820
 821         /* Now we can finish computing the record's CRC */
 822         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 823                            SizeOfXLogRecord - sizeof(pg_crc32));
 824         FIN_CRC32(rdata_crc);
 825         record->xl_crc = rdata_crc;
 826
 827 #ifdef WAL_DEBUG
 828         if (XLOG_DEBUG)
 829         {
 830                 StringInfoData buf;
 831
 832                 initStringInfo(&buf);
 833                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
 834                                                  RecPtr.xlogid, RecPtr.xrecoff);
 835                 xlog_outrec(&buf, record);
 836                 if (rdata->data != NULL)
 837                 {
 838                         appendStringInfo(&buf, " - ");
 839                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
 840                 }
 841                 elog(LOG, "%s", buf.data);
 842                 pfree(buf.data);
 843         }
 844 #endif
 845
 846         /* Record begin of record in appropriate places */
 847         ProcLastRecPtr = RecPtr;
 848         Insert->PrevRecord = RecPtr;
 849
 850         Insert->currpos += SizeOfXLogRecord;
 851         freespace -= SizeOfXLogRecord;
 852
 853         /*
 854          * Append the data, including backup blocks if any
 855          */
 856         while (write_len)
 857         {
 858                 while (rdata->data == NULL)
 859                         rdata = rdata->next;
 860
 861                 if (freespace > 0)
 862                 {
 863                         if (rdata->len > freespace)
 864                         {
 865                                 memcpy(Insert->currpos, rdata->data, freespace);
 866                                 rdata->data += freespace;
 867                                 rdata->len -= freespace;
 868                                 write_len -= freespace;
 869                         }
 870                         else
 871                         {
 872                                 memcpy(Insert->currpos, rdata->data, rdata->len);
 873                                 freespace -= rdata->len;
 874                                 write_len -= rdata->len;
 875                                 Insert->currpos += rdata->len;
 876                                 rdata = rdata->next;
 877                                 continue;
 878                         }
 879                 }
 880
 881                 /* Use next buffer */
 882                 updrqst = AdvanceXLInsertBuffer(false);
 883                 curridx = Insert->curridx;
 884                 /* Insert cont-record header */
 885                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 886                 contrecord = (XLogContRecord *) Insert->currpos;
 887                 contrecord->xl_rem_len = write_len;
 888                 Insert->currpos += SizeOfXLogContRecord;
 889                 freespace = INSERT_FREESPACE(Insert);
 890         }
 891
 892         /* Ensure next record will be properly aligned */
 893         Insert->currpos = (char *) Insert->currpage +
 894                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 895         freespace = INSERT_FREESPACE(Insert);
 896
 897         /*
 898          * The recptr I return is the beginning of the *next* record. This will be
 899          * stored as LSN for changed data pages...
 900          */
 901         INSERT_RECPTR(RecPtr, Insert, curridx);
 902
 903         /*
 904          * If the record is an XLOG_SWITCH, we must now write and flush all the
 905          * existing data, and then forcibly advance to the start of the next
 906          * segment.  It's not good to do this I/O while holding the insert lock,
 907          * but there seems too much risk of confusion if we try to release the
 908          * lock sooner.  Fortunately xlog switch needn't be a high-performance
 909          * operation anyway...
 910          */
 911         if (isLogSwitch)
 912         {
 913                 XLogCtlWrite *Write = &XLogCtl->Write;
 914                 XLogwrtRqst FlushRqst;
 915                 XLogRecPtr      OldSegEnd;
 916
 917                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 918
 919                 /*
 920                  * Flush through the end of the page containing XLOG_SWITCH, and
 921                  * perform end-of-segment actions (eg, notifying archiver).
 922                  */
 923                 WriteRqst = XLogCtl->xlblocks[curridx];
 924                 FlushRqst.Write = WriteRqst;
 925                 FlushRqst.Flush = WriteRqst;
 926                 XLogWrite(FlushRqst, false, true);
 927
 928                 /* Set up the next buffer as first page of next segment */
 929                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
 930                 (void) AdvanceXLInsertBuffer(true);
 931
 932                 /* There should be no unwritten data */
 933                 curridx = Insert->curridx;
 934                 Assert(curridx == Write->curridx);
 935
 936                 /* Compute end address of old segment */
 937                 OldSegEnd = XLogCtl->xlblocks[curridx];
 938                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
 939                 if (OldSegEnd.xrecoff == 0)
 940                 {
 941                         /* crossing a logid boundary */
 942                         OldSegEnd.xlogid -= 1;
 943                         OldSegEnd.xrecoff = XLogFileSize;
 944                 }
 945
 946                 /* Make it look like we've written and synced all of old segment */
 947                 LogwrtResult.Write = OldSegEnd;
 948                 LogwrtResult.Flush = OldSegEnd;
 949
 950                 /*
 951                  * Update shared-memory status --- this code should match XLogWrite
 952                  */
 953                 {
 954                         /* use volatile pointer to prevent code rearrangement */
 955                         volatile XLogCtlData *xlogctl = XLogCtl;
 956
 957                         SpinLockAcquire(&xlogctl->info_lck);
 958                         xlogctl->LogwrtResult = LogwrtResult;
 959                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
 960                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
 961                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
 962                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
 963                         SpinLockRelease(&xlogctl->info_lck);
 964                 }
 965
 966                 Write->LogwrtResult = LogwrtResult;
 967
 968                 LWLockRelease(WALWriteLock);
 969
 970                 updrqst = false;                /* done already */
 971         }
 972         else
 973         {
 974                 /* normal case, ie not xlog switch */
 975
 976                 /* Need to update shared LogwrtRqst if some block was filled up */
 977                 if (freespace < SizeOfXLogRecord)
 978                 {
 979                         /* curridx is filled and available for writing out */
 980                         updrqst = true;
 981                 }
 982                 else
 983                 {
 984                         /* if updrqst already set, write through end of previous buf */
 985                         curridx = PrevBufIdx(curridx);
 986                 }
 987                 WriteRqst = XLogCtl->xlblocks[curridx];
 988         }
 989
 990         LWLockRelease(WALInsertLock);
 991
 992         if (updrqst)
 993         {
 994                 /* use volatile pointer to prevent code rearrangement */
 995                 volatile XLogCtlData *xlogctl = XLogCtl;
 996
 997                 SpinLockAcquire(&xlogctl->info_lck);
 998                 /* advance global request to include new block(s) */
 999                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1000                         xlogctl->LogwrtRqst.Write = WriteRqst;
1001                 /* update local result copy while I have the chance */
1002                 LogwrtResult = xlogctl->LogwrtResult;
1003                 SpinLockRelease(&xlogctl->info_lck);
1004         }
1005
1006         XactLastRecEnd = RecPtr;
1007
1008         END_CRIT_SECTION();
1009
1010         return RecPtr;
1011 }
1012
1013 /*
1014  * Determine whether the buffer referenced by an XLogRecData item has to
1015  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1016  * save the buffer's LSN at *lsn.
1017  */
1018 static bool
1019 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1020                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1021 {
1022         Page            page;
1023
1024         page = BufferGetPage(rdata->buffer);
1025
1026         /*
1027          * XXX We assume page LSN is first data on *every* page that can be passed
1028          * to XLogInsert, whether it otherwise has the standard page layout or
1029          * not.
1030          */
1031         *lsn = PageGetLSN(page);
1032
1033         if (doPageWrites &&
1034                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1035         {
1036                 /*
1037                  * The page needs to be backed up, so set up *bkpb
1038                  */
1039                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1040
1041                 if (rdata->buffer_std)
1042                 {
1043                         /* Assume we can omit data between pd_lower and pd_upper */
1044                         uint16          lower = ((PageHeader) page)->pd_lower;
1045                         uint16          upper = ((PageHeader) page)->pd_upper;
1046
1047                         if (lower >= SizeOfPageHeaderData &&
1048                                 upper > lower &&
1049                                 upper <= BLCKSZ)
1050                         {
1051                                 bkpb->hole_offset = lower;
1052                                 bkpb->hole_length = upper - lower;
1053                         }
1054                         else
1055                         {
1056                                 /* No "hole" to compress out */
1057                                 bkpb->hole_offset = 0;
1058                                 bkpb->hole_length = 0;
1059                         }
1060                 }
1061                 else
1062                 {
1063                         /* Not a standard page header, don't try to eliminate "hole" */
1064                         bkpb->hole_offset = 0;
1065                         bkpb->hole_length = 0;
1066                 }
1067
1068                 return true;                    /* buffer requires backup */
1069         }
1070
1071         return false;                           /* buffer does not need to be backed up */
1072 }
1073
1074 /*
1075  * XLogArchiveNotify
1076  *
1077  * Create an archive notification file
1078  *
1079  * The name of the notification file is the message that will be picked up
1080  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1081  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1082  * then when complete, rename it to 0000000100000001000000C6.done
1083  */
1084 static void
1085 XLogArchiveNotify(const char *xlog)
1086 {
1087         char            archiveStatusPath[MAXPGPATH];
1088         FILE       *fd;
1089
1090         /* insert an otherwise empty file called <XLOG>.ready */
1091         StatusFilePath(archiveStatusPath, xlog, ".ready");
1092         fd = AllocateFile(archiveStatusPath, "w");
1093         if (fd == NULL)
1094         {
1095                 ereport(LOG,
1096                                 (errcode_for_file_access(),
1097                                  errmsg("could not create archive status file \"%s\": %m",
1098                                                 archiveStatusPath)));
1099                 return;
1100         }
1101         if (FreeFile(fd))
1102         {
1103                 ereport(LOG,
1104                                 (errcode_for_file_access(),
1105                                  errmsg("could not write archive status file \"%s\": %m",
1106                                                 archiveStatusPath)));
1107                 return;
1108         }
1109
1110         /* Notify archiver that it's got something to do */
1111         if (IsUnderPostmaster)
1112                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1113 }
1114
1115 /*
1116  * Convenience routine to notify using log/seg representation of filename
1117  */
1118 static void
1119 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1120 {
1121         char            xlog[MAXFNAMELEN];
1122
1123         XLogFileName(xlog, ThisTimeLineID, log, seg);
1124         XLogArchiveNotify(xlog);
1125 }
1126
1127 /*
1128  * XLogArchiveCheckDone
1129  *
1130  * This is called when we are ready to delete or recycle an old XLOG segment
1131  * file or backup history file.  If it is okay to delete it then return true.
1132  * If it is not time to delete it, make sure a .ready file exists, and return
1133  * false.
1134  *
1135  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1136  * then return false; else create <XLOG>.ready and return false.
1137  *
1138  * The reason we do things this way is so that if the original attempt to
1139  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1140  */
1141 static bool
1142 XLogArchiveCheckDone(const char *xlog)
1143 {
1144         char            archiveStatusPath[MAXPGPATH];
1145         struct stat stat_buf;
1146
1147         /* Always deletable if archiving is off */
1148         if (!XLogArchivingActive())
1149                 return true;
1150
1151         /* First check for .done --- this means archiver is done with it */
1152         StatusFilePath(archiveStatusPath, xlog, ".done");
1153         if (stat(archiveStatusPath, &stat_buf) == 0)
1154                 return true;
1155
1156         /* check for .ready --- this means archiver is still busy with it */
1157         StatusFilePath(archiveStatusPath, xlog, ".ready");
1158         if (stat(archiveStatusPath, &stat_buf) == 0)
1159                 return false;
1160
1161         /* Race condition --- maybe archiver just finished, so recheck */
1162         StatusFilePath(archiveStatusPath, xlog, ".done");
1163         if (stat(archiveStatusPath, &stat_buf) == 0)
1164                 return true;
1165
1166         /* Retry creation of the .ready file */
1167         XLogArchiveNotify(xlog);
1168         return false;
1169 }
1170
1171 /*
1172  * XLogArchiveIsBusy
1173  *
1174  * Check to see if an XLOG segment file is still unarchived.
1175  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1176  * the first place we aren't chartered to recreate the .ready file, and
1177  * in the second place we should consider that if the file is already gone
1178  * then it's not busy.  (This check is needed to handle the race condition
1179  * that a checkpoint already deleted the no-longer-needed file.)
1180  */
1181 static bool
1182 XLogArchiveIsBusy(const char *xlog)
1183 {
1184         char            archiveStatusPath[MAXPGPATH];
1185         struct stat stat_buf;
1186
1187         /* First check for .done --- this means archiver is done with it */
1188         StatusFilePath(archiveStatusPath, xlog, ".done");
1189         if (stat(archiveStatusPath, &stat_buf) == 0)
1190                 return false;
1191
1192         /* check for .ready --- this means archiver is still busy with it */
1193         StatusFilePath(archiveStatusPath, xlog, ".ready");
1194         if (stat(archiveStatusPath, &stat_buf) == 0)
1195                 return true;
1196
1197         /* Race condition --- maybe archiver just finished, so recheck */
1198         StatusFilePath(archiveStatusPath, xlog, ".done");
1199         if (stat(archiveStatusPath, &stat_buf) == 0)
1200                 return false;
1201
1202         /*
1203          * Check to see if the WAL file has been removed by checkpoint,
1204          * which implies it has already been archived, and explains why we
1205          * can't see a status file for it.
1206          */
1207         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1208         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1209                 errno == ENOENT)
1210                 return false;
1211
1212         return true;
1213 }
1214
1215 /*
1216  * XLogArchiveCleanup
1217  *
1218  * Cleanup archive notification file(s) for a particular xlog segment
1219  */
1220 static void
1221 XLogArchiveCleanup(const char *xlog)
1222 {
1223         char            archiveStatusPath[MAXPGPATH];
1224
1225         /* Remove the .done file */
1226         StatusFilePath(archiveStatusPath, xlog, ".done");
1227         unlink(archiveStatusPath);
1228         /* should we complain about failure? */
1229
1230         /* Remove the .ready file if present --- normally it shouldn't be */
1231         StatusFilePath(archiveStatusPath, xlog, ".ready");
1232         unlink(archiveStatusPath);
1233         /* should we complain about failure? */
1234 }
1235
1236 /*
1237  * Advance the Insert state to the next buffer page, writing out the next
1238  * buffer if it still contains unwritten data.
1239  *
1240  * If new_segment is TRUE then we set up the next buffer page as the first
1241  * page of the next xlog segment file, possibly but not usually the next
1242  * consecutive file page.
1243  *
1244  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1245  * just-filled page.  If we can do this for free (without an extra lock),
1246  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1247  * request update still needs to be done, FALSE if we did it internally.
1248  *
1249  * Must be called with WALInsertLock held.
1250  */
1251 static bool
1252 AdvanceXLInsertBuffer(bool new_segment)
1253 {
1254         XLogCtlInsert *Insert = &XLogCtl->Insert;
1255         XLogCtlWrite *Write = &XLogCtl->Write;
1256         int                     nextidx = NextBufIdx(Insert->curridx);
1257         bool            update_needed = true;
1258         XLogRecPtr      OldPageRqstPtr;
1259         XLogwrtRqst WriteRqst;
1260         XLogRecPtr      NewPageEndPtr;
1261         XLogPageHeader NewPage;
1262
1263         /* Use Insert->LogwrtResult copy if it's more fresh */
1264         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
1265                 LogwrtResult = Insert->LogwrtResult;
1266
1267         /*
1268          * Get ending-offset of the buffer page we need to replace (this may be
1269          * zero if the buffer hasn't been used yet).  Fall through if it's already
1270          * written out.
1271          */
1272         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1273         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1274         {
1275                 /* nope, got work to do... */
1276                 XLogRecPtr      FinishedPageRqstPtr;
1277
1278                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1279
1280                 /* Before waiting, get info_lck and update LogwrtResult */
1281                 {
1282                         /* use volatile pointer to prevent code rearrangement */
1283                         volatile XLogCtlData *xlogctl = XLogCtl;
1284
1285                         SpinLockAcquire(&xlogctl->info_lck);
1286                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1287                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1288                         LogwrtResult = xlogctl->LogwrtResult;
1289                         SpinLockRelease(&xlogctl->info_lck);
1290                 }
1291
1292                 update_needed = false;  /* Did the shared-request update */
1293
1294                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1295                 {
1296                         /* OK, someone wrote it already */
1297                         Insert->LogwrtResult = LogwrtResult;
1298                 }
1299                 else
1300                 {
1301                         /* Must acquire write lock */
1302                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1303                         LogwrtResult = Write->LogwrtResult;
1304                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1305                         {
1306                                 /* OK, someone wrote it already */
1307                                 LWLockRelease(WALWriteLock);
1308                                 Insert->LogwrtResult = LogwrtResult;
1309                         }
1310                         else
1311                         {
1312                                 /*
1313                                  * Have to write buffers while holding insert lock. This is
1314                                  * not good, so only write as much as we absolutely must.
1315                                  */
1316                                 WriteRqst.Write = OldPageRqstPtr;
1317                                 WriteRqst.Flush.xlogid = 0;
1318                                 WriteRqst.Flush.xrecoff = 0;
1319                                 XLogWrite(WriteRqst, false, false);
1320                                 LWLockRelease(WALWriteLock);
1321                                 Insert->LogwrtResult = LogwrtResult;
1322                         }
1323                 }
1324         }
1325
1326         /*
1327          * Now the next buffer slot is free and we can set it up to be the next
1328          * output page.
1329          */
1330         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1331
1332         if (new_segment)
1333         {
1334                 /* force it to a segment start point */
1335                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1336                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1337         }
1338
1339         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1340         {
1341                 /* crossing a logid boundary */
1342                 NewPageEndPtr.xlogid += 1;
1343                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1344         }
1345         else
1346                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1347         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1348         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1349
1350         Insert->curridx = nextidx;
1351         Insert->currpage = NewPage;
1352
1353         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1354
1355         /*
1356          * Be sure to re-zero the buffer so that bytes beyond what we've written
1357          * will look like zeroes and not valid XLOG records...
1358          */
1359         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1360
1361         /*
1362          * Fill the new page's header
1363          */
1364         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1365
1366         /* NewPage->xlp_info = 0; */    /* done by memset */
1367         NewPage   ->xlp_tli = ThisTimeLineID;
1368         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1369         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1370
1371         /*
1372          * If first page of an XLOG segment file, make it a long header.
1373          */
1374         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1375         {
1376                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1377
1378                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1379                 NewLongPage->xlp_seg_size = XLogSegSize;
1380                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1381                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1382
1383                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1384         }
1385
1386         return update_needed;
1387 }
1388
1389 /*
1390  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1391  *
1392  * Caller must have just finished filling the open log file (so that
1393  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
1394  * to the open log file and see if that exceeds CheckPointSegments.
1395  *
1396  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1397  */
1398 static bool
1399 XLogCheckpointNeeded(void)
1400 {
1401         /*
1402          * A straight computation of segment number could overflow 32 bits. Rather
1403          * than assuming we have working 64-bit arithmetic, we compare the
1404          * highest-order bits separately, and force a checkpoint immediately when
1405          * they change.
1406          */
1407         uint32          old_segno,
1408                                 new_segno;
1409         uint32          old_highbits,
1410                                 new_highbits;
1411
1412         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1413                 (RedoRecPtr.xrecoff / XLogSegSize);
1414         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1415         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
1416         new_highbits = openLogId / XLogSegSize;
1417         if (new_highbits != old_highbits ||
1418                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1419                 return true;
1420         return false;
1421 }
1422
1423 /*
1424  * Write and/or fsync the log at least as far as WriteRqst indicates.
1425  *
1426  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1427  * may stop at any convenient boundary (such as a cache or logfile boundary).
1428  * This option allows us to avoid uselessly issuing multiple writes when a
1429  * single one would do.
1430  *
1431  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1432  * perform end-of-segment actions after writing the last page, even if
1433  * it's not physically the end of its segment.  (NB: this will work properly
1434  * only if caller specifies WriteRqst == page-end and flexible == false,
1435  * and there is some data to write.)
1436  *
1437  * Must be called with WALWriteLock held.
1438  */
1439 static void
1440 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1441 {
1442         XLogCtlWrite *Write = &XLogCtl->Write;
1443         bool            ispartialpage;
1444         bool            last_iteration;
1445         bool            finishing_seg;
1446         bool            use_existent;
1447         int                     curridx;
1448         int                     npages;
1449         int                     startidx;
1450         uint32          startoffset;
1451
1452         /* We should always be inside a critical section here */
1453         Assert(CritSectionCount > 0);
1454
1455         /*
1456          * Update local LogwrtResult (caller probably did this already, but...)
1457          */
1458         LogwrtResult = Write->LogwrtResult;
1459
1460         /*
1461          * Since successive pages in the xlog cache are consecutively allocated,
1462          * we can usually gather multiple pages together and issue just one
1463          * write() call.  npages is the number of pages we have determined can be
1464          * written together; startidx is the cache block index of the first one,
1465          * and startoffset is the file offset at which it should go. The latter
1466          * two variables are only valid when npages > 0, but we must initialize
1467          * all of them to keep the compiler quiet.
1468          */
1469         npages = 0;
1470         startidx = 0;
1471         startoffset = 0;
1472
1473         /*
1474          * Within the loop, curridx is the cache block index of the page to
1475          * consider writing.  We advance Write->curridx only after successfully
1476          * writing pages.  (Right now, this refinement is useless since we are
1477          * going to PANIC if any error occurs anyway; but someday it may come in
1478          * useful.)
1479          */
1480         curridx = Write->curridx;
1481
1482         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1483         {
1484                 /*
1485                  * Make sure we're not ahead of the insert process.  This could happen
1486                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1487                  * last page that's been initialized by AdvanceXLInsertBuffer.
1488                  */
1489                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1490                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1491                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1492                                  XLogCtl->xlblocks[curridx].xlogid,
1493                                  XLogCtl->xlblocks[curridx].xrecoff);
1494
1495                 /* Advance LogwrtResult.Write to end of current buffer page */
1496                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1497                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1498
1499                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1500                 {
1501                         /*
1502                          * Switch to new logfile segment.  We cannot have any pending
1503                          * pages here (since we dump what we have at segment end).
1504                          */
1505                         Assert(npages == 0);
1506                         if (openLogFile >= 0)
1507                                 XLogFileClose();
1508                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1509
1510                         /* create/use new log file */
1511                         use_existent = true;
1512                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1513                                                                            &use_existent, true);
1514                         openLogOff = 0;
1515                 }
1516
1517                 /* Make sure we have the current logfile open */
1518                 if (openLogFile < 0)
1519                 {
1520                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1521                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1522                         openLogOff = 0;
1523                 }
1524
1525                 /* Add current page to the set of pending pages-to-dump */
1526                 if (npages == 0)
1527                 {
1528                         /* first of group */
1529                         startidx = curridx;
1530                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1531                 }
1532                 npages++;
1533
1534                 /*
1535                  * Dump the set if this will be the last loop iteration, or if we are
1536                  * at the last page of the cache area (since the next page won't be
1537                  * contiguous in memory), or if we are at the end of the logfile
1538                  * segment.
1539                  */
1540                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1541
1542                 finishing_seg = !ispartialpage &&
1543                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1544
1545                 if (last_iteration ||
1546                         curridx == XLogCtl->XLogCacheBlck ||
1547                         finishing_seg)
1548                 {
1549                         char       *from;
1550                         Size            nbytes;
1551
1552                         /* Need to seek in the file? */
1553                         if (openLogOff != startoffset)
1554                         {
1555                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1556                                         ereport(PANIC,
1557                                                         (errcode_for_file_access(),
1558                                                          errmsg("could not seek in log file %u, "
1559                                                                         "segment %u to offset %u: %m",
1560                                                                         openLogId, openLogSeg, startoffset)));
1561                                 openLogOff = startoffset;
1562                         }
1563
1564                         /* OK to write the page(s) */
1565                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1566                         nbytes = npages * (Size) XLOG_BLCKSZ;
1567                         errno = 0;
1568                         if (write(openLogFile, from, nbytes) != nbytes)
1569                         {
1570                                 /* if write didn't set errno, assume no disk space */
1571                                 if (errno == 0)
1572                                         errno = ENOSPC;
1573                                 ereport(PANIC,
1574                                                 (errcode_for_file_access(),
1575                                                  errmsg("could not write to log file %u, segment %u "
1576                                                                 "at offset %u, length %lu: %m",
1577                                                                 openLogId, openLogSeg,
1578                                                                 openLogOff, (unsigned long) nbytes)));
1579                         }
1580
1581                         /* Update state for write */
1582                         openLogOff += nbytes;
1583                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1584                         npages = 0;
1585
1586                         /*
1587                          * If we just wrote the whole last page of a logfile segment,
1588                          * fsync the segment immediately.  This avoids having to go back
1589                          * and re-open prior segments when an fsync request comes along
1590                          * later. Doing it here ensures that one and only one backend will
1591                          * perform this fsync.
1592                          *
1593                          * We also do this if this is the last page written for an xlog
1594                          * switch.
1595                          *
1596                          * This is also the right place to notify the Archiver that the
1597                          * segment is ready to copy to archival storage, and to update the
1598                          * timer for archive_timeout, and to signal for a checkpoint if
1599                          * too many logfile segments have been used since the last
1600                          * checkpoint.
1601                          */
1602                         if (finishing_seg || (xlog_switch && last_iteration))
1603                         {
1604                                 issue_xlog_fsync();
1605                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1606
1607                                 if (XLogArchivingActive())
1608                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1609
1610                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1611
1612                                 /*
1613                                  * Signal bgwriter to start a checkpoint if we've consumed too
1614                                  * much xlog since the last one.  For speed, we first check
1615                                  * using the local copy of RedoRecPtr, which might be out of
1616                                  * date; if it looks like a checkpoint is needed, forcibly
1617                                  * update RedoRecPtr and recheck.
1618                                  */
1619                                 if (IsUnderPostmaster &&
1620                                         XLogCheckpointNeeded())
1621                                 {
1622                                         (void) GetRedoRecPtr();
1623                                         if (XLogCheckpointNeeded())
1624                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1625                                 }
1626                         }
1627                 }
1628
1629                 if (ispartialpage)
1630                 {
1631                         /* Only asked to write a partial page */
1632                         LogwrtResult.Write = WriteRqst.Write;
1633                         break;
1634                 }
1635                 curridx = NextBufIdx(curridx);
1636
1637                 /* If flexible, break out of loop as soon as we wrote something */
1638                 if (flexible && npages == 0)
1639                         break;
1640         }
1641
1642         Assert(npages == 0);
1643         Assert(curridx == Write->curridx);
1644
1645         /*
1646          * If asked to flush, do so
1647          */
1648         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1649                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1650         {
1651                 /*
1652                  * Could get here without iterating above loop, in which case we might
1653                  * have no open file or the wrong one.  However, we do not need to
1654                  * fsync more than one file.
1655                  */
1656                 if (sync_method != SYNC_METHOD_OPEN &&
1657                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1658                 {
1659                         if (openLogFile >= 0 &&
1660                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1661                                 XLogFileClose();
1662                         if (openLogFile < 0)
1663                         {
1664                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1665                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1666                                 openLogOff = 0;
1667                         }
1668                         issue_xlog_fsync();
1669                 }
1670                 LogwrtResult.Flush = LogwrtResult.Write;
1671         }
1672
1673         /*
1674          * Update shared-memory status
1675          *
1676          * We make sure that the shared 'request' values do not fall behind the
1677          * 'result' values.  This is not absolutely essential, but it saves some
1678          * code in a couple of places.
1679          */
1680         {
1681                 /* use volatile pointer to prevent code rearrangement */
1682                 volatile XLogCtlData *xlogctl = XLogCtl;
1683
1684                 SpinLockAcquire(&xlogctl->info_lck);
1685                 xlogctl->LogwrtResult = LogwrtResult;
1686                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1687                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1688                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1689                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1690                 SpinLockRelease(&xlogctl->info_lck);
1691         }
1692
1693         Write->LogwrtResult = LogwrtResult;
1694 }
1695
1696 /*
1697  * Record the LSN for an asynchronous transaction commit.
1698  * (This should not be called for aborts, nor for synchronous commits.)
1699  */
1700 void
1701 XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
1702 {
1703         /* use volatile pointer to prevent code rearrangement */
1704         volatile XLogCtlData *xlogctl = XLogCtl;
1705
1706         SpinLockAcquire(&xlogctl->info_lck);
1707         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
1708                 xlogctl->asyncCommitLSN = asyncCommitLSN;
1709         SpinLockRelease(&xlogctl->info_lck);
1710 }
1711
1712 /*
1713  * Ensure that all XLOG data through the given position is flushed to disk.
1714  *
1715  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1716  * already held, and we try to avoid acquiring it if possible.
1717  */
1718 void
1719 XLogFlush(XLogRecPtr record)
1720 {
1721         XLogRecPtr      WriteRqstPtr;
1722         XLogwrtRqst WriteRqst;
1723
1724         /* Disabled during REDO */
1725         if (InRedo)
1726                 return;
1727
1728         /* Quick exit if already known flushed */
1729         if (XLByteLE(record, LogwrtResult.Flush))
1730                 return;
1731
1732 #ifdef WAL_DEBUG
1733         if (XLOG_DEBUG)
1734                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1735                          record.xlogid, record.xrecoff,
1736                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1737                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1738 #endif
1739
1740         START_CRIT_SECTION();
1741
1742         /*
1743          * Since fsync is usually a horribly expensive operation, we try to
1744          * piggyback as much data as we can on each fsync: if we see any more data
1745          * entered into the xlog buffer, we'll write and fsync that too, so that
1746          * the final value of LogwrtResult.Flush is as large as possible. This
1747          * gives us some chance of avoiding another fsync immediately after.
1748          */
1749
1750         /* initialize to given target; may increase below */
1751         WriteRqstPtr = record;
1752
1753         /* read LogwrtResult and update local state */
1754         {
1755                 /* use volatile pointer to prevent code rearrangement */
1756                 volatile XLogCtlData *xlogctl = XLogCtl;
1757
1758                 SpinLockAcquire(&xlogctl->info_lck);
1759                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1760                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1761                 LogwrtResult = xlogctl->LogwrtResult;
1762                 SpinLockRelease(&xlogctl->info_lck);
1763         }
1764
1765         /* done already? */
1766         if (!XLByteLE(record, LogwrtResult.Flush))
1767         {
1768                 /* now wait for the write lock */
1769                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1770                 LogwrtResult = XLogCtl->Write.LogwrtResult;
1771                 if (!XLByteLE(record, LogwrtResult.Flush))
1772                 {
1773                         /* try to write/flush later additions to XLOG as well */
1774                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1775                         {
1776                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
1777                                 uint32          freespace = INSERT_FREESPACE(Insert);
1778
1779                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
1780                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1781                                 else
1782                                 {
1783                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1784                                         WriteRqstPtr.xrecoff -= freespace;
1785                                 }
1786                                 LWLockRelease(WALInsertLock);
1787                                 WriteRqst.Write = WriteRqstPtr;
1788                                 WriteRqst.Flush = WriteRqstPtr;
1789                         }
1790                         else
1791                         {
1792                                 WriteRqst.Write = WriteRqstPtr;
1793                                 WriteRqst.Flush = record;
1794                         }
1795                         XLogWrite(WriteRqst, false, false);
1796                 }
1797                 LWLockRelease(WALWriteLock);
1798         }
1799
1800         END_CRIT_SECTION();
1801
1802         /*
1803          * If we still haven't flushed to the request point then we have a
1804          * problem; most likely, the requested flush point is past end of XLOG.
1805          * This has been seen to occur when a disk page has a corrupted LSN.
1806          *
1807          * Formerly we treated this as a PANIC condition, but that hurts the
1808          * system's robustness rather than helping it: we do not want to take down
1809          * the whole system due to corruption on one data page.  In particular, if
1810          * the bad page is encountered again during recovery then we would be
1811          * unable to restart the database at all!  (This scenario has actually
1812          * happened in the field several times with 7.1 releases. Note that we
1813          * cannot get here while InRedo is true, but if the bad page is brought in
1814          * and marked dirty during recovery then CreateCheckPoint will try to
1815          * flush it at the end of recovery.)
1816          *
1817          * The current approach is to ERROR under normal conditions, but only
1818          * WARNING during recovery, so that the system can be brought up even if
1819          * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
1820          * be promoted to PANIC since xact.c calls this routine inside a critical
1821          * section.  However, calls from bufmgr.c are not within critical sections
1822          * and so we will not force a restart for a bad LSN on a data page.
1823          */
1824         if (XLByteLT(LogwrtResult.Flush, record))
1825                 elog(InRecovery ? WARNING : ERROR,
1826                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
1827                          record.xlogid, record.xrecoff,
1828                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1829 }
1830
1831 /*
1832  * Flush xlog, but without specifying exactly where to flush to.
1833  *
1834  * We normally flush only completed blocks; but if there is nothing to do on
1835  * that basis, we check for unflushed async commits in the current incomplete
1836  * block, and flush through the latest one of those.  Thus, if async commits
1837  * are not being used, we will flush complete blocks only.      We can guarantee
1838  * that async commits reach disk after at most three cycles; normally only
1839  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
1840  * at the end of the buffer ring; this makes a difference only with very high
1841  * load or long wal_writer_delay, but imposes one extra cycle for the worst
1842  * case for async commits.)
1843  *
1844  * This routine is invoked periodically by the background walwriter process.
1845  */
1846 void
1847 XLogBackgroundFlush(void)
1848 {
1849         XLogRecPtr      WriteRqstPtr;
1850         bool            flexible = true;
1851
1852         /* read LogwrtResult and update local state */
1853         {
1854                 /* use volatile pointer to prevent code rearrangement */
1855                 volatile XLogCtlData *xlogctl = XLogCtl;
1856
1857                 SpinLockAcquire(&xlogctl->info_lck);
1858                 LogwrtResult = xlogctl->LogwrtResult;
1859                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1860                 SpinLockRelease(&xlogctl->info_lck);
1861         }
1862
1863         /* back off to last completed page boundary */
1864         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
1865
1866         /* if we have already flushed that far, consider async commit records */
1867         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1868         {
1869                 /* use volatile pointer to prevent code rearrangement */
1870                 volatile XLogCtlData *xlogctl = XLogCtl;
1871
1872                 SpinLockAcquire(&xlogctl->info_lck);
1873                 WriteRqstPtr = xlogctl->asyncCommitLSN;
1874                 SpinLockRelease(&xlogctl->info_lck);
1875                 flexible = false;               /* ensure it all gets written */
1876         }
1877
1878         /* Done if already known flushed */
1879         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1880                 return;
1881
1882 #ifdef WAL_DEBUG
1883         if (XLOG_DEBUG)
1884                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
1885                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
1886                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1887                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1888 #endif
1889
1890         START_CRIT_SECTION();
1891
1892         /* now wait for the write lock */
1893         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1894         LogwrtResult = XLogCtl->Write.LogwrtResult;
1895         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1896         {
1897                 XLogwrtRqst WriteRqst;
1898
1899                 WriteRqst.Write = WriteRqstPtr;
1900                 WriteRqst.Flush = WriteRqstPtr;
1901                 XLogWrite(WriteRqst, flexible, false);
1902         }
1903         LWLockRelease(WALWriteLock);
1904
1905         END_CRIT_SECTION();
1906 }
1907
1908 /*
1909  * Flush any previous asynchronously-committed transactions' commit records.
1910  *
1911  * NOTE: it is unwise to assume that this provides any strong guarantees.
1912  * In particular, because of the inexact LSN bookkeeping used by clog.c,
1913  * we cannot assume that hint bits will be settable for these transactions.
1914  */
1915 void
1916 XLogAsyncCommitFlush(void)
1917 {
1918         XLogRecPtr      WriteRqstPtr;
1919
1920         /* use volatile pointer to prevent code rearrangement */
1921         volatile XLogCtlData *xlogctl = XLogCtl;
1922
1923         SpinLockAcquire(&xlogctl->info_lck);
1924         WriteRqstPtr = xlogctl->asyncCommitLSN;
1925         SpinLockRelease(&xlogctl->info_lck);
1926
1927         XLogFlush(WriteRqstPtr);
1928 }
1929
1930 /*
1931  * Test whether XLOG data has been flushed up to (at least) the given position.
1932  *
1933  * Returns true if a flush is still needed.  (It may be that someone else
1934  * is already in process of flushing that far, however.)
1935  */
1936 bool
1937 XLogNeedsFlush(XLogRecPtr record)
1938 {
1939         /* Quick exit if already known flushed */
1940         if (XLByteLE(record, LogwrtResult.Flush))
1941                 return false;
1942
1943         /* read LogwrtResult and update local state */
1944         {
1945                 /* use volatile pointer to prevent code rearrangement */
1946                 volatile XLogCtlData *xlogctl = XLogCtl;
1947
1948                 SpinLockAcquire(&xlogctl->info_lck);
1949                 LogwrtResult = xlogctl->LogwrtResult;
1950                 SpinLockRelease(&xlogctl->info_lck);
1951         }
1952
1953         /* check again */
1954         if (XLByteLE(record, LogwrtResult.Flush))
1955                 return false;
1956
1957         return true;
1958 }
1959
1960 /*
1961  * Create a new XLOG file segment, or open a pre-existing one.
1962  *
1963  * log, seg: identify segment to be created/opened.
1964  *
1965  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
1966  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
1967  * file was used.
1968  *
1969  * use_lock: if TRUE, acquire ControlFileLock while moving file into
1970  * place.  This should be TRUE except during bootstrap log creation.  The
1971  * caller must *not* hold the lock at call.
1972  *
1973  * Returns FD of opened file.
1974  *
1975  * Note: errors here are ERROR not PANIC because we might or might not be
1976  * inside a critical section (eg, during checkpoint there is no reason to
1977  * take down the system on failure).  They will promote to PANIC if we are
1978  * in a critical section.
1979  */
1980 static int
1981 XLogFileInit(uint32 log, uint32 seg,
1982                          bool *use_existent, bool use_lock)
1983 {
1984         char            path[MAXPGPATH];
1985         char            tmppath[MAXPGPATH];
1986         char       *zbuffer;
1987         uint32          installed_log;
1988         uint32          installed_seg;
1989         int                     max_advance;
1990         int                     fd;
1991         int                     nbytes;
1992
1993         XLogFilePath(path, ThisTimeLineID, log, seg);
1994
1995         /*
1996          * Try to use existent file (checkpoint maker may have created it already)
1997          */
1998         if (*use_existent)
1999         {
2000                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2001                                                    S_IRUSR | S_IWUSR);
2002                 if (fd < 0)
2003                 {
2004                         if (errno != ENOENT)
2005                                 ereport(ERROR,
2006                                                 (errcode_for_file_access(),
2007                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2008                                                                 path, log, seg)));
2009                 }
2010                 else
2011                         return fd;
2012         }
2013
2014         /*
2015          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2016          * another process is doing the same thing.  If so, we will end up
2017          * pre-creating an extra log segment.  That seems OK, and better than
2018          * holding the lock throughout this lengthy process.
2019          */
2020         elog(DEBUG2, "creating and filling new WAL file");
2021
2022         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2023
2024         unlink(tmppath);
2025
2026         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2027         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2028                                            S_IRUSR | S_IWUSR);
2029         if (fd < 0)
2030                 ereport(ERROR,
2031                                 (errcode_for_file_access(),
2032                                  errmsg("could not create file \"%s\": %m", tmppath)));
2033
2034         /*
2035          * Zero-fill the file.  We have to do this the hard way to ensure that all
2036          * the file space has really been allocated --- on platforms that allow
2037          * "holes" in files, just seeking to the end doesn't allocate intermediate
2038          * space.  This way, we know that we have all the space and (after the
2039          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2040          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2041          * log file.
2042          *
2043          * Note: palloc zbuffer, instead of just using a local char array, to
2044          * ensure it is reasonably well-aligned; this may save a few cycles
2045          * transferring data to the kernel.
2046          */
2047         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2048         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2049         {
2050                 errno = 0;
2051                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2052                 {
2053                         int                     save_errno = errno;
2054
2055                         /*
2056                          * If we fail to make the file, delete it to release disk space
2057                          */
2058                         unlink(tmppath);
2059                         /* if write didn't set errno, assume problem is no disk space */
2060                         errno = save_errno ? save_errno : ENOSPC;
2061
2062                         ereport(ERROR,
2063                                         (errcode_for_file_access(),
2064                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2065                 }
2066         }
2067         pfree(zbuffer);
2068
2069         if (pg_fsync(fd) != 0)
2070                 ereport(ERROR,
2071                                 (errcode_for_file_access(),
2072                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2073
2074         if (close(fd))
2075                 ereport(ERROR,
2076                                 (errcode_for_file_access(),
2077                                  errmsg("could not close file \"%s\": %m", tmppath)));
2078
2079         /*
2080          * Now move the segment into place with its final name.
2081          *
2082          * If caller didn't want to use a pre-existing file, get rid of any
2083          * pre-existing file.  Otherwise, cope with possibility that someone else
2084          * has created the file while we were filling ours: if so, use ours to
2085          * pre-create a future log segment.
2086          */
2087         installed_log = log;
2088         installed_seg = seg;
2089         max_advance = XLOGfileslop;
2090         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2091                                                                 *use_existent, &max_advance,
2092                                                                 use_lock))
2093         {
2094                 /* No need for any more future segments... */
2095                 unlink(tmppath);
2096         }
2097
2098         elog(DEBUG2, "done creating and filling new WAL file");
2099
2100         /* Set flag to tell caller there was no existent file */
2101         *use_existent = false;
2102
2103         /* Now open original target segment (might not be file I just made) */
2104         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2105                                            S_IRUSR | S_IWUSR);
2106         if (fd < 0)
2107                 ereport(ERROR,
2108                                 (errcode_for_file_access(),
2109                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2110                                   path, log, seg)));
2111
2112         return fd;
2113 }
2114
2115 /*
2116  * Create a new XLOG file segment by copying a pre-existing one.
2117  *
2118  * log, seg: identify segment to be created.
2119  *
2120  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2121  *              a different timeline)
2122  *
2123  * Currently this is only used during recovery, and so there are no locking
2124  * considerations.      But we should be just as tense as XLogFileInit to avoid
2125  * emplacing a bogus file.
2126  */
2127 static void
2128 XLogFileCopy(uint32 log, uint32 seg,
2129                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2130 {
2131         char            path[MAXPGPATH];
2132         char            tmppath[MAXPGPATH];
2133         char            buffer[XLOG_BLCKSZ];
2134         int                     srcfd;
2135         int                     fd;
2136         int                     nbytes;
2137
2138         /*
2139          * Open the source file
2140          */
2141         XLogFilePath(path, srcTLI, srclog, srcseg);
2142         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2143         if (srcfd < 0)
2144                 ereport(ERROR,
2145                                 (errcode_for_file_access(),
2146                                  errmsg("could not open file \"%s\": %m", path)));
2147
2148         /*
2149          * Copy into a temp file name.
2150          */
2151         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2152
2153         unlink(tmppath);
2154
2155         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2156         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2157                                            S_IRUSR | S_IWUSR);
2158         if (fd < 0)
2159                 ereport(ERROR,
2160                                 (errcode_for_file_access(),
2161                                  errmsg("could not create file \"%s\": %m", tmppath)));
2162
2163         /*
2164          * Do the data copying.
2165          */
2166         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2167         {
2168                 errno = 0;
2169                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2170                 {
2171                         if (errno != 0)
2172                                 ereport(ERROR,
2173                                                 (errcode_for_file_access(),
2174                                                  errmsg("could not read file \"%s\": %m", path)));
2175                         else
2176                                 ereport(ERROR,
2177                                                 (errmsg("not enough data in file \"%s\"", path)));
2178                 }
2179                 errno = 0;
2180                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2181                 {
2182                         int                     save_errno = errno;
2183
2184                         /*
2185                          * If we fail to make the file, delete it to release disk space
2186                          */
2187                         unlink(tmppath);
2188                         /* if write didn't set errno, assume problem is no disk space */
2189                         errno = save_errno ? save_errno : ENOSPC;
2190
2191                         ereport(ERROR,
2192                                         (errcode_for_file_access(),
2193                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2194                 }
2195         }
2196
2197         if (pg_fsync(fd) != 0)
2198                 ereport(ERROR,
2199                                 (errcode_for_file_access(),
2200                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2201
2202         if (close(fd))
2203                 ereport(ERROR,
2204                                 (errcode_for_file_access(),
2205                                  errmsg("could not close file \"%s\": %m", tmppath)));
2206
2207         close(srcfd);
2208
2209         /*
2210          * Now move the segment into place with its final name.
2211          */
2212         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2213                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2214 }
2215
2216 /*
2217  * Install a new XLOG segment file as a current or future log segment.
2218  *
2219  * This is used both to install a newly-created segment (which has a temp
2220  * filename while it's being created) and to recycle an old segment.
2221  *
2222  * *log, *seg: identify segment to install as (or first possible target).
2223  * When find_free is TRUE, these are modified on return to indicate the
2224  * actual installation location or last segment searched.
2225  *
2226  * tmppath: initial name of file to install.  It will be renamed into place.
2227  *
2228  * find_free: if TRUE, install the new segment at the first empty log/seg
2229  * number at or after the passed numbers.  If FALSE, install the new segment
2230  * exactly where specified, deleting any existing segment file there.
2231  *
2232  * *max_advance: maximum number of log/seg slots to advance past the starting
2233  * point.  Fail if no free slot is found in this range.  On return, reduced
2234  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2235  * when find_free is FALSE.)
2236  *
2237  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2238  * place.  This should be TRUE except during bootstrap log creation.  The
2239  * caller must *not* hold the lock at call.
2240  *
2241  * Returns TRUE if file installed, FALSE if not installed because of
2242  * exceeding max_advance limit.  On Windows, we also return FALSE if we
2243  * can't rename the file into place because someone's got it open.
2244  * (Any other kind of failure causes ereport().)
2245  */
2246 static bool
2247 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2248                                            bool find_free, int *max_advance,
2249                                            bool use_lock)
2250 {
2251         char            path[MAXPGPATH];
2252         struct stat stat_buf;
2253
2254         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2255
2256         /*
2257          * We want to be sure that only one process does this at a time.
2258          */
2259         if (use_lock)
2260                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2261
2262         if (!find_free)
2263         {
2264                 /* Force installation: get rid of any pre-existing segment file */
2265                 unlink(path);
2266         }
2267         else
2268         {
2269                 /* Find a free slot to put it in */
2270                 while (stat(path, &stat_buf) == 0)
2271                 {
2272                         if (*max_advance <= 0)
2273                         {
2274                                 /* Failed to find a free slot within specified range */
2275                                 if (use_lock)
2276                                         LWLockRelease(ControlFileLock);
2277                                 return false;
2278                         }
2279                         NextLogSeg(*log, *seg);
2280                         (*max_advance)--;
2281                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2282                 }
2283         }
2284
2285         /*
2286          * Prefer link() to rename() here just to be really sure that we don't
2287          * overwrite an existing logfile.  However, there shouldn't be one, so
2288          * rename() is an acceptable substitute except for the truly paranoid.
2289          */
2290 #if HAVE_WORKING_LINK
2291         if (link(tmppath, path) < 0)
2292                 ereport(ERROR,
2293                                 (errcode_for_file_access(),
2294                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2295                                                 tmppath, path, *log, *seg)));
2296         unlink(tmppath);
2297 #else
2298         if (rename(tmppath, path) < 0)
2299         {
2300 #ifdef WIN32
2301 #if !defined(__CYGWIN__)
2302                 if (GetLastError() == ERROR_ACCESS_DENIED)
2303 #else
2304                 if (errno == EACCES)
2305 #endif
2306                 {
2307                         if (use_lock)
2308                                 LWLockRelease(ControlFileLock);
2309                         return false;
2310                 }
2311 #endif   /* WIN32 */
2312
2313                 ereport(ERROR,
2314                                 (errcode_for_file_access(),
2315                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2316                                                 tmppath, path, *log, *seg)));
2317         }
2318 #endif
2319
2320         if (use_lock)
2321                 LWLockRelease(ControlFileLock);
2322
2323         return true;
2324 }
2325
2326 /*
2327  * Open a pre-existing logfile segment for writing.
2328  */
2329 static int
2330 XLogFileOpen(uint32 log, uint32 seg)
2331 {
2332         char            path[MAXPGPATH];
2333         int                     fd;
2334
2335         XLogFilePath(path, ThisTimeLineID, log, seg);
2336
2337         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2338                                            S_IRUSR | S_IWUSR);
2339         if (fd < 0)
2340                 ereport(PANIC,
2341                                 (errcode_for_file_access(),
2342                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2343                                   path, log, seg)));
2344
2345         return fd;
2346 }
2347
2348 /*
2349  * Open a logfile segment for reading (during recovery).
2350  */
2351 static int
2352 XLogFileRead(uint32 log, uint32 seg, int emode)
2353 {
2354         char            path[MAXPGPATH];
2355         char            xlogfname[MAXFNAMELEN];
2356         char            activitymsg[MAXFNAMELEN + 16];
2357         ListCell   *cell;
2358         int                     fd;
2359
2360         /*
2361          * Loop looking for a suitable timeline ID: we might need to read any of
2362          * the timelines listed in expectedTLIs.
2363          *
2364          * We expect curFileTLI on entry to be the TLI of the preceding file in
2365          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2366          * to go backwards; this prevents us from picking up the wrong file when a
2367          * parent timeline extends to higher segment numbers than the child we
2368          * want to read.
2369          */
2370         foreach(cell, expectedTLIs)
2371         {
2372                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2373
2374                 if (tli < curFileTLI)
2375                         break;                          /* don't bother looking at too-old TLIs */
2376
2377                 XLogFileName(xlogfname, tli, log, seg);
2378
2379                 if (InArchiveRecovery)
2380                 {
2381                         /* Report recovery progress in PS display */
2382                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2383                                          xlogfname);
2384                         set_ps_display(activitymsg, false);
2385
2386                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2387                                                                                                           "RECOVERYXLOG",
2388                                                                                                           XLogSegSize);
2389                 }
2390                 else
2391                         XLogFilePath(path, tli, log, seg);
2392
2393                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2394                 if (fd >= 0)
2395                 {
2396                         /* Success! */
2397                         curFileTLI = tli;
2398
2399                         /* Report recovery progress in PS display */
2400                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2401                                          xlogfname);
2402                         set_ps_display(activitymsg, false);
2403
2404                         return fd;
2405                 }
2406                 if (errno != ENOENT)    /* unexpected failure? */
2407                         ereport(PANIC,
2408                                         (errcode_for_file_access(),
2409                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2410                                    path, log, seg)));
2411         }
2412
2413         /* Couldn't find it.  For simplicity, complain about front timeline */
2414         XLogFilePath(path, recoveryTargetTLI, log, seg);
2415         errno = ENOENT;
2416         ereport(emode,
2417                         (errcode_for_file_access(),
2418                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2419                                   path, log, seg)));
2420         return -1;
2421 }
2422
2423 /*
2424  * Close the current logfile segment for writing.
2425  */
2426 static void
2427 XLogFileClose(void)
2428 {
2429         Assert(openLogFile >= 0);
2430
2431         /*
2432          * posix_fadvise is problematic on many platforms: on older x86 Linux it
2433          * just dumps core, and there are reports of problems on PPC platforms as
2434          * well.  The following is therefore disabled for the time being. We could
2435          * consider some kind of configure test to see if it's safe to use, but
2436          * since we lack hard evidence that there's any useful performance gain to
2437          * be had, spending time on that seems unprofitable for now.
2438          */
2439 #ifdef NOT_USED
2440
2441         /*
2442          * WAL segment files will not be re-read in normal operation, so we advise
2443          * OS to release any cached pages.      But do not do so if WAL archiving is
2444          * active, because archiver process could use the cache to read the WAL
2445          * segment.
2446          *
2447          * While O_DIRECT works for O_SYNC, posix_fadvise() works for fsync() and
2448          * O_SYNC, and some platforms only have posix_fadvise().
2449          */
2450 #if defined(HAVE_DECL_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2451         if (!XLogArchivingActive())
2452                 posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2453 #endif
2454 #endif   /* NOT_USED */
2455
2456         if (close(openLogFile))
2457                 ereport(PANIC,
2458                                 (errcode_for_file_access(),
2459                                  errmsg("could not close log file %u, segment %u: %m",
2460                                                 openLogId, openLogSeg)));
2461         openLogFile = -1;
2462 }
2463
2464 /*
2465  * Attempt to retrieve the specified file from off-line archival storage.
2466  * If successful, fill "path" with its complete path (note that this will be
2467  * a temp file name that doesn't follow the normal naming convention), and
2468  * return TRUE.
2469  *
2470  * If not successful, fill "path" with the name of the normal on-line file
2471  * (which may or may not actually exist, but we'll try to use it), and return
2472  * FALSE.
2473  *
2474  * For fixed-size files, the caller may pass the expected size as an
2475  * additional crosscheck on successful recovery.  If the file size is not
2476  * known, set expectedSize = 0.
2477  */
2478 static bool
2479 RestoreArchivedFile(char *path, const char *xlogfname,
2480                                         const char *recovername, off_t expectedSize)
2481 {
2482         char            xlogpath[MAXPGPATH];
2483         char            xlogRestoreCmd[MAXPGPATH];
2484         char            lastRestartPointFname[MAXPGPATH];
2485         char       *dp;
2486         char       *endp;
2487         const char *sp;
2488         int                     rc;
2489         bool            signaled;
2490         struct stat stat_buf;
2491         uint32          restartLog;
2492         uint32          restartSeg;
2493
2494         /*
2495          * When doing archive recovery, we always prefer an archived log file even
2496          * if a file of the same name exists in XLOGDIR.  The reason is that the
2497          * file in XLOGDIR could be an old, un-filled or partly-filled version
2498          * that was copied and restored as part of backing up $PGDATA.
2499          *
2500          * We could try to optimize this slightly by checking the local copy
2501          * lastchange timestamp against the archived copy, but we have no API to
2502          * do this, nor can we guarantee that the lastchange timestamp was
2503          * preserved correctly when we copied to archive. Our aim is robustness,
2504          * so we elect not to do this.
2505          *
2506          * If we cannot obtain the log file from the archive, however, we will try
2507          * to use the XLOGDIR file if it exists.  This is so that we can make use
2508          * of log segments that weren't yet transferred to the archive.
2509          *
2510          * Notice that we don't actually overwrite any files when we copy back
2511          * from archive because the recoveryRestoreCommand may inadvertently
2512          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2513          * fallback to the segments remaining in current XLOGDIR later. The
2514          * copy-from-archive filename is always the same, ensuring that we don't
2515          * run out of disk space on long recoveries.
2516          */
2517         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2518
2519         /*
2520          * Make sure there is no existing file named recovername.
2521          */
2522         if (stat(xlogpath, &stat_buf) != 0)
2523         {
2524                 if (errno != ENOENT)
2525                         ereport(FATAL,
2526                                         (errcode_for_file_access(),
2527                                          errmsg("could not stat file \"%s\": %m",
2528                                                         xlogpath)));
2529         }
2530         else
2531         {
2532                 if (unlink(xlogpath) != 0)
2533                         ereport(FATAL,
2534                                         (errcode_for_file_access(),
2535                                          errmsg("could not remove file \"%s\": %m",
2536                                                         xlogpath)));
2537         }
2538
2539         /*
2540          * Calculate the archive file cutoff point for use during log shipping
2541          * replication. All files earlier than this point can be deleted
2542          * from the archive, though there is no requirement to do so.
2543          *
2544          * We initialise this with the filename of an InvalidXLogRecPtr, which
2545          * will prevent the deletion of any WAL files from the archive
2546          * because of the alphabetic sorting property of WAL filenames.
2547          *
2548          * Once we have successfully located the redo pointer of the checkpoint
2549          * from which we start recovery we never request a file prior to the redo
2550          * pointer of the last restartpoint. When redo begins we know that we
2551          * have successfully located it, so there is no need for additional
2552          * status flags to signify the point when we can begin deleting WAL files
2553          * from the archive.
2554          */
2555         if (InRedo)
2556         {
2557                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2558                                         restartLog, restartSeg);
2559                 XLogFileName(lastRestartPointFname,
2560                                          ControlFile->checkPointCopy.ThisTimeLineID,
2561                                          restartLog, restartSeg);
2562                 /* we shouldn't need anything earlier than last restart point */
2563                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2564         }
2565         else
2566                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2567
2568         /*
2569          * construct the command to be executed
2570          */
2571         dp = xlogRestoreCmd;
2572         endp = xlogRestoreCmd + MAXPGPATH - 1;
2573         *endp = '\0';
2574
2575         for (sp = recoveryRestoreCommand; *sp; sp++)
2576         {
2577                 if (*sp == '%')
2578                 {
2579                         switch (sp[1])
2580                         {
2581                                 case 'p':
2582                                         /* %p: relative path of target file */
2583                                         sp++;
2584                                         StrNCpy(dp, xlogpath, endp - dp);
2585                                         make_native_path(dp);
2586                                         dp += strlen(dp);
2587                                         break;
2588                                 case 'f':
2589                                         /* %f: filename of desired file */
2590                                         sp++;
2591                                         StrNCpy(dp, xlogfname, endp - dp);
2592                                         dp += strlen(dp);
2593                                         break;
2594                                 case 'r':
2595                                         /* %r: filename of last restartpoint */
2596                                         sp++;
2597                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2598                                         dp += strlen(dp);
2599                                         break;
2600                                 case '%':
2601                                         /* convert %% to a single % */
2602                                         sp++;
2603                                         if (dp < endp)
2604                                                 *dp++ = *sp;
2605                                         break;
2606                                 default:
2607                                         /* otherwise treat the % as not special */
2608                                         if (dp < endp)
2609                                                 *dp++ = *sp;
2610                                         break;
2611                         }
2612                 }
2613                 else
2614                 {
2615                         if (dp < endp)
2616                                 *dp++ = *sp;
2617                 }
2618         }
2619         *dp = '\0';
2620
2621         ereport(DEBUG3,
2622                         (errmsg_internal("executing restore command \"%s\"",
2623                                                          xlogRestoreCmd)));
2624
2625         /*
2626          * Copy xlog from archival storage to XLOGDIR
2627          */
2628         rc = system(xlogRestoreCmd);
2629         if (rc == 0)
2630         {
2631                 /*
2632                  * command apparently succeeded, but let's make sure the file is
2633                  * really there now and has the correct size.
2634                  *
2635                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
2636                  * it, but is that too strong?  We could try to plow ahead with a
2637                  * local copy of the file ... but the problem is that there probably
2638                  * isn't one, and we'd incorrectly conclude we've reached the end of
2639                  * WAL and we're done recovering ...
2640                  */
2641                 if (stat(xlogpath, &stat_buf) == 0)
2642                 {
2643                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2644                                 ereport(FATAL,
2645                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
2646                                                                 xlogfname,
2647                                                                 (unsigned long) stat_buf.st_size,
2648                                                                 (unsigned long) expectedSize)));
2649                         else
2650                         {
2651                                 ereport(LOG,
2652                                                 (errmsg("restored log file \"%s\" from archive",
2653                                                                 xlogfname)));
2654                                 strcpy(path, xlogpath);
2655                                 return true;
2656                         }
2657                 }
2658                 else
2659                 {
2660                         /* stat failed */
2661                         if (errno != ENOENT)
2662                                 ereport(FATAL,
2663                                                 (errcode_for_file_access(),
2664                                                  errmsg("could not stat file \"%s\": %m",
2665                                                                 xlogpath)));
2666                 }
2667         }
2668
2669         /*
2670          * Remember, we rollforward UNTIL the restore fails so failure here is
2671          * just part of the process... that makes it difficult to determine
2672          * whether the restore failed because there isn't an archive to restore,
2673          * or because the administrator has specified the restore program
2674          * incorrectly.  We have to assume the former.
2675          *
2676          * However, if the failure was due to any sort of signal, it's best to
2677          * punt and abort recovery.  (If we "return false" here, upper levels will
2678          * assume that recovery is complete and start up the database!) It's
2679          * essential to abort on child SIGINT and SIGQUIT, because per spec
2680          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2681          * those it's a good bet we should have gotten it too.  Aborting on other
2682          * signals such as SIGTERM seems a good idea as well.
2683          *
2684          * Per the Single Unix Spec, shells report exit status > 128 when a called
2685          * command died on a signal.  Also, 126 and 127 are used to report
2686          * problems such as an unfindable command; treat those as fatal errors
2687          * too.
2688          */
2689         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2690
2691         ereport(signaled ? FATAL : DEBUG2,
2692                 (errmsg("could not restore file \"%s\" from archive: return code %d",
2693                                 xlogfname, rc)));
2694
2695         /*
2696          * if an archived file is not available, there might still be a version of
2697          * this file in XLOGDIR, so return that as the filename to open.
2698          *
2699          * In many recovery scenarios we expect this to fail also, but if so that
2700          * just means we've reached the end of WAL.
2701          */
2702         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2703         return false;
2704 }
2705
2706 /*
2707  * Preallocate log files beyond the specified log endpoint.
2708  *
2709  * XXX this is currently extremely conservative, since it forces only one
2710  * future log segment to exist, and even that only if we are 75% done with
2711  * the current one.  This is only appropriate for very low-WAL-volume systems.
2712  * High-volume systems will be OK once they've built up a sufficient set of
2713  * recycled log segments, but the startup transient is likely to include
2714  * a lot of segment creations by foreground processes, which is not so good.
2715  */
2716 static void
2717 PreallocXlogFiles(XLogRecPtr endptr)
2718 {
2719         uint32          _logId;
2720         uint32          _logSeg;
2721         int                     lf;
2722         bool            use_existent;
2723
2724         XLByteToPrevSeg(endptr, _logId, _logSeg);
2725         if ((endptr.xrecoff - 1) % XLogSegSize >=
2726                 (uint32) (0.75 * XLogSegSize))
2727         {
2728                 NextLogSeg(_logId, _logSeg);
2729                 use_existent = true;
2730                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
2731                 close(lf);
2732                 if (!use_existent)
2733                         CheckpointStats.ckpt_segs_added++;
2734         }
2735 }
2736
2737 /*
2738  * Recycle or remove all log files older or equal to passed log/seg#
2739  *
2740  * endptr is current (or recent) end of xlog; this is used to determine
2741  * whether we want to recycle rather than delete no-longer-wanted log files.
2742  */
2743 static void
2744 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
2745 {
2746         uint32          endlogId;
2747         uint32          endlogSeg;
2748         int                     max_advance;
2749         DIR                *xldir;
2750         struct dirent *xlde;
2751         char            lastoff[MAXFNAMELEN];
2752         char            path[MAXPGPATH];
2753
2754         /*
2755          * Initialize info about where to try to recycle to.  We allow recycling
2756          * segments up to XLOGfileslop segments beyond the current XLOG location.
2757          */
2758         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
2759         max_advance = XLOGfileslop;
2760
2761         xldir = AllocateDir(XLOGDIR);
2762         if (xldir == NULL)
2763                 ereport(ERROR,
2764                                 (errcode_for_file_access(),
2765                                  errmsg("could not open transaction log directory \"%s\": %m",
2766                                                 XLOGDIR)));
2767
2768         XLogFileName(lastoff, ThisTimeLineID, log, seg);
2769
2770         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2771         {
2772                 /*
2773                  * We ignore the timeline part of the XLOG segment identifiers in
2774                  * deciding whether a segment is still needed.  This ensures that we
2775                  * won't prematurely remove a segment from a parent timeline. We could
2776                  * probably be a little more proactive about removing segments of
2777                  * non-parent timelines, but that would be a whole lot more
2778                  * complicated.
2779                  *
2780                  * We use the alphanumeric sorting property of the filenames to decide
2781                  * which ones are earlier than the lastoff segment.
2782                  */
2783                 if (strlen(xlde->d_name) == 24 &&
2784                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2785                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2786                 {
2787                         if (XLogArchiveCheckDone(xlde->d_name))
2788                         {
2789                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2790
2791                                 /*
2792                                  * Before deleting the file, see if it can be recycled as a
2793                                  * future log segment.
2794                                  */
2795                                 if (InstallXLogFileSegment(&endlogId, &endlogSeg, path,
2796                                                                                    true, &max_advance,
2797                                                                                    true))
2798                                 {
2799                                         ereport(DEBUG2,
2800                                                         (errmsg("recycled transaction log file \"%s\"",
2801                                                                         xlde->d_name)));
2802                                         CheckpointStats.ckpt_segs_recycled++;
2803                                         /* Needn't recheck that slot on future iterations */
2804                                         if (max_advance > 0)
2805                                         {
2806                                                 NextLogSeg(endlogId, endlogSeg);
2807                                                 max_advance--;
2808                                         }
2809                                 }
2810                                 else
2811                                 {
2812                                         /* No need for any more future segments... */
2813                                         ereport(DEBUG2,
2814                                                         (errmsg("removing transaction log file \"%s\"",
2815                                                                         xlde->d_name)));
2816                                         unlink(path);
2817                                         CheckpointStats.ckpt_segs_removed++;
2818                                 }
2819
2820                                 XLogArchiveCleanup(xlde->d_name);
2821                         }
2822                 }
2823         }
2824
2825         FreeDir(xldir);
2826 }
2827
2828 /*
2829  * Verify whether pg_xlog and pg_xlog/archive_status exist.
2830  * If the latter does not exist, recreate it.
2831  *
2832  * It is not the goal of this function to verify the contents of these
2833  * directories, but to help in cases where someone has performed a cluster
2834  * copy for PITR purposes but omitted pg_xlog from the copy.
2835  *
2836  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
2837  * policy decision was made not to.  It is fairly common for pg_xlog to be
2838  * a symlink, and if that was the DBA's intent then automatically making a
2839  * plain directory would result in degraded performance with no notice.
2840  */
2841 static void
2842 ValidateXLOGDirectoryStructure(void)
2843 {
2844         char            path[MAXPGPATH];
2845         struct stat     stat_buf;
2846
2847         /* Check for pg_xlog; if it doesn't exist, error out */
2848         if (stat(XLOGDIR, &stat_buf) != 0 ||
2849                 !S_ISDIR(stat_buf.st_mode))
2850                 ereport(FATAL,
2851                                 (errmsg("required WAL directory \"%s\" does not exist",
2852                                                 XLOGDIR)));
2853
2854         /* Check for archive_status */
2855         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
2856         if (stat(path, &stat_buf) == 0)
2857         {
2858                 /* Check for weird cases where it exists but isn't a directory */
2859                 if (!S_ISDIR(stat_buf.st_mode))
2860                         ereport(FATAL,
2861                                         (errmsg("required WAL directory \"%s\" does not exist",
2862                                                         path)));
2863         }
2864         else
2865         {
2866                 ereport(LOG,
2867                                 (errmsg("creating missing WAL directory \"%s\"", path)));
2868                 if (mkdir(path, 0700) < 0)
2869                         ereport(FATAL,
2870                                         (errmsg("could not create missing directory \"%s\": %m",
2871                                                         path)));
2872         }
2873 }
2874
2875 /*
2876  * Remove previous backup history files.  This also retries creation of
2877  * .ready files for any backup history files for which XLogArchiveNotify
2878  * failed earlier.
2879  */
2880 static void
2881 CleanupBackupHistory(void)
2882 {
2883         DIR                *xldir;
2884         struct dirent *xlde;
2885         char            path[MAXPGPATH];
2886
2887         xldir = AllocateDir(XLOGDIR);
2888         if (xldir == NULL)
2889                 ereport(ERROR,
2890                                 (errcode_for_file_access(),
2891                                  errmsg("could not open transaction log directory \"%s\": %m",
2892                                                 XLOGDIR)));
2893
2894         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2895         {
2896                 if (strlen(xlde->d_name) > 24 &&
2897                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2898                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
2899                                    ".backup") == 0)
2900                 {
2901                         if (XLogArchiveCheckDone(xlde->d_name))
2902                         {
2903                                 ereport(DEBUG2,
2904                                 (errmsg("removing transaction log backup history file \"%s\"",
2905                                                 xlde->d_name)));
2906                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2907                                 unlink(path);
2908                                 XLogArchiveCleanup(xlde->d_name);
2909                         }
2910                 }
2911         }
2912
2913         FreeDir(xldir);
2914 }
2915
2916 /*
2917  * Restore the backup blocks present in an XLOG record, if any.
2918  *
2919  * We assume all of the record has been read into memory at *record.
2920  *
2921  * Note: when a backup block is available in XLOG, we restore it
2922  * unconditionally, even if the page in the database appears newer.
2923  * This is to protect ourselves against database pages that were partially
2924  * or incorrectly written during a crash.  We assume that the XLOG data
2925  * must be good because it has passed a CRC check, while the database
2926  * page might not be.  This will force us to replay all subsequent
2927  * modifications of the page that appear in XLOG, rather than possibly
2928  * ignoring them as already applied, but that's not a huge drawback.
2929  */
2930 static void
2931 RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
2932 {
2933         Buffer          buffer;
2934         Page            page;
2935         BkpBlock        bkpb;
2936         char       *blk;
2937         int                     i;
2938
2939         blk = (char *) XLogRecGetData(record) + record->xl_len;
2940         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
2941         {
2942                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
2943                         continue;
2944
2945                 memcpy(&bkpb, blk, sizeof(BkpBlock));
2946                 blk += sizeof(BkpBlock);
2947
2948                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
2949                                                                                 RBM_ZERO);
2950                 Assert(BufferIsValid(buffer));
2951                 page = (Page) BufferGetPage(buffer);
2952
2953                 if (bkpb.hole_length == 0)
2954                 {
2955                         memcpy((char *) page, blk, BLCKSZ);
2956                 }
2957                 else
2958                 {
2959                         /* must zero-fill the hole */
2960                         MemSet((char *) page, 0, BLCKSZ);
2961                         memcpy((char *) page, blk, bkpb.hole_offset);
2962                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
2963                                    blk + bkpb.hole_offset,
2964                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
2965                 }
2966
2967                 PageSetLSN(page, lsn);
2968                 PageSetTLI(page, ThisTimeLineID);
2969                 MarkBufferDirty(buffer);
2970                 UnlockReleaseBuffer(buffer);
2971
2972                 blk += BLCKSZ - bkpb.hole_length;
2973         }
2974 }
2975
2976 /*
2977  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
2978  * record (other than to the minimal extent of computing the amount of
2979  * data to read in) until we've checked the CRCs.
2980  *
2981  * We assume all of the record has been read into memory at *record.
2982  */
2983 static bool
2984 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
2985 {
2986         pg_crc32        crc;
2987         int                     i;
2988         uint32          len = record->xl_len;
2989         BkpBlock        bkpb;
2990         char       *blk;
2991
2992         /* First the rmgr data */
2993         INIT_CRC32(crc);
2994         COMP_CRC32(crc, XLogRecGetData(record), len);
2995
2996         /* Add in the backup blocks, if any */
2997         blk = (char *) XLogRecGetData(record) + len;
2998         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
2999         {
3000                 uint32          blen;
3001
3002                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3003                         continue;
3004
3005                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3006                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3007                 {
3008                         ereport(emode,
3009                                         (errmsg("incorrect hole size in record at %X/%X",
3010                                                         recptr.xlogid, recptr.xrecoff)));
3011                         return false;
3012                 }
3013                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3014                 COMP_CRC32(crc, blk, blen);
3015                 blk += blen;
3016         }
3017
3018         /* Check that xl_tot_len agrees with our calculation */
3019         if (blk != (char *) record + record->xl_tot_len)
3020         {
3021                 ereport(emode,
3022                                 (errmsg("incorrect total length in record at %X/%X",
3023                                                 recptr.xlogid, recptr.xrecoff)));
3024                 return false;
3025         }
3026
3027         /* Finally include the record header */
3028         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3029                            SizeOfXLogRecord - sizeof(pg_crc32));
3030         FIN_CRC32(crc);
3031
3032         if (!EQ_CRC32(record->xl_crc, crc))
3033         {
3034                 ereport(emode,
3035                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3036                                 recptr.xlogid, recptr.xrecoff)));
3037                 return false;
3038         }
3039
3040         return true;
3041 }
3042
3043 /*
3044  * Attempt to read an XLOG record.
3045  *
3046  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3047  * try to read a record just after the last one previously read.
3048  *
3049  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3050  * (emode must be either PANIC or LOG.)
3051  *
3052  * The record is copied into readRecordBuf, so that on successful return,
3053  * the returned record pointer always points there.
3054  */
3055 static XLogRecord *
3056 ReadRecord(XLogRecPtr *RecPtr, int emode)
3057 {
3058         XLogRecord *record;
3059         char       *buffer;
3060         XLogRecPtr      tmpRecPtr = EndRecPtr;
3061         bool            randAccess = false;
3062         uint32          len,
3063                                 total_len;
3064         uint32          targetPageOff;
3065         uint32          targetRecOff;
3066         uint32          pageHeaderSize;
3067
3068         if (readBuf == NULL)
3069         {
3070                 /*
3071                  * First time through, permanently allocate readBuf.  We do it this
3072                  * way, rather than just making a static array, for two reasons: (1)
3073                  * no need to waste the storage in most instantiations of the backend;
3074                  * (2) a static char array isn't guaranteed to have any particular
3075                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3076                  */
3077                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3078                 Assert(readBuf != NULL);
3079         }
3080
3081         if (RecPtr == NULL)
3082         {
3083                 RecPtr = &tmpRecPtr;
3084                 /* fast case if next record is on same page */
3085                 if (nextRecord != NULL)
3086                 {
3087                         record = nextRecord;
3088                         goto got_record;
3089                 }
3090                 /* align old recptr to next page */
3091                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
3092                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
3093                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3094                 {
3095                         (tmpRecPtr.xlogid)++;
3096                         tmpRecPtr.xrecoff = 0;
3097                 }
3098                 /* We will account for page header size below */
3099         }
3100         else
3101         {
3102                 if (!XRecOffIsValid(RecPtr->xrecoff))
3103                         ereport(PANIC,
3104                                         (errmsg("invalid record offset at %X/%X",
3105                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3106
3107                 /*
3108                  * Since we are going to a random position in WAL, forget any prior
3109                  * state about what timeline we were in, and allow it to be any
3110                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3111                  * to go backwards (but we can't reset that variable right here, since
3112                  * we might not change files at all).
3113                  */
3114                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3115                 randAccess = true;              /* allow curFileTLI to go backwards too */
3116         }
3117
3118         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
3119         {
3120                 close(readFile);
3121                 readFile = -1;
3122         }
3123         XLByteToSeg(*RecPtr, readId, readSeg);
3124         if (readFile < 0)
3125         {
3126                 /* Now it's okay to reset curFileTLI if random fetch */
3127                 if (randAccess)
3128                         curFileTLI = 0;
3129
3130                 readFile = XLogFileRead(readId, readSeg, emode);
3131                 if (readFile < 0)
3132                         goto next_record_is_invalid;
3133
3134                 /*
3135                  * Whenever switching to a new WAL segment, we read the first page of
3136                  * the file and validate its header, even if that's not where the
3137                  * target record is.  This is so that we can check the additional
3138                  * identification info that is present in the first page's "long"
3139                  * header.
3140                  */
3141                 readOff = 0;
3142                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3143                 {
3144                         ereport(emode,
3145                                         (errcode_for_file_access(),
3146                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3147                                                         readId, readSeg, readOff)));
3148                         goto next_record_is_invalid;
3149                 }
3150                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3151                         goto next_record_is_invalid;
3152         }
3153
3154         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3155         if (readOff != targetPageOff)
3156         {
3157                 readOff = targetPageOff;
3158                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
3159                 {
3160                         ereport(emode,
3161                                         (errcode_for_file_access(),
3162                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
3163                                                         readId, readSeg, readOff)));
3164                         goto next_record_is_invalid;
3165                 }
3166                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3167                 {
3168                         ereport(emode,
3169                                         (errcode_for_file_access(),
3170                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3171                                                         readId, readSeg, readOff)));
3172                         goto next_record_is_invalid;
3173                 }
3174                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3175                         goto next_record_is_invalid;
3176         }
3177         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3178         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3179         if (targetRecOff == 0)
3180         {
3181                 /*
3182                  * Can only get here in the continuing-from-prev-page case, because
3183                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3184                  * to skip over the new page's header.
3185                  */
3186                 tmpRecPtr.xrecoff += pageHeaderSize;
3187                 targetRecOff = pageHeaderSize;
3188         }
3189         else if (targetRecOff < pageHeaderSize)
3190         {
3191                 ereport(emode,
3192                                 (errmsg("invalid record offset at %X/%X",
3193                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3194                 goto next_record_is_invalid;
3195         }
3196         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3197                 targetRecOff == pageHeaderSize)
3198         {
3199                 ereport(emode,
3200                                 (errmsg("contrecord is requested by %X/%X",
3201                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3202                 goto next_record_is_invalid;
3203         }
3204         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3205
3206 got_record:;
3207
3208         /*
3209          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3210          * required.
3211          */
3212         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3213         {
3214                 if (record->xl_len != 0)
3215                 {
3216                         ereport(emode,
3217                                         (errmsg("invalid xlog switch record at %X/%X",
3218                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3219                         goto next_record_is_invalid;
3220                 }
3221         }
3222         else if (record->xl_len == 0)
3223         {
3224                 ereport(emode,
3225                                 (errmsg("record with zero length at %X/%X",
3226                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3227                 goto next_record_is_invalid;
3228         }
3229         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3230                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3231                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3232         {
3233                 ereport(emode,
3234                                 (errmsg("invalid record length at %X/%X",
3235                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3236                 goto next_record_is_invalid;
3237         }
3238         if (record->xl_rmid > RM_MAX_ID)
3239         {
3240                 ereport(emode,
3241                                 (errmsg("invalid resource manager ID %u at %X/%X",
3242                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3243                 goto next_record_is_invalid;
3244         }
3245         if (randAccess)
3246         {
3247                 /*
3248                  * We can't exactly verify the prev-link, but surely it should be less
3249                  * than the record's own address.
3250                  */
3251                 if (!XLByteLT(record->xl_prev, *RecPtr))
3252                 {
3253                         ereport(emode,
3254                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3255                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3256                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3257                         goto next_record_is_invalid;
3258                 }
3259         }
3260         else
3261         {
3262                 /*
3263                  * Record's prev-link should exactly match our previous location. This
3264                  * check guards against torn WAL pages where a stale but valid-looking
3265                  * WAL record starts on a sector boundary.
3266                  */
3267                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3268                 {
3269                         ereport(emode,
3270                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3271                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3272                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3273                         goto next_record_is_invalid;
3274                 }
3275         }
3276
3277         /*
3278          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3279          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3280          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3281          * enough for all "normal" records, but very large commit or abort records
3282          * might need more space.)
3283          */
3284         total_len = record->xl_tot_len;
3285         if (total_len > readRecordBufSize)
3286         {
3287                 uint32          newSize = total_len;
3288
3289                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3290                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3291                 if (readRecordBuf)
3292                         free(readRecordBuf);
3293                 readRecordBuf = (char *) malloc(newSize);
3294                 if (!readRecordBuf)
3295                 {
3296                         readRecordBufSize = 0;
3297                         /* We treat this as a "bogus data" condition */
3298                         ereport(emode,
3299                                         (errmsg("record length %u at %X/%X too long",
3300                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3301                         goto next_record_is_invalid;
3302                 }
3303                 readRecordBufSize = newSize;
3304         }
3305
3306         buffer = readRecordBuf;
3307         nextRecord = NULL;
3308         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3309         if (total_len > len)
3310         {
3311                 /* Need to reassemble record */
3312                 XLogContRecord *contrecord;
3313                 uint32          gotlen = len;
3314
3315                 memcpy(buffer, record, len);
3316                 record = (XLogRecord *) buffer;
3317                 buffer += len;
3318                 for (;;)
3319                 {
3320                         readOff += XLOG_BLCKSZ;
3321                         if (readOff >= XLogSegSize)
3322                         {
3323                                 close(readFile);
3324                                 readFile = -1;
3325                                 NextLogSeg(readId, readSeg);
3326                                 readFile = XLogFileRead(readId, readSeg, emode);
3327                                 if (readFile < 0)
3328                                         goto next_record_is_invalid;
3329                                 readOff = 0;
3330                         }
3331                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3332                         {
3333                                 ereport(emode,
3334                                                 (errcode_for_file_access(),
3335                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
3336                                                                 readId, readSeg, readOff)));
3337                                 goto next_record_is_invalid;
3338                         }
3339                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3340                                 goto next_record_is_invalid;
3341                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3342                         {
3343                                 ereport(emode,
3344                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3345                                                                 readId, readSeg, readOff)));
3346                                 goto next_record_is_invalid;
3347                         }
3348                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3349                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3350                         if (contrecord->xl_rem_len == 0 ||
3351                                 total_len != (contrecord->xl_rem_len + gotlen))
3352                         {
3353                                 ereport(emode,
3354                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3355                                                                 contrecord->xl_rem_len,
3356                                                                 readId, readSeg, readOff)));
3357                                 goto next_record_is_invalid;
3358                         }
3359                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3360                         if (contrecord->xl_rem_len > len)
3361                         {
3362                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3363                                 gotlen += len;
3364                                 buffer += len;
3365                                 continue;
3366                         }
3367                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3368                                    contrecord->xl_rem_len);
3369                         break;
3370                 }
3371                 if (!RecordIsValid(record, *RecPtr, emode))
3372                         goto next_record_is_invalid;
3373                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3374                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
3375                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
3376                 {
3377                         nextRecord = (XLogRecord *) ((char *) contrecord +
3378                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
3379                 }
3380                 EndRecPtr.xlogid = readId;
3381                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3382                         pageHeaderSize +
3383                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3384                 ReadRecPtr = *RecPtr;
3385                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3386                 return record;
3387         }
3388
3389         /* Record does not cross a page boundary */
3390         if (!RecordIsValid(record, *RecPtr, emode))
3391                 goto next_record_is_invalid;
3392         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
3393                 MAXALIGN(total_len))
3394                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
3395         EndRecPtr.xlogid = RecPtr->xlogid;
3396         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3397         ReadRecPtr = *RecPtr;
3398         memcpy(buffer, record, total_len);
3399
3400         /*
3401          * Special processing if it's an XLOG SWITCH record
3402          */
3403         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3404         {
3405                 /* Pretend it extends to end of segment */
3406                 EndRecPtr.xrecoff += XLogSegSize - 1;
3407                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3408                 nextRecord = NULL;              /* definitely not on same page */
3409
3410                 /*
3411                  * Pretend that readBuf contains the last page of the segment. This is
3412                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3413                  * segment.
3414                  */
3415                 readOff = XLogSegSize - XLOG_BLCKSZ;
3416         }
3417         return (XLogRecord *) buffer;
3418
3419 next_record_is_invalid:;
3420         if (readFile >= 0)
3421         {
3422                 close(readFile);
3423                 readFile = -1;
3424         }
3425         nextRecord = NULL;
3426         return NULL;
3427 }
3428
3429 /*
3430  * Check whether the xlog header of a page just read in looks valid.
3431  *
3432  * This is just a convenience subroutine to avoid duplicated code in
3433  * ReadRecord.  It's not intended for use from anywhere else.
3434  */
3435 static bool
3436 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3437 {
3438         XLogRecPtr      recaddr;
3439
3440         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3441         {
3442                 ereport(emode,
3443                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3444                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3445                 return false;
3446         }
3447         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3448         {
3449                 ereport(emode,
3450                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3451                                                 hdr->xlp_info, readId, readSeg, readOff)));
3452                 return false;
3453         }
3454         if (hdr->xlp_info & XLP_LONG_HEADER)
3455         {
3456                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3457
3458                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3459                 {
3460                         char            fhdrident_str[32];
3461                         char            sysident_str[32];
3462
3463                         /*
3464                          * Format sysids separately to keep platform-dependent format code
3465                          * out of the translatable message string.
3466                          */
3467                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3468                                          longhdr->xlp_sysid);
3469                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3470                                          ControlFile->system_identifier);
3471                         ereport(emode,
3472                                         (errmsg("WAL file is from different system"),
3473                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
3474                                                            fhdrident_str, sysident_str)));
3475                         return false;
3476                 }
3477                 if (longhdr->xlp_seg_size != XLogSegSize)
3478                 {
3479                         ereport(emode,
3480                                         (errmsg("WAL file is from different system"),
3481                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3482                         return false;
3483                 }
3484                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3485                 {
3486                         ereport(emode,
3487                                         (errmsg("WAL file is from different system"),
3488                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3489                         return false;
3490                 }
3491         }
3492         else if (readOff == 0)
3493         {
3494                 /* hmm, first page of file doesn't have a long header? */
3495                 ereport(emode,
3496                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3497                                                 hdr->xlp_info, readId, readSeg, readOff)));
3498                 return false;
3499         }
3500
3501         recaddr.xlogid = readId;
3502         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3503         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3504         {
3505                 ereport(emode,
3506                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
3507                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3508                                                 readId, readSeg, readOff)));
3509                 return false;
3510         }
3511
3512         /*
3513          * Check page TLI is one of the expected values.
3514          */
3515         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
3516         {
3517                 ereport(emode,
3518                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
3519                                                 hdr->xlp_tli,
3520                                                 readId, readSeg, readOff)));
3521                 return false;
3522         }
3523
3524         /*
3525          * Since child timelines are always assigned a TLI greater than their
3526          * immediate parent's TLI, we should never see TLI go backwards across
3527          * successive pages of a consistent WAL sequence.
3528          *
3529          * Of course this check should only be applied when advancing sequentially
3530          * across pages; therefore ReadRecord resets lastPageTLI to zero when
3531          * going to a random page.
3532          */
3533         if (hdr->xlp_tli < lastPageTLI)
3534         {
3535                 ereport(emode,
3536                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
3537                                                 hdr->xlp_tli, lastPageTLI,
3538                                                 readId, readSeg, readOff)));
3539                 return false;
3540         }
3541         lastPageTLI = hdr->xlp_tli;
3542         return true;
3543 }
3544
3545 /*
3546  * Try to read a timeline's history file.
3547  *
3548  * If successful, return the list of component TLIs (the given TLI followed by
3549  * its ancestor TLIs).  If we can't find the history file, assume that the
3550  * timeline has no parents, and return a list of just the specified timeline
3551  * ID.
3552  */
3553 static List *
3554 readTimeLineHistory(TimeLineID targetTLI)
3555 {
3556         List       *result;
3557         char            path[MAXPGPATH];
3558         char            histfname[MAXFNAMELEN];
3559         char            fline[MAXPGPATH];
3560         FILE       *fd;
3561
3562         if (InArchiveRecovery)
3563         {
3564                 TLHistoryFileName(histfname, targetTLI);
3565                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3566         }
3567         else
3568                 TLHistoryFilePath(path, targetTLI);
3569
3570         fd = AllocateFile(path, "r");
3571         if (fd == NULL)
3572         {
3573                 if (errno != ENOENT)
3574                         ereport(FATAL,
3575                                         (errcode_for_file_access(),
3576                                          errmsg("could not open file \"%s\": %m", path)));
3577                 /* Not there, so assume no parents */
3578                 return list_make1_int((int) targetTLI);
3579         }
3580
3581         result = NIL;
3582
3583         /*
3584          * Parse the file...
3585          */
3586         while (fgets(fline, sizeof(fline), fd) != NULL)
3587         {
3588                 /* skip leading whitespace and check for # comment */
3589                 char       *ptr;
3590                 char       *endptr;
3591                 TimeLineID      tli;
3592
3593                 for (ptr = fline; *ptr; ptr++)
3594                 {
3595                         if (!isspace((unsigned char) *ptr))
3596                                 break;
3597                 }
3598                 if (*ptr == '\0' || *ptr == '#')
3599                         continue;
3600
3601                 /* expect a numeric timeline ID as first field of line */
3602                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
3603                 if (endptr == ptr)
3604                         ereport(FATAL,
3605                                         (errmsg("syntax error in history file: %s", fline),
3606                                          errhint("Expected a numeric timeline ID.")));
3607
3608                 if (result &&
3609                         tli <= (TimeLineID) linitial_int(result))
3610                         ereport(FATAL,
3611                                         (errmsg("invalid data in history file: %s", fline),
3612                                    errhint("Timeline IDs must be in increasing sequence.")));
3613
3614                 /* Build list with newest item first */
3615                 result = lcons_int((int) tli, result);
3616
3617                 /* we ignore the remainder of each line */
3618         }
3619
3620         FreeFile(fd);
3621
3622         if (result &&
3623                 targetTLI <= (TimeLineID) linitial_int(result))
3624                 ereport(FATAL,
3625                                 (errmsg("invalid data in history file \"%s\"", path),
3626                         errhint("Timeline IDs must be less than child timeline's ID.")));
3627
3628         result = lcons_int((int) targetTLI, result);
3629
3630         ereport(DEBUG3,
3631                         (errmsg_internal("history of timeline %u is %s",
3632                                                          targetTLI, nodeToString(result))));
3633
3634         return result;
3635 }
3636
3637 /*
3638  * Probe whether a timeline history file exists for the given timeline ID
3639  */
3640 static bool
3641 existsTimeLineHistory(TimeLineID probeTLI)
3642 {
3643         char            path[MAXPGPATH];
3644         char            histfname[MAXFNAMELEN];
3645         FILE       *fd;
3646
3647         if (InArchiveRecovery)
3648         {
3649                 TLHistoryFileName(histfname, probeTLI);
3650                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3651         }
3652         else
3653                 TLHistoryFilePath(path, probeTLI);
3654
3655         fd = AllocateFile(path, "r");
3656         if (fd != NULL)
3657         {
3658                 FreeFile(fd);
3659                 return true;
3660         }
3661         else
3662         {
3663                 if (errno != ENOENT)
3664                         ereport(FATAL,
3665                                         (errcode_for_file_access(),
3666                                          errmsg("could not open file \"%s\": %m", path)));
3667                 return false;
3668         }
3669 }
3670
3671 /*
3672  * Find the newest existing timeline, assuming that startTLI exists.
3673  *
3674  * Note: while this is somewhat heuristic, it does positively guarantee
3675  * that (result + 1) is not a known timeline, and therefore it should
3676  * be safe to assign that ID to a new timeline.
3677  */
3678 static TimeLineID
3679 findNewestTimeLine(TimeLineID startTLI)
3680 {
3681         TimeLineID      newestTLI;
3682         TimeLineID      probeTLI;
3683
3684         /*
3685          * The algorithm is just to probe for the existence of timeline history
3686          * files.  XXX is it useful to allow gaps in the sequence?
3687          */
3688         newestTLI = startTLI;
3689
3690         for (probeTLI = startTLI + 1;; probeTLI++)
3691         {
3692                 if (existsTimeLineHistory(probeTLI))
3693                 {
3694                         newestTLI = probeTLI;           /* probeTLI exists */
3695                 }
3696                 else
3697                 {
3698                         /* doesn't exist, assume we're done */
3699                         break;
3700                 }
3701         }
3702
3703         return newestTLI;
3704 }
3705
3706 /*
3707  * Create a new timeline history file.
3708  *
3709  *      newTLI: ID of the new timeline
3710  *      parentTLI: ID of its immediate parent
3711  *      endTLI et al: ID of the last used WAL file, for annotation purposes
3712  *
3713  * Currently this is only used during recovery, and so there are no locking
3714  * considerations.      But we should be just as tense as XLogFileInit to avoid
3715  * emplacing a bogus file.
3716  */
3717 static void
3718 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
3719                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
3720 {
3721         char            path[MAXPGPATH];
3722         char            tmppath[MAXPGPATH];
3723         char            histfname[MAXFNAMELEN];
3724         char            xlogfname[MAXFNAMELEN];
3725         char            buffer[BLCKSZ];
3726         int                     srcfd;
3727         int                     fd;
3728         int                     nbytes;
3729
3730         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
3731
3732         /*
3733          * Write into a temp file name.
3734          */
3735         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3736
3737         unlink(tmppath);
3738
3739         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3740         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
3741                                            S_IRUSR | S_IWUSR);
3742         if (fd < 0)
3743                 ereport(ERROR,
3744                                 (errcode_for_file_access(),
3745                                  errmsg("could not create file \"%s\": %m", tmppath)));
3746
3747         /*
3748          * If a history file exists for the parent, copy it verbatim
3749          */
3750         if (InArchiveRecovery)
3751         {
3752                 TLHistoryFileName(histfname, parentTLI);
3753                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3754         }
3755         else
3756                 TLHistoryFilePath(path, parentTLI);
3757
3758         srcfd = BasicOpenFile(path, O_RDONLY, 0);
3759         if (srcfd < 0)
3760         {
3761                 if (errno != ENOENT)
3762                         ereport(ERROR,
3763                                         (errcode_for_file_access(),
3764                                          errmsg("could not open file \"%s\": %m", path)));
3765                 /* Not there, so assume parent has no parents */
3766         }
3767         else
3768         {
3769                 for (;;)
3770                 {
3771                         errno = 0;
3772                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
3773                         if (nbytes < 0 || errno != 0)
3774                                 ereport(ERROR,
3775                                                 (errcode_for_file_access(),
3776                                                  errmsg("could not read file \"%s\": %m", path)));
3777                         if (nbytes == 0)
3778                                 break;
3779                         errno = 0;
3780                         if ((int) write(fd, buffer, nbytes) != nbytes)
3781                         {
3782                                 int                     save_errno = errno;
3783
3784                                 /*
3785                                  * If we fail to make the file, delete it to release disk
3786                                  * space
3787                                  */
3788                                 unlink(tmppath);
3789
3790                                 /*
3791                                  * if write didn't set errno, assume problem is no disk space
3792                                  */
3793                                 errno = save_errno ? save_errno : ENOSPC;
3794
3795                                 ereport(ERROR,
3796                                                 (errcode_for_file_access(),
3797                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3798                         }
3799                 }
3800                 close(srcfd);
3801         }
3802
3803         /*
3804          * Append one line with the details of this timeline split.
3805          *
3806          * If we did have a parent file, insert an extra newline just in case the
3807          * parent file failed to end with one.
3808          */
3809         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
3810
3811         snprintf(buffer, sizeof(buffer),
3812                          "%s%u\t%s\t%s transaction %u at %s\n",
3813                          (srcfd < 0) ? "" : "\n",
3814                          parentTLI,
3815                          xlogfname,
3816                          recoveryStopAfter ? "after" : "before",
3817                          recoveryStopXid,
3818                          timestamptz_to_str(recoveryStopTime));
3819
3820         nbytes = strlen(buffer);
3821         errno = 0;
3822         if ((int) write(fd, buffer, nbytes) != nbytes)
3823         {
3824                 int                     save_errno = errno;
3825
3826                 /*
3827                  * If we fail to make the file, delete it to release disk space
3828                  */
3829                 unlink(tmppath);
3830                 /* if write didn't set errno, assume problem is no disk space */
3831                 errno = save_errno ? save_errno : ENOSPC;
3832
3833                 ereport(ERROR,
3834                                 (errcode_for_file_access(),
3835                                  errmsg("could not write to file \"%s\": %m", tmppath)));
3836         }
3837
3838         if (pg_fsync(fd) != 0)
3839                 ereport(ERROR,
3840                                 (errcode_for_file_access(),
3841                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3842
3843         if (close(fd))
3844                 ereport(ERROR,
3845                                 (errcode_for_file_access(),
3846                                  errmsg("could not close file \"%s\": %m", tmppath)));
3847
3848
3849         /*
3850          * Now move the completed history file into place with its final name.
3851          */
3852         TLHistoryFilePath(path, newTLI);
3853
3854         /*
3855          * Prefer link() to rename() here just to be really sure that we don't
3856          * overwrite an existing logfile.  However, there shouldn't be one, so
3857          * rename() is an acceptable substitute except for the truly paranoid.
3858          */
3859 #if HAVE_WORKING_LINK
3860         if (link(tmppath, path) < 0)
3861                 ereport(ERROR,
3862                                 (errcode_for_file_access(),
3863                                  errmsg("could not link file \"%s\" to \"%s\": %m",
3864                                                 tmppath, path)));
3865         unlink(tmppath);
3866 #else
3867         if (rename(tmppath, path) < 0)
3868                 ereport(ERROR,
3869                                 (errcode_for_file_access(),
3870                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
3871                                                 tmppath, path)));
3872 #endif
3873
3874         /* The history file can be archived immediately. */
3875         TLHistoryFileName(histfname, newTLI);
3876         XLogArchiveNotify(histfname);
3877 }
3878
3879 /*
3880  * I/O routines for pg_control
3881  *
3882  * *ControlFile is a buffer in shared memory that holds an image of the
3883  * contents of pg_control.      WriteControlFile() initializes pg_control
3884  * given a preloaded buffer, ReadControlFile() loads the buffer from
3885  * the pg_control file (during postmaster or standalone-backend startup),
3886  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3887  *
3888  * For simplicity, WriteControlFile() initializes the fields of pg_control
3889  * that are related to checking backend/database compatibility, and
3890  * ReadControlFile() verifies they are correct.  We could split out the
3891  * I/O and compatibility-check functions, but there seems no need currently.
3892  */
3893 static void
3894 WriteControlFile(void)
3895 {
3896         int                     fd;
3897         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3898
3899         /*
3900          * Initialize version and compatibility-check fields
3901          */
3902         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3903         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3904
3905         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3906         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3907
3908         ControlFile->blcksz = BLCKSZ;
3909         ControlFile->relseg_size = RELSEG_SIZE;
3910         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3911         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3912
3913         ControlFile->nameDataLen = NAMEDATALEN;
3914         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3915
3916         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3917
3918 #ifdef HAVE_INT64_TIMESTAMP
3919         ControlFile->enableIntTimes = true;
3920 #else
3921         ControlFile->enableIntTimes = false;
3922 #endif
3923         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3924         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3925
3926         /* Contents are protected with a CRC */
3927         INIT_CRC32(ControlFile->crc);
3928         COMP_CRC32(ControlFile->crc,
3929                            (char *) ControlFile,
3930                            offsetof(ControlFileData, crc));
3931         FIN_CRC32(ControlFile->crc);
3932
3933         /*
3934          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3935          * excess over sizeof(ControlFileData).  This reduces the odds of
3936          * premature-EOF errors when reading pg_control.  We'll still fail when we
3937          * check the contents of the file, but hopefully with a more specific
3938          * error than "couldn't read pg_control".
3939          */
3940         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3941                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3942
3943         memset(buffer, 0, PG_CONTROL_SIZE);
3944         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3945
3946         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3947                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3948                                            S_IRUSR | S_IWUSR);
3949         if (fd < 0)
3950                 ereport(PANIC,
3951                                 (errcode_for_file_access(),
3952                                  errmsg("could not create control file \"%s\": %m",
3953                                                 XLOG_CONTROL_FILE)));
3954
3955         errno = 0;
3956         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3957         {
3958                 /* if write didn't set errno, assume problem is no disk space */
3959                 if (errno == 0)
3960                         errno = ENOSPC;
3961                 ereport(PANIC,
3962                                 (errcode_for_file_access(),
3963                                  errmsg("could not write to control file: %m")));
3964         }
3965
3966         if (pg_fsync(fd) != 0)
3967                 ereport(PANIC,
3968                                 (errcode_for_file_access(),
3969                                  errmsg("could not fsync control file: %m")));
3970
3971         if (close(fd))
3972                 ereport(PANIC,
3973                                 (errcode_for_file_access(),
3974                                  errmsg("could not close control file: %m")));
3975 }
3976
3977 static void
3978 ReadControlFile(void)
3979 {
3980         pg_crc32        crc;
3981         int                     fd;
3982
3983         /*
3984          * Read data...
3985          */
3986         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3987                                            O_RDWR | PG_BINARY,
3988                                            S_IRUSR | S_IWUSR);
3989         if (fd < 0)
3990                 ereport(PANIC,
3991                                 (errcode_for_file_access(),
3992                                  errmsg("could not open control file \"%s\": %m",
3993                                                 XLOG_CONTROL_FILE)));
3994
3995         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3996                 ereport(PANIC,
3997                                 (errcode_for_file_access(),
3998                                  errmsg("could not read from control file: %m")));
3999
4000         close(fd);
4001
4002         /*
4003          * Check for expected pg_control format version.  If this is wrong, the
4004          * CRC check will likely fail because we'll be checking the wrong number
4005          * of bytes.  Complaining about wrong version will probably be more
4006          * enlightening than complaining about wrong CRC.
4007          */
4008
4009         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4010                 ereport(FATAL,
4011                                 (errmsg("database files are incompatible with server"),
4012                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4013                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4014                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
4015                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4016                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4017
4018         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4019                 ereport(FATAL,
4020                                 (errmsg("database files are incompatible with server"),
4021                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4022                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4023                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4024                                  errhint("It looks like you need to initdb.")));
4025
4026         /* Now check the CRC. */
4027         INIT_CRC32(crc);
4028         COMP_CRC32(crc,
4029                            (char *) ControlFile,
4030                            offsetof(ControlFileData, crc));
4031         FIN_CRC32(crc);
4032
4033         if (!EQ_CRC32(crc, ControlFile->crc))
4034                 ereport(FATAL,
4035                                 (errmsg("incorrect checksum in control file")));
4036
4037         /*
4038          * Do compatibility checking immediately.  We do this here for 2 reasons:
4039          *
4040          * (1) if the database isn't compatible with the backend executable, we
4041          * want to abort before we can possibly do any damage;
4042          *
4043          * (2) this code is executed in the postmaster, so the setlocale() will
4044          * propagate to forked backends, which aren't going to read this file for
4045          * themselves.  (These locale settings are considered critical
4046          * compatibility items because they can affect sort order of indexes.)
4047          */
4048         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4049                 ereport(FATAL,
4050                                 (errmsg("database files are incompatible with server"),
4051                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4052                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4053                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4054                                  errhint("It looks like you need to initdb.")));
4055         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4056                 ereport(FATAL,
4057                                 (errmsg("database files are incompatible with server"),
4058                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4059                                          " but the server was compiled with MAXALIGN %d.",
4060                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4061                                  errhint("It looks like you need to initdb.")));
4062         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4063                 ereport(FATAL,
4064                                 (errmsg("database files are incompatible with server"),
4065                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4066                                  errhint("It looks like you need to initdb.")));
4067         if (ControlFile->blcksz != BLCKSZ)
4068                 ereport(FATAL,
4069                                 (errmsg("database files are incompatible with server"),
4070                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4071                                            " but the server was compiled with BLCKSZ %d.",
4072                                            ControlFile->blcksz, BLCKSZ),
4073                                  errhint("It looks like you need to recompile or initdb.")));
4074         if (ControlFile->relseg_size != RELSEG_SIZE)
4075                 ereport(FATAL,
4076                                 (errmsg("database files are incompatible with server"),
4077                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4078                                   " but the server was compiled with RELSEG_SIZE %d.",
4079                                   ControlFile->relseg_size, RELSEG_SIZE),
4080                                  errhint("It looks like you need to recompile or initdb.")));
4081         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4082                 ereport(FATAL,
4083                                 (errmsg("database files are incompatible with server"),
4084                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4085                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4086                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4087                                  errhint("It looks like you need to recompile or initdb.")));
4088         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4089                 ereport(FATAL,
4090                                 (errmsg("database files are incompatible with server"),
4091                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4092                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4093                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4094                                  errhint("It looks like you need to recompile or initdb.")));
4095         if (ControlFile->nameDataLen != NAMEDATALEN)
4096                 ereport(FATAL,
4097                                 (errmsg("database files are incompatible with server"),
4098                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4099                                   " but the server was compiled with NAMEDATALEN %d.",
4100                                   ControlFile->nameDataLen, NAMEDATALEN),
4101                                  errhint("It looks like you need to recompile or initdb.")));
4102         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4103                 ereport(FATAL,
4104                                 (errmsg("database files are incompatible with server"),
4105                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4106                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4107                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4108                                  errhint("It looks like you need to recompile or initdb.")));
4109         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4110                 ereport(FATAL,
4111                                 (errmsg("database files are incompatible with server"),
4112                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4113                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4114                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4115                                  errhint("It looks like you need to recompile or initdb.")));
4116
4117 #ifdef HAVE_INT64_TIMESTAMP
4118         if (ControlFile->enableIntTimes != true)
4119                 ereport(FATAL,
4120                                 (errmsg("database files are incompatible with server"),
4121                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4122                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4123                                  errhint("It looks like you need to recompile or initdb.")));
4124 #else
4125         if (ControlFile->enableIntTimes != false)
4126                 ereport(FATAL,
4127                                 (errmsg("database files are incompatible with server"),
4128                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4129                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4130                                  errhint("It looks like you need to recompile or initdb.")));
4131 #endif
4132
4133 #ifdef USE_FLOAT4_BYVAL
4134         if (ControlFile->float4ByVal != true)
4135                 ereport(FATAL,
4136                                 (errmsg("database files are incompatible with server"),
4137                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4138                                                    " but the server was compiled with USE_FLOAT4_BYVAL."),
4139                                  errhint("It looks like you need to recompile or initdb.")));
4140 #else
4141         if (ControlFile->float4ByVal != false)
4142                 ereport(FATAL,
4143                                 (errmsg("database files are incompatible with server"),
4144                                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4145                                                    " but the server was compiled without USE_FLOAT4_BYVAL."),
4146                                  errhint("It looks like you need to recompile or initdb.")));
4147 #endif
4148
4149 #ifdef USE_FLOAT8_BYVAL
4150         if (ControlFile->float8ByVal != true)
4151                 ereport(FATAL,
4152                                 (errmsg("database files are incompatible with server"),
4153                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4154                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4155                                  errhint("It looks like you need to recompile or initdb.")));
4156 #else
4157         if (ControlFile->float8ByVal != false)
4158                 ereport(FATAL,
4159                                 (errmsg("database files are incompatible with server"),
4160                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4161                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4162                                  errhint("It looks like you need to recompile or initdb.")));
4163 #endif
4164 }
4165
4166 void
4167 UpdateControlFile(void)
4168 {
4169         int                     fd;
4170
4171         INIT_CRC32(ControlFile->crc);
4172         COMP_CRC32(ControlFile->crc,
4173                            (char *) ControlFile,
4174                            offsetof(ControlFileData, crc));
4175         FIN_CRC32(ControlFile->crc);
4176
4177         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4178                                            O_RDWR | PG_BINARY,
4179                                            S_IRUSR | S_IWUSR);
4180         if (fd < 0)
4181                 ereport(PANIC,
4182                                 (errcode_for_file_access(),
4183                                  errmsg("could not open control file \"%s\": %m",
4184                                                 XLOG_CONTROL_FILE)));
4185
4186         errno = 0;
4187         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4188         {
4189                 /* if write didn't set errno, assume problem is no disk space */
4190                 if (errno == 0)
4191                         errno = ENOSPC;
4192                 ereport(PANIC,
4193                                 (errcode_for_file_access(),
4194                                  errmsg("could not write to control file: %m")));
4195         }
4196
4197         if (pg_fsync(fd) != 0)
4198                 ereport(PANIC,
4199                                 (errcode_for_file_access(),
4200                                  errmsg("could not fsync control file: %m")));
4201
4202         if (close(fd))
4203                 ereport(PANIC,
4204                                 (errcode_for_file_access(),
4205                                  errmsg("could not close control file: %m")));
4206 }
4207
4208 /*
4209  * Initialization of shared memory for XLOG
4210  */
4211 Size
4212 XLOGShmemSize(void)
4213 {
4214         Size            size;
4215
4216         /* XLogCtl */
4217         size = sizeof(XLogCtlData);
4218         /* xlblocks array */
4219         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4220         /* extra alignment padding for XLOG I/O buffers */
4221         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4222         /* and the buffers themselves */
4223         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4224
4225         /*
4226          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4227          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4228          * routine again below to compute the actual allocation size.
4229          */
4230
4231         return size;
4232 }
4233
4234 void
4235 XLOGShmemInit(void)
4236 {
4237         bool            foundCFile,
4238                                 foundXLog;
4239         char       *allocptr;
4240
4241         ControlFile = (ControlFileData *)
4242                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4243         XLogCtl = (XLogCtlData *)
4244                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4245
4246         if (foundCFile || foundXLog)
4247         {
4248                 /* both should be present or neither */
4249                 Assert(foundCFile && foundXLog);
4250                 return;
4251         }
4252
4253         memset(XLogCtl, 0, sizeof(XLogCtlData));
4254
4255         /*
4256          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4257          * multiple of the alignment for same, so no extra alignment padding is
4258          * needed here.
4259          */
4260         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4261         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4262         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4263         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4264
4265         /*
4266          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4267          */
4268         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4269         XLogCtl->pages = allocptr;
4270         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4271
4272         /*
4273          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4274          * in additional info.)
4275          */
4276         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4277         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4278         SpinLockInit(&XLogCtl->info_lck);
4279
4280         /*
4281          * If we are not in bootstrap mode, pg_control should already exist. Read
4282          * and validate it immediately (see comments in ReadControlFile() for the
4283          * reasons why).
4284          */
4285         if (!IsBootstrapProcessingMode())
4286                 ReadControlFile();
4287 }
4288
4289 /*
4290  * This func must be called ONCE on system install.  It creates pg_control
4291  * and the initial XLOG segment.
4292  */
4293 void
4294 BootStrapXLOG(void)
4295 {
4296         CheckPoint      checkPoint;
4297         char       *buffer;
4298         XLogPageHeader page;
4299         XLogLongPageHeader longpage;
4300         XLogRecord *record;
4301         bool            use_existent;
4302         uint64          sysidentifier;
4303         struct timeval tv;
4304         pg_crc32        crc;
4305
4306         /*
4307          * Select a hopefully-unique system identifier code for this installation.
4308          * We use the result of gettimeofday(), including the fractional seconds
4309          * field, as being about as unique as we can easily get.  (Think not to
4310          * use random(), since it hasn't been seeded and there's no portable way
4311          * to seed it other than the system clock value...)  The upper half of the
4312          * uint64 value is just the tv_sec part, while the lower half is the XOR
4313          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4314          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4315          * knowing this encoding can determine the initialization time of the
4316          * installation, which could perhaps be useful sometimes.
4317          */
4318         gettimeofday(&tv, NULL);
4319         sysidentifier = ((uint64) tv.tv_sec) << 32;
4320         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4321
4322         /* First timeline ID is always 1 */
4323         ThisTimeLineID = 1;
4324
4325         /* page buffer must be aligned suitably for O_DIRECT */
4326         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4327         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4328         memset(page, 0, XLOG_BLCKSZ);
4329
4330         /* Set up information for the initial checkpoint record */
4331         checkPoint.redo.xlogid = 0;
4332         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
4333         checkPoint.ThisTimeLineID = ThisTimeLineID;
4334         checkPoint.nextXidEpoch = 0;
4335         checkPoint.nextXid = FirstNormalTransactionId;
4336         checkPoint.nextOid = FirstBootstrapObjectId;
4337         checkPoint.nextMulti = FirstMultiXactId;
4338         checkPoint.nextMultiOffset = 0;
4339         checkPoint.time = (pg_time_t) time(NULL);
4340
4341         ShmemVariableCache->nextXid = checkPoint.nextXid;
4342         ShmemVariableCache->nextOid = checkPoint.nextOid;
4343         ShmemVariableCache->oidCount = 0;
4344         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4345
4346         /* Set up the XLOG page header */
4347         page->xlp_magic = XLOG_PAGE_MAGIC;
4348         page->xlp_info = XLP_LONG_HEADER;
4349         page->xlp_tli = ThisTimeLineID;
4350         page->xlp_pageaddr.xlogid = 0;
4351         page->xlp_pageaddr.xrecoff = 0;
4352         longpage = (XLogLongPageHeader) page;
4353         longpage->xlp_sysid = sysidentifier;
4354         longpage->xlp_seg_size = XLogSegSize;
4355         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4356
4357         /* Insert the initial checkpoint record */
4358         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4359         record->xl_prev.xlogid = 0;
4360         record->xl_prev.xrecoff = 0;
4361         record->xl_xid = InvalidTransactionId;
4362         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4363         record->xl_len = sizeof(checkPoint);
4364         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4365         record->xl_rmid = RM_XLOG_ID;
4366         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4367
4368         INIT_CRC32(crc);
4369         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4370         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4371                            SizeOfXLogRecord - sizeof(pg_crc32));
4372         FIN_CRC32(crc);
4373         record->xl_crc = crc;
4374
4375         /* Create first XLOG segment file */
4376         use_existent = false;
4377         openLogFile = XLogFileInit(0, 0, &use_existent, false);
4378
4379         /* Write the first page with the initial record */
4380         errno = 0;
4381         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4382         {
4383                 /* if write didn't set errno, assume problem is no disk space */
4384                 if (errno == 0)
4385                         errno = ENOSPC;
4386                 ereport(PANIC,
4387                                 (errcode_for_file_access(),
4388                           errmsg("could not write bootstrap transaction log file: %m")));
4389         }
4390
4391         if (pg_fsync(openLogFile) != 0)
4392                 ereport(PANIC,
4393                                 (errcode_for_file_access(),
4394                           errmsg("could not fsync bootstrap transaction log file: %m")));
4395
4396         if (close(openLogFile))
4397                 ereport(PANIC,
4398                                 (errcode_for_file_access(),
4399                           errmsg("could not close bootstrap transaction log file: %m")));
4400
4401         openLogFile = -1;
4402
4403         /* Now create pg_control */
4404
4405         memset(ControlFile, 0, sizeof(ControlFileData));
4406         /* Initialize pg_control status fields */
4407         ControlFile->system_identifier = sysidentifier;
4408         ControlFile->state = DB_SHUTDOWNED;
4409         ControlFile->time = checkPoint.time;
4410         ControlFile->checkPoint = checkPoint.redo;
4411         ControlFile->checkPointCopy = checkPoint;
4412         /* some additional ControlFile fields are set in WriteControlFile() */
4413
4414         WriteControlFile();
4415
4416         /* Bootstrap the commit log, too */
4417         BootStrapCLOG();
4418         BootStrapSUBTRANS();
4419         BootStrapMultiXact();
4420
4421         pfree(buffer);
4422 }
4423
4424 static char *
4425 str_time(pg_time_t tnow)
4426 {
4427         static char buf[128];
4428
4429         pg_strftime(buf, sizeof(buf),
4430                                 "%Y-%m-%d %H:%M:%S %Z",
4431                                 pg_localtime(&tnow, log_timezone));
4432
4433         return buf;
4434 }
4435
4436 /*
4437  * See if there is a recovery command file (recovery.conf), and if so
4438  * read in parameters for archive recovery.
4439  *
4440  * XXX longer term intention is to expand this to
4441  * cater for additional parameters and controls
4442  * possibly use a flex lexer similar to the GUC one
4443  */
4444 static void
4445 readRecoveryCommandFile(void)
4446 {
4447         FILE       *fd;
4448         char            cmdline[MAXPGPATH];
4449         TimeLineID      rtli = 0;
4450         bool            rtliGiven = false;
4451         bool            syntaxError = false;
4452
4453         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4454         if (fd == NULL)
4455         {
4456                 if (errno == ENOENT)
4457                         return;                         /* not there, so no archive recovery */
4458                 ereport(FATAL,
4459                                 (errcode_for_file_access(),
4460                                  errmsg("could not open recovery command file \"%s\": %m",
4461                                                 RECOVERY_COMMAND_FILE)));
4462         }
4463
4464         ereport(LOG,
4465                         (errmsg("starting archive recovery")));
4466
4467         /*
4468          * Parse the file...
4469          */
4470         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4471         {
4472                 /* skip leading whitespace and check for # comment */
4473                 char       *ptr;
4474                 char       *tok1;
4475                 char       *tok2;
4476
4477                 for (ptr = cmdline; *ptr; ptr++)
4478                 {
4479                         if (!isspace((unsigned char) *ptr))
4480                                 break;
4481                 }
4482                 if (*ptr == '\0' || *ptr == '#')
4483                         continue;
4484
4485                 /* identify the quoted parameter value */
4486                 tok1 = strtok(ptr, "'");
4487                 if (!tok1)
4488                 {
4489                         syntaxError = true;
4490                         break;
4491                 }
4492                 tok2 = strtok(NULL, "'");
4493                 if (!tok2)
4494                 {
4495                         syntaxError = true;
4496                         break;
4497                 }
4498                 /* reparse to get just the parameter name */
4499                 tok1 = strtok(ptr, " \t=");
4500                 if (!tok1)
4501                 {
4502                         syntaxError = true;
4503                         break;
4504                 }
4505
4506                 if (strcmp(tok1, "restore_command") == 0)
4507                 {
4508                         recoveryRestoreCommand = pstrdup(tok2);
4509                         ereport(LOG,
4510                                         (errmsg("restore_command = '%s'",
4511                                                         recoveryRestoreCommand)));
4512                 }
4513                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
4514                 {
4515                         rtliGiven = true;
4516                         if (strcmp(tok2, "latest") == 0)
4517                                 rtli = 0;
4518                         else
4519                         {
4520                                 errno = 0;
4521                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
4522                                 if (errno == EINVAL || errno == ERANGE)
4523                                         ereport(FATAL,
4524                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4525                                                                         tok2)));
4526                         }
4527                         if (rtli)
4528                                 ereport(LOG,
4529                                                 (errmsg("recovery_target_timeline = %u", rtli)));
4530                         else
4531                                 ereport(LOG,
4532                                                 (errmsg("recovery_target_timeline = latest")));
4533                 }
4534                 else if (strcmp(tok1, "recovery_target_xid") == 0)
4535                 {
4536                         errno = 0;
4537                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
4538                         if (errno == EINVAL || errno == ERANGE)
4539                                 ereport(FATAL,
4540                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4541                                                  tok2)));
4542                         ereport(LOG,
4543                                         (errmsg("recovery_target_xid = %u",
4544                                                         recoveryTargetXid)));
4545                         recoveryTarget = true;
4546                         recoveryTargetExact = true;
4547                 }
4548                 else if (strcmp(tok1, "recovery_target_time") == 0)
4549                 {
4550                         /*
4551                          * if recovery_target_xid specified, then this overrides
4552                          * recovery_target_time
4553                          */
4554                         if (recoveryTargetExact)
4555                                 continue;
4556                         recoveryTarget = true;
4557                         recoveryTargetExact = false;
4558
4559                         /*
4560                          * Convert the time string given by the user to TimestampTz form.
4561                          */
4562                         recoveryTargetTime =
4563                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4564                                                                                                                 CStringGetDatum(tok2),
4565                                                                                                 ObjectIdGetDatum(InvalidOid),
4566                                                                                                                 Int32GetDatum(-1)));
4567                         ereport(LOG,
4568                                         (errmsg("recovery_target_time = '%s'",
4569                                                         timestamptz_to_str(recoveryTargetTime))));
4570                 }
4571                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
4572                 {
4573                         /*
4574                          * does nothing if a recovery_target is not also set
4575                          */
4576                         if (!parse_bool(tok2, &recoveryTargetInclusive))
4577                                   ereport(ERROR,
4578                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4579                                           errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4580                         ereport(LOG,
4581                                         (errmsg("recovery_target_inclusive = %s", tok2)));
4582                 }
4583                 else if (strcmp(tok1, "log_restartpoints") == 0)
4584                 {
4585                         /*
4586                          * does nothing if a recovery_target is not also set
4587                          */
4588                         if (!parse_bool(tok2, &recoveryLogRestartpoints))
4589                                   ereport(ERROR,
4590                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4591                                           errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
4592                         ereport(LOG,
4593                                         (errmsg("log_restartpoints = %s", tok2)));
4594                 }
4595                 else
4596                         ereport(FATAL,
4597                                         (errmsg("unrecognized recovery parameter \"%s\"",
4598                                                         tok1)));
4599         }
4600
4601         FreeFile(fd);
4602
4603         if (syntaxError)
4604                 ereport(FATAL,
4605                                 (errmsg("syntax error in recovery command file: %s",
4606                                                 cmdline),
4607                           errhint("Lines should have the format parameter = 'value'.")));
4608
4609         /* Check that required parameters were supplied */
4610         if (recoveryRestoreCommand == NULL)
4611                 ereport(FATAL,
4612                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
4613                                                 RECOVERY_COMMAND_FILE)));
4614
4615         /* Enable fetching from archive recovery area */
4616         InArchiveRecovery = true;
4617
4618         /*
4619          * If user specified recovery_target_timeline, validate it or compute the
4620          * "latest" value.      We can't do this until after we've gotten the restore
4621          * command and set InArchiveRecovery, because we need to fetch timeline
4622          * history files from the archive.
4623          */
4624         if (rtliGiven)
4625         {
4626                 if (rtli)
4627                 {
4628                         /* Timeline 1 does not have a history file, all else should */
4629                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4630                                 ereport(FATAL,
4631                                                 (errmsg("recovery target timeline %u does not exist",
4632                                                                 rtli)));
4633                         recoveryTargetTLI = rtli;
4634                 }
4635                 else
4636                 {
4637                         /* We start the "latest" search from pg_control's timeline */
4638                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4639                 }
4640         }
4641 }
4642
4643 /*
4644  * Exit archive-recovery state
4645  */
4646 static void
4647 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4648 {
4649         char            recoveryPath[MAXPGPATH];
4650         char            xlogpath[MAXPGPATH];
4651
4652         /*
4653          * We are no longer in archive recovery state.
4654          */
4655         InArchiveRecovery = false;
4656
4657         /*
4658          * We should have the ending log segment currently open.  Verify, and then
4659          * close it (to avoid problems on Windows with trying to rename or delete
4660          * an open file).
4661          */
4662         Assert(readFile >= 0);
4663         Assert(readId == endLogId);
4664         Assert(readSeg == endLogSeg);
4665
4666         close(readFile);
4667         readFile = -1;
4668
4669         /*
4670          * If the segment was fetched from archival storage, we want to replace
4671          * the existing xlog segment (if any) with the archival version.  This is
4672          * because whatever is in XLOGDIR is very possibly older than what we have
4673          * from the archives, since it could have come from restoring a PGDATA
4674          * backup.      In any case, the archival version certainly is more
4675          * descriptive of what our current database state is, because that is what
4676          * we replayed from.
4677          *
4678          * Note that if we are establishing a new timeline, ThisTimeLineID is
4679          * already set to the new value, and so we will create a new file instead
4680          * of overwriting any existing file.  (This is, in fact, always the case
4681          * at present.)
4682          */
4683         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4684         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4685
4686         if (restoredFromArchive)
4687         {
4688                 ereport(DEBUG3,
4689                                 (errmsg_internal("moving last restored xlog to \"%s\"",
4690                                                                  xlogpath)));
4691                 unlink(xlogpath);               /* might or might not exist */
4692                 if (rename(recoveryPath, xlogpath) != 0)
4693                         ereport(FATAL,
4694                                         (errcode_for_file_access(),
4695                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
4696                                                         recoveryPath, xlogpath)));
4697                 /* XXX might we need to fix permissions on the file? */
4698         }
4699         else
4700         {
4701                 /*
4702                  * If the latest segment is not archival, but there's still a
4703                  * RECOVERYXLOG laying about, get rid of it.
4704                  */
4705                 unlink(recoveryPath);   /* ignore any error */
4706
4707                 /*
4708                  * If we are establishing a new timeline, we have to copy data from
4709                  * the last WAL segment of the old timeline to create a starting WAL
4710                  * segment for the new timeline.
4711                  */
4712                 if (endTLI != ThisTimeLineID)
4713                         XLogFileCopy(endLogId, endLogSeg,
4714                                                  endTLI, endLogId, endLogSeg);
4715         }
4716
4717         /*
4718          * Let's just make real sure there are not .ready or .done flags posted
4719          * for the new segment.
4720          */
4721         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4722         XLogArchiveCleanup(xlogpath);
4723
4724         /* Get rid of any remaining recovered timeline-history file, too */
4725         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4726         unlink(recoveryPath);           /* ignore any error */
4727
4728         /*
4729          * Rename the config file out of the way, so that we don't accidentally
4730          * re-enter archive recovery mode in a subsequent crash.
4731          */
4732         unlink(RECOVERY_COMMAND_DONE);
4733         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4734                 ereport(FATAL,
4735                                 (errcode_for_file_access(),
4736                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4737                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4738
4739         ereport(LOG,
4740                         (errmsg("archive recovery complete")));
4741 }
4742
4743 /*
4744  * For point-in-time recovery, this function decides whether we want to
4745  * stop applying the XLOG at or after the current record.
4746  *
4747  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4748  * *includeThis is set TRUE if we should apply this record before stopping.
4749  *
4750  * We also track the timestamp of the latest applied COMMIT/ABORT record
4751  * in recoveryLastXTime, for logging purposes.
4752  * Also, some information is saved in recoveryStopXid et al for use in
4753  * annotating the new timeline's history file.
4754  */
4755 static bool
4756 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4757 {
4758         bool            stopsHere;
4759         uint8           record_info;
4760         TimestampTz recordXtime;
4761
4762         /* We only consider stopping at COMMIT or ABORT records */
4763         if (record->xl_rmid != RM_XACT_ID)
4764                 return false;
4765         record_info = record->xl_info & ~XLR_INFO_MASK;
4766         if (record_info == XLOG_XACT_COMMIT)
4767         {
4768                 xl_xact_commit *recordXactCommitData;
4769
4770                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4771                 recordXtime = recordXactCommitData->xact_time;
4772         }
4773         else if (record_info == XLOG_XACT_ABORT)
4774         {
4775                 xl_xact_abort *recordXactAbortData;
4776
4777                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4778                 recordXtime = recordXactAbortData->xact_time;
4779         }
4780         else
4781                 return false;
4782
4783         /* Do we have a PITR target at all? */
4784         if (!recoveryTarget)
4785         {
4786                 recoveryLastXTime = recordXtime;
4787                 return false;
4788         }
4789
4790         if (recoveryTargetExact)
4791         {
4792                 /*
4793                  * there can be only one transaction end record with this exact
4794                  * transactionid
4795                  *
4796                  * when testing for an xid, we MUST test for equality only, since
4797                  * transactions are numbered in the order they start, not the order
4798                  * they complete. A higher numbered xid will complete before you about
4799                  * 50% of the time...
4800                  */
4801                 stopsHere = (record->xl_xid == recoveryTargetXid);
4802                 if (stopsHere)
4803                         *includeThis = recoveryTargetInclusive;
4804         }
4805         else
4806         {
4807                 /*
4808                  * there can be many transactions that share the same commit time, so
4809                  * we stop after the last one, if we are inclusive, or stop at the
4810                  * first one if we are exclusive
4811                  */
4812                 if (recoveryTargetInclusive)
4813                         stopsHere = (recordXtime > recoveryTargetTime);
4814                 else
4815                         stopsHere = (recordXtime >= recoveryTargetTime);
4816                 if (stopsHere)
4817                         *includeThis = false;
4818         }
4819
4820         if (stopsHere)
4821         {
4822                 recoveryStopXid = record->xl_xid;
4823                 recoveryStopTime = recordXtime;
4824                 recoveryStopAfter = *includeThis;
4825
4826                 if (record_info == XLOG_XACT_COMMIT)
4827                 {
4828                         if (recoveryStopAfter)
4829                                 ereport(LOG,
4830                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4831                                                                 recoveryStopXid,
4832                                                                 timestamptz_to_str(recoveryStopTime))));
4833                         else
4834                                 ereport(LOG,
4835                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
4836                                                                 recoveryStopXid,
4837                                                                 timestamptz_to_str(recoveryStopTime))));
4838                 }
4839                 else
4840                 {
4841                         if (recoveryStopAfter)
4842                                 ereport(LOG,
4843                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
4844                                                                 recoveryStopXid,
4845                                                                 timestamptz_to_str(recoveryStopTime))));
4846                         else
4847                                 ereport(LOG,
4848                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
4849                                                                 recoveryStopXid,
4850                                                                 timestamptz_to_str(recoveryStopTime))));
4851                 }
4852
4853                 if (recoveryStopAfter)
4854                         recoveryLastXTime = recordXtime;
4855         }
4856         else
4857                 recoveryLastXTime = recordXtime;
4858
4859         return stopsHere;
4860 }
4861
4862 /*
4863  * This must be called ONCE during postmaster or standalone-backend startup
      *
      * Takes no arguments and returns nothing; unrecoverable conditions are
      * reported through ereport() at FATAL or PANIC level.
      *
      * Outline of the steps performed below, in order:
      *
      * 1. Read pg_control, sanity-check its contents, and log how the previous
      *    run ended according to the stored state.
      * 2. Process the recovery command file and the timeline history, then
      *    locate the checkpoint record to start from: the one named by
      *    backup_label if that file is present, otherwise pg_control's latest
      *    checkpoint, falling back to its previous checkpoint.
      * 3. Initialize nextXid, nextOid, MultiXact state, ThisTimeLineID and
      *    RedoRecPtr from the chosen checkpoint.
      * 4. Decide whether recovery is needed.  If so, update pg_control, start
      *    the resource managers, and replay WAL records until ReadRecord()
      *    returns NULL or recoveryStopsHere() reports the recovery target.
      * 5. Re-read the last applied record to determine EndOfLog, select a new
      *    timeline ID when in archive recovery, and set up the WAL insert and
      *    write state so that new WAL continues from EndOfLog.
      * 6. After recovery, run resource manager cleanup and write a shutdown
      *    checkpoint.  Finally mark pg_control DB_IN_PRODUCTION, start up
      *    CLOG, SUBTRANS and MultiXact, reload prepared transactions, and
      *    release the buffers used for reading WAL.
4864  */
4865 void
4866 StartupXLOG(void)
4867 {
4868         XLogCtlInsert *Insert;
4869         CheckPoint      checkPoint;
4870         bool            wasShutdown;
4871         bool            reachedStopPoint = false;
4872         bool            haveBackupLabel = false;
4873         XLogRecPtr      RecPtr,
4874                                 LastRec,
4875                                 checkPointLoc,
4876                                 minRecoveryLoc,
4877                                 EndOfLog;
4878         uint32          endLogId;
4879         uint32          endLogSeg;
4880         XLogRecord *record;
4881         uint32          freespace;
4882         TransactionId oldestActiveXID;
4883
4884         /*
4885          * Read control file and check XLOG status looks valid.
4886          *
4887          * Note: in most control paths, *ControlFile is already valid and we need
4888          * not do ReadControlFile() here, but might as well do it to be sure.
4889          */
4890         ReadControlFile();
4891
             /*
              * Reject a state value outside the DB_SHUTDOWNED..DB_IN_PRODUCTION
              * range, or a checkpoint pointer whose record offset is not valid.
              */
4892         if (ControlFile->state < DB_SHUTDOWNED ||
4893                 ControlFile->state > DB_IN_PRODUCTION ||
4894                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
4895                 ereport(FATAL,
4896                                 (errmsg("control file contains invalid data")));
4897
             /* Tell the log how the previous run ended, keyed on the stored state */
4898         if (ControlFile->state == DB_SHUTDOWNED)
4899                 ereport(LOG,
4900                                 (errmsg("database system was shut down at %s",
4901                                                 str_time(ControlFile->time))));
4902         else if (ControlFile->state == DB_SHUTDOWNING)
4903                 ereport(LOG,
4904                                 (errmsg("database system shutdown was interrupted; last known up at %s",
4905                                                 str_time(ControlFile->time))));
4906         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
4907                 ereport(LOG,
4908                    (errmsg("database system was interrupted while in recovery at %s",
4909                                    str_time(ControlFile->time)),
4910                         errhint("This probably means that some data is corrupted and"
4911                                         " you will have to use the last backup for recovery.")));
4912         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
4913                 ereport(LOG,
4914                                 (errmsg("database system was interrupted while in recovery at log time %s",
4915                                                 str_time(ControlFile->checkPointCopy.time)),
4916                                  errhint("If this has occurred more than once some data might be corrupted"
4917                           " and you might need to choose an earlier recovery target.")));
4918         else if (ControlFile->state == DB_IN_PRODUCTION)
4919                 ereport(LOG,
4920                           (errmsg("database system was interrupted; last known up at %s",
4921                                           str_time(ControlFile->time))));
4922
4923         /* This is just to allow attaching to startup process with a debugger */
4924 #ifdef XLOG_REPLAY_DELAY
             /* the pause is skipped when the stored state says clean shutdown */
4925         if (ControlFile->state != DB_SHUTDOWNED)
4926                 pg_usleep(60000000L);
4927 #endif
4928
4929         /*
4930          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
4931          * someone has performed a copy for PITR, these directories may have
4932          * been excluded and need to be re-created.
4933          */
4934         ValidateXLOGDirectoryStructure();
4935
4936         /*
4937          * Initialize on the assumption we want to recover to the same timeline
4938          * that's active according to pg_control.
4939          */
4940         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
4941
4942         /*
4943          * Check for recovery control file, and if so set up state for offline
4944          * recovery
4945          */
4946         readRecoveryCommandFile();
4947
4948         /* Now we can determine the list of expected TLIs */
4949         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
4950
4951         /*
4952          * If pg_control's timeline is not in expectedTLIs, then we cannot
4953          * proceed: the backup is not part of the history of the requested
4954          * timeline.
4955          */
4956         if (!list_member_int(expectedTLIs,
4957                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
4958                 ereport(FATAL,
4959                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
4960                                                 recoveryTargetTLI,
4961                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
4962
4963         if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
4964         {
4965                 /*
4966                  * When a backup_label file is present, we want to roll forward from
4967                  * the checkpoint it identifies, rather than using pg_control.
4968                  */
4969                 record = ReadCheckpointRecord(checkPointLoc, 0);
4970                 if (record != NULL)
4971                 {
4972                         ereport(DEBUG1,
4973                                         (errmsg("checkpoint record is at %X/%X",
4974                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4975                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
4976                 }
4977                 else
4978                 {
4979                         ereport(PANIC,
4980                                         (errmsg("could not locate required checkpoint record"),
4981                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
4982                 }
4983                 /* set flag to delete it later */
4984                 haveBackupLabel = true;
4985         }
4986         else
4987         {
4988                 /*
4989                  * Get the last valid checkpoint record.  If the latest one according
4990                  * to pg_control is broken, try the next-to-last one.
4991                  */
4992                 checkPointLoc = ControlFile->checkPoint;
4993                 record = ReadCheckpointRecord(checkPointLoc, 1);
4994                 if (record != NULL)
4995                 {
4996                         ereport(DEBUG1,
4997                                         (errmsg("checkpoint record is at %X/%X",
4998                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4999                 }
5000                 else
5001                 {
5002                         checkPointLoc = ControlFile->prevCheckPoint;
5003                         record = ReadCheckpointRecord(checkPointLoc, 2);
5004                         if (record != NULL)
5005                         {
5006                                 ereport(LOG,
5007                                                 (errmsg("using previous checkpoint record at %X/%X",
5008                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5009                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5010                         }
5011                         else
5012                                 ereport(PANIC,
5013                                          (errmsg("could not locate a valid checkpoint record")));
5014                 }
5015         }
5016
             /*
              * Both pointers start at the chosen checkpoint record; LastRec is
              * advanced by the redo loop below as records are applied.  Copy the
              * checkpoint payload out of the record into a local struct.
              *
              * Note that wasShutdown compares the whole xl_info byte, with no
              * masking of flag bits.
              */
5017         LastRec = RecPtr = checkPointLoc;
5018         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5019         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5020
5021         ereport(DEBUG1,
5022                         (errmsg("redo record is at %X/%X; shutdown %s",
5023                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
5024                                         wasShutdown ? "TRUE" : "FALSE")));
5025         ereport(DEBUG1,
5026                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5027                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5028                                         checkPoint.nextOid)));
5029         ereport(DEBUG1,
5030                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5031                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5032         if (!TransactionIdIsNormal(checkPoint.nextXid))
5033                 ereport(PANIC,
5034                                 (errmsg("invalid next transaction ID")));
5035
             /* Seed the shared XID/OID counters and MultiXact state from the checkpoint */
5036         ShmemVariableCache->nextXid = checkPoint.nextXid;
5037         ShmemVariableCache->nextOid = checkPoint.nextOid;
5038         ShmemVariableCache->oidCount = 0;
5039         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5040
5041         /*
5042          * We must replay WAL entries using the same TimeLineID they were created
5043          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5044          * also xlog_redo()).
5045          */
5046         ThisTimeLineID = checkPoint.ThisTimeLineID;
5047
5048         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5049
             /*
              * RecPtr is still the checkpoint record's own location here; a redo
              * pointer lying beyond it is treated as corruption.
              */
5050         if (XLByteLT(RecPtr, checkPoint.redo))
5051                 ereport(PANIC,
5052                                 (errmsg("invalid redo in checkpoint record")));
5053
5054         /*
5055          * Check whether we need to force recovery from WAL.  If it appears to
5056          * have been a clean shutdown and we did not have a recovery.conf file,
5057          * then assume no recovery needed.
5058          */
5059         if (XLByteLT(checkPoint.redo, RecPtr))
5060         {
5061                 if (wasShutdown)
5062                         ereport(PANIC,
5063                                         (errmsg("invalid redo record in shutdown checkpoint")));
5064                 InRecovery = true;
5065         }
5066         else if (ControlFile->state != DB_SHUTDOWNED)
5067                 InRecovery = true;
5068         else if (InArchiveRecovery)
5069         {
5070                 /* force recovery due to presence of recovery.conf */
5071                 InRecovery = true;
5072         }
5073
5074         /* REDO */
5075         if (InRecovery)
5076         {
5077                 int                     rmid;
5078
5079                 /*
5080                  * Update pg_control to show that we are recovering and to show the
5081                  * selected checkpoint as the place we are starting from. We also mark
5082                  * pg_control with any minimum recovery stop point obtained from a
5083                  * backup history file.
5084                  */
5085                 if (InArchiveRecovery)
5086                 {
5087                         ereport(LOG,
5088                                         (errmsg("automatic recovery in progress")));
5089                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5090                 }
5091                 else
5092                 {
5093                         ereport(LOG,
5094                                         (errmsg("database system was not properly shut down; "
5095                                                         "automatic recovery in progress")));
5096                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5097                 }
                     /*
                      * The checkpoint location currently stored as latest moves into
                      * the "previous" slot, and the checkpoint we are starting from
                      * becomes the latest.  minRecoveryPoint is only overwritten when
                      * minRecoveryLoc is not all-zero.
                      */
5098                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5099                 ControlFile->checkPoint = checkPointLoc;
5100                 ControlFile->checkPointCopy = checkPoint;
5101                 if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
5102                         ControlFile->minRecoveryPoint = minRecoveryLoc;
5103                 ControlFile->time = (pg_time_t) time(NULL);
5104                 UpdateControlFile();
5105
5106                 /*
5107                  * If there was a backup label file, it's done its job and the info
5108                  * has now been propagated into pg_control.  We must get rid of the
5109                  * label file so that if we crash during recovery, we'll pick up at
5110                  * the latest recovery restartpoint instead of going all the way back
5111                  * to the backup start point.  It seems prudent though to just rename
5112                  * the file out of the way rather than delete it completely.
5113                  */
5114                 if (haveBackupLabel)
5115                 {
                             /* result of unlink() is deliberately not checked */
5116                         unlink(BACKUP_LABEL_OLD);
5117                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5118                                 ereport(FATAL,
5119                                                 (errcode_for_file_access(),
5120                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5121                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5122                 }
5123
5124                 /* Initialize resource managers */
5125                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5126                 {
5127                         if (RmgrTable[rmid].rm_startup != NULL)
5128                                 RmgrTable[rmid].rm_startup();
5129                 }
5130
5131                 /*
5132                  * Find the first record that logically follows the checkpoint --- it
5133                  * might physically precede it, though.
5134                  */
5135                 if (XLByteLT(checkPoint.redo, RecPtr))
5136                 {
5137                         /* back up to find the record */
5138                         record = ReadRecord(&(checkPoint.redo), PANIC);
5139                 }
5140                 else
5141                 {
5142                         /* just have to read next record after CheckPoint */
5143                         record = ReadRecord(NULL, LOG);
5144                 }
5145
5146                 if (record != NULL)
5147                 {
5148                         bool            recoveryContinue = true;
5149                         bool            recoveryApply = true;
5150                         ErrorContextCallback errcontext;
5151
5152                         InRedo = true;
5153                         ereport(LOG,
5154                                         (errmsg("redo starts at %X/%X",
5155                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5156
5157                         /*
5158                          * main redo apply loop
5159                          */
5160                         do
5161                         {
5162 #ifdef WAL_DEBUG
5163                                 if (XLOG_DEBUG)
5164                                 {
5165                                         StringInfoData buf;
5166
5167                                         initStringInfo(&buf);
5168                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5169                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5170                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
5171                                         xlog_outrec(&buf, record);
5172                                         appendStringInfo(&buf, " - ");
5173                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5174                                                                                                            record->xl_info,
5175                                                                                                          XLogRecGetData(record));
5176                                         elog(LOG, "%s", buf.data);
5177                                         pfree(buf.data);
5178                                 }
5179 #endif
5180
5181                                 /*
5182                                  * Have we reached our recovery target?
5183                                  */
5184                                 if (recoveryStopsHere(record, &recoveryApply))
5185                                 {
5186                                         reachedStopPoint = true;        /* see below */
5187                                         recoveryContinue = false;
                                             /*
                                              * recoveryApply false means the target lies ahead of
                                              * this record: leave the loop without applying it.
                                              * Otherwise apply it and let the loop condition end
                                              * the loop afterwards.
                                              */
5188                                         if (!recoveryApply)
5189                                                 break;
5190                                 }
5191
5192                                 /* Setup error traceback support for ereport() */
5193                                 errcontext.callback = rm_redo_error_callback;
5194                                 errcontext.arg = (void *) record;
5195                                 errcontext.previous = error_context_stack;
5196                                 error_context_stack = &errcontext;
5197
5198                                 /* nextXid must be beyond record's xid */
5199                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5200                                                                                                  ShmemVariableCache->nextXid))
5201                                 {
5202                                         ShmemVariableCache->nextXid = record->xl_xid;
5203                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5204                                 }
5205
                                     /* restore any backup blocks first, then hand the record to its rmgr */
5206                                 if (record->xl_info & XLR_BKP_BLOCK_MASK)
5207                                         RestoreBkpBlocks(record, EndRecPtr);
5208
5209                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5210
5211                                 /* Pop the error context stack */
5212                                 error_context_stack = errcontext.previous;
5213
                                     /*
                                      * Remember the record just applied; it is re-read after
                                      * the loop to establish the end of valid WAL.
                                      */
5214                                 LastRec = ReadRecPtr;
5215
5216                                 record = ReadRecord(NULL, LOG);
5217                         } while (record != NULL && recoveryContinue);
5218
5219                         /*
5220                          * end of main redo apply loop
5221                          */
5222
5223                         ereport(LOG,
5224                                         (errmsg("redo done at %X/%X",
5225                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5226                         if (recoveryLastXTime)
5227                                 ereport(LOG,
5228                                          (errmsg("last completed transaction was at log time %s",
5229                                                          timestamptz_to_str(recoveryLastXTime))));
5230                         InRedo = false;
5231                 }
5232                 else
5233                 {
5234                         /* there are no WAL records following the checkpoint */
5235                         ereport(LOG,
5236                                         (errmsg("redo is not required")));
5237                 }
5238         }
5239
5240         /*
5241          * Re-fetch the last valid or last applied record, so we can identify the
5242          * exact endpoint of what we consider the valid portion of WAL.
5243          */
5244         record = ReadRecord(&LastRec, PANIC);
5245         EndOfLog = EndRecPtr;
5246         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
5247
5248         /*
5249          * Complain if we did not roll forward far enough to render the backup
5250          * dump consistent.
5251          */
5252         if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
5253         {
5254                 if (reachedStopPoint)   /* stopped because of stop request */
5255                         ereport(FATAL,
5256                                         (errmsg("requested recovery stop point is before end time of backup dump")));
5257                 else    /* ran off end of WAL */
5258                         ereport(FATAL,
5259                                         (errmsg("WAL ends before end time of backup dump")));
5260         }
5261
5262         /*
5263          * Consider whether we need to assign a new timeline ID.
5264          *
5265          * If we are doing an archive recovery, we always assign a new ID.      This
5266          * handles a couple of issues.  If we stopped short of the end of WAL
5267          * during recovery, then we are clearly generating a new timeline and must
5268          * assign it a unique new ID.  Even if we ran to the end, modifying the
5269          * current last segment is problematic because it may result in trying to
5270          * overwrite an already-archived copy of that segment, and we encourage
5271          * DBAs to make their archive_commands reject that.  We can dodge the
5272          * problem by making the new active segment have a new timeline ID.
5273          *
5274          * In a normal crash recovery, we can just extend the timeline we were in.
5275          */
5276         if (InArchiveRecovery)
5277         {
5278                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5279                 ereport(LOG,
5280                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5281                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5282                                                          curFileTLI, endLogId, endLogSeg);
5283         }
5284
5285         /* Save the selected TimeLineID in shared memory, too */
5286         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5287
5288         /*
5289          * We are now done reading the old WAL.  Turn off archive fetching if it
5290          * was active, and make a writable copy of the last WAL segment. (Note
5291          * that we also have a copy of the last block of the old WAL in readBuf;
5292          * we will use that below.)
5293          */
5294         if (InArchiveRecovery)
5295                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
5296
5297         /*
5298          * Prepare to write WAL starting at EndOfLog position, and init xlog
5299          * buffer cache using the block containing the last record from the
5300          * previous incarnation.
5301          */
5302         openLogId = endLogId;
5303         openLogSeg = endLogSeg;
5304         openLogFile = XLogFileOpen(openLogId, openLogSeg);
5305         openLogOff = 0;
5306         Insert = &XLogCtl->Insert;
5307         Insert->PrevRecord = LastRec;
5308         XLogCtl->xlblocks[0].xlogid = openLogId;
             /*
              * Round EndOfLog.xrecoff up to a multiple of XLOG_BLCKSZ; a value
              * already on a block boundary is left unchanged.
              */
5309         XLogCtl->xlblocks[0].xrecoff =
5310                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5311
5312         /*
5313          * Tricky point here: readBuf contains the *last* block that the LastRec
5314          * record spans, not the one it starts in.      The last block is indeed the
5315          * one we want to use.
5316          */
5317         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
5318         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
             /*
              * Position currpos at EndOfLog's byte offset within the block just
              * copied (EndOfLog minus the block's start address).
              */
5319         Insert->currpos = (char *) Insert->currpage +
5320                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
5321
             /*
              * Set both the write and flush result pointers, and the pending
              * write/flush request pointers, to EndOfLog.
              */
5322         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5323
5324         XLogCtl->Write.LogwrtResult = LogwrtResult;
5325         Insert->LogwrtResult = LogwrtResult;
5326         XLogCtl->LogwrtResult = LogwrtResult;
5327
5328         XLogCtl->LogwrtRqst.Write = EndOfLog;
5329         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5330
5331         freespace = INSERT_FREESPACE(Insert);
5332         if (freespace > 0)
5333         {
5334                 /* Make sure rest of page is zero */
5335                 MemSet(Insert->currpos, 0, freespace);
5336                 XLogCtl->Write.curridx = 0;
5337         }
5338         else
5339         {
5340                 /*
5341                  * Whenever Write.LogwrtResult points to exactly the end of a page,
5342                  * Write.curridx must point to the *next* page (see XLogWrite()).
5343                  *
5344                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5345                  * this is sufficient.  The first actual attempt to insert a log
5346                  * record will advance the insert state.
5347                  */
5348                 XLogCtl->Write.curridx = NextBufIdx(0);
5349         }
5350
5351         /* Pre-scan prepared transactions to find out the range of XIDs present */
5352         oldestActiveXID = PrescanPreparedTransactions();
5353
5354         if (InRecovery)
5355         {
5356                 int                     rmid;
5357
5358                 /*
5359                  * Allow resource managers to do any required cleanup.
5360                  */
5361                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5362                 {
5363                         if (RmgrTable[rmid].rm_cleanup != NULL)
5364                                 RmgrTable[rmid].rm_cleanup();
5365                 }
5366
5367                 /*
5368                  * Check to see if the XLOG sequence contained any unresolved
5369                  * references to uninitialized pages.
5370                  */
5371                 XLogCheckInvalidPages();
5372
5373                 /*
5374                  * Reset pgstat data, because it may be invalid after recovery.
5375                  */
5376                 pgstat_reset_all();
5377
5378                 /*
5379                  * Perform a checkpoint to update all our recovery activity to disk.
5380                  *
5381                  * Note that we write a shutdown checkpoint rather than an on-line
5382                  * one. This is not particularly critical, but since we may be
5383                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5384                  * the rule that TLI only changes in shutdown checkpoints, which
5385                  * allows some extra error checking in xlog_redo.
5386                  */
5387                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5388         }
5389
5390         /*
5391          * Preallocate additional log files, if wanted.
5392          */
5393         PreallocXlogFiles(EndOfLog);
5394
5395         /*
5396          * Okay, we're officially UP.
5397          */
5398         InRecovery = false;
5399
5400         ControlFile->state = DB_IN_PRODUCTION;
5401         ControlFile->time = (pg_time_t) time(NULL);
5402         UpdateControlFile();
5403
5404         /* start the archive_timeout timer running */
5405         XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
5406
5407         /* initialize shared-memory copy of latest checkpoint XID/epoch */
5408         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5409         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
5410
5411         /* also initialize latestCompletedXid, to nextXid - 1 */
5412         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5413         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
5414
5415         /* Start up the commit log and related stuff, too */
5416         StartupCLOG();
5417         StartupSUBTRANS(oldestActiveXID);
5418         StartupMultiXact();
5419
5420         /* Reload shared-memory state for prepared transactions */
5421         RecoverPreparedTransactions();
5422
5423         /* Shut down readFile facility, free space */
5424         if (readFile >= 0)
5425         {
5426                 close(readFile);
5427                 readFile = -1;
5428         }
5429         if (readBuf)
5430         {
5431                 free(readBuf);
5432                 readBuf = NULL;
5433         }
5434         if (readRecordBuf)
5435         {
5436                 free(readRecordBuf);
5437                 readRecordBuf = NULL;
5438                 readRecordBufSize = 0;
5439         }
5440 }
5441
5442 /*
5443  * Subroutine to try to fetch and validate a prior checkpoint record.
5444  *
5445  * whichChkpt identifies the checkpoint (merely for reporting purposes).
5446  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
5447  */
5448 static XLogRecord *
5449 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
5450 {
5451         XLogRecord *record;
5452
5453         if (!XRecOffIsValid(RecPtr.xrecoff))
5454         {
5455                 switch (whichChkpt)
5456                 {
5457                         case 1:
5458                                 ereport(LOG,
5459                                 (errmsg("invalid primary checkpoint link in control file")));
5460                                 break;
5461                         case 2:
5462                                 ereport(LOG,
5463                                                 (errmsg("invalid secondary checkpoint link in control file")));
5464                                 break;
5465                         default:
5466                                 ereport(LOG,
5467                                    (errmsg("invalid checkpoint link in backup_label file")));
5468                                 break;
5469                 }
5470                 return NULL;
5471         }
5472
5473         record = ReadRecord(&RecPtr, LOG);
5474
5475         if (record == NULL)
5476         {
5477                 switch (whichChkpt)
5478                 {
5479                         case 1:
5480                                 ereport(LOG,
5481                                                 (errmsg("invalid primary checkpoint record")));
5482                                 break;
5483                         case 2:
5484                                 ereport(LOG,
5485                                                 (errmsg("invalid secondary checkpoint record")));
5486                                 break;
5487                         default:
5488                                 ereport(LOG,
5489                                                 (errmsg("invalid checkpoint record")));
5490                                 break;
5491                 }
5492                 return NULL;
5493         }
5494         if (record->xl_rmid != RM_XLOG_ID)
5495         {
5496                 switch (whichChkpt)
5497                 {
5498                         case 1:
5499                                 ereport(LOG,
5500                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
5501                                 break;
5502                         case 2:
5503                                 ereport(LOG,
5504                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
5505                                 break;
5506                         default:
5507                                 ereport(LOG,
5508                                 (errmsg("invalid resource manager ID in checkpoint record")));
5509                                 break;
5510                 }
5511                 return NULL;
5512         }
5513         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
5514                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
5515         {
5516                 switch (whichChkpt)
5517                 {
5518                         case 1:
5519                                 ereport(LOG,
5520                                    (errmsg("invalid xl_info in primary checkpoint record")));
5521                                 break;
5522                         case 2:
5523                                 ereport(LOG,
5524                                  (errmsg("invalid xl_info in secondary checkpoint record")));
5525                                 break;
5526                         default:
5527                                 ereport(LOG,
5528                                                 (errmsg("invalid xl_info in checkpoint record")));
5529                                 break;
5530                 }
5531                 return NULL;
5532         }
5533         if (record->xl_len != sizeof(CheckPoint) ||
5534                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
5535         {
5536                 switch (whichChkpt)
5537                 {
5538                         case 1:
5539                                 ereport(LOG,
5540                                         (errmsg("invalid length of primary checkpoint record")));
5541                                 break;
5542                         case 2:
5543                                 ereport(LOG,
5544                                   (errmsg("invalid length of secondary checkpoint record")));
5545                                 break;
5546                         default:
5547                                 ereport(LOG,
5548                                                 (errmsg("invalid length of checkpoint record")));
5549                                 break;
5550                 }
5551                 return NULL;
5552         }
5553         return record;
5554 }
5555
5556 /*
5557  * This must be called during startup of a backend process, except that
5558  * it need not be called in a standalone backend (which does StartupXLOG
5559  * instead).  We need to initialize the local copies of ThisTimeLineID and
5560  * RedoRecPtr.
5561  *
5562  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
5563  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
5564  * unnecessary however, since the postmaster itself never touches XLOG anyway.
5565  */
5566 void
5567 InitXLOGAccess(void)
5568 {
5569         /* ThisTimeLineID doesn't change so we need no lock to copy it */
5570         ThisTimeLineID = XLogCtl->ThisTimeLineID;
5571         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
5572         (void) GetRedoRecPtr();
5573 }
5574
5575 /*
5576  * Once spawned, a backend may update its local RedoRecPtr from
5577  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
5578  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
5579  */
5580 XLogRecPtr
5581 GetRedoRecPtr(void)
5582 {
5583         /* use volatile pointer to prevent code rearrangement */
5584         volatile XLogCtlData *xlogctl = XLogCtl;
5585
5586         SpinLockAcquire(&xlogctl->info_lck);
5587         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
5588         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
5589         SpinLockRelease(&xlogctl->info_lck);
5590
5591         return RedoRecPtr;
5592 }
5593
5594 /*
5595  * GetInsertRecPtr -- Returns the current insert position.
5596  *
5597  * NOTE: The value *actually* returned is the position of the last full
5598  * xlog page. It lags behind the real insert position by at most 1 page.
5599  * For that, we don't need to acquire WALInsertLock which can be quite
5600  * heavily contended, and an approximation is enough for the current
5601  * usage of this function.
5602  */
5603 XLogRecPtr
5604 GetInsertRecPtr(void)
5605 {
5606         /* use volatile pointer to prevent code rearrangement */
5607         volatile XLogCtlData *xlogctl = XLogCtl;
5608         XLogRecPtr      recptr;
5609
5610         SpinLockAcquire(&xlogctl->info_lck);
5611         recptr = xlogctl->LogwrtRqst.Write;
5612         SpinLockRelease(&xlogctl->info_lck);
5613
5614         return recptr;
5615 }
5616
5617 /*
5618  * Get the time of the last xlog segment switch
5619  */
5620 pg_time_t
5621 GetLastSegSwitchTime(void)
5622 {
5623         pg_time_t       result;
5624
5625         /* Need WALWriteLock, but shared lock is sufficient */
5626         LWLockAcquire(WALWriteLock, LW_SHARED);
5627         result = XLogCtl->Write.lastSegSwitchTime;
5628         LWLockRelease(WALWriteLock);
5629
5630         return result;
5631 }
5632
5633 /*
5634  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
5635  *
5636  * This is exported for use by code that would like to have 64-bit XIDs.
5637  * We don't really support such things, but all XIDs within the system
5638  * can be presumed "close to" the result, and thus the epoch associated
5639  * with them can be determined.
5640  */
5641 void
5642 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
5643 {
5644         uint32          ckptXidEpoch;
5645         TransactionId ckptXid;
5646         TransactionId nextXid;
5647
5648         /* Must read checkpoint info first, else have race condition */
5649         {
5650                 /* use volatile pointer to prevent code rearrangement */
5651                 volatile XLogCtlData *xlogctl = XLogCtl;
5652
5653                 SpinLockAcquire(&xlogctl->info_lck);
5654                 ckptXidEpoch = xlogctl->ckptXidEpoch;
5655                 ckptXid = xlogctl->ckptXid;
5656                 SpinLockRelease(&xlogctl->info_lck);
5657         }
5658
5659         /* Now fetch current nextXid */
5660         nextXid = ReadNewTransactionId();
5661
5662         /*
5663          * nextXid is certainly logically later than ckptXid.  So if it's
5664          * numerically less, it must have wrapped into the next epoch.
5665          */
5666         if (nextXid < ckptXid)
5667                 ckptXidEpoch++;
5668
5669         *xid = nextXid;
5670         *epoch = ckptXidEpoch;
5671 }
5672
5673 /*
5674  * This must be called ONCE during postmaster or standalone-backend shutdown
5675  */
5676 void
5677 ShutdownXLOG(int code, Datum arg)
5678 {
5679         ereport(LOG,
5680                         (errmsg("shutting down")));
5681
5682         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5683         ShutdownCLOG();
5684         ShutdownSUBTRANS();
5685         ShutdownMultiXact();
5686
5687         ereport(LOG,
5688                         (errmsg("database system is shut down")));
5689 }
5690
5691 /*
5692  * Log start of a checkpoint.
5693  */
5694 static void
5695 LogCheckpointStart(int flags)
5696 {
5697         elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
5698                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
5699                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
5700                  (flags & CHECKPOINT_FORCE) ? " force" : "",
5701                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
5702                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
5703                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
5704 }
5705
5706 /*
5707  * Log end of a checkpoint.
5708  */
5709 static void
5710 LogCheckpointEnd(void)
5711 {
5712         long            write_secs,
5713                                 sync_secs,
5714                                 total_secs;
5715         int                     write_usecs,
5716                                 sync_usecs,
5717                                 total_usecs;
5718
5719         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
5720
5721         TimestampDifference(CheckpointStats.ckpt_start_t,
5722                                                 CheckpointStats.ckpt_end_t,
5723                                                 &total_secs, &total_usecs);
5724
5725         TimestampDifference(CheckpointStats.ckpt_write_t,
5726                                                 CheckpointStats.ckpt_sync_t,
5727                                                 &write_secs, &write_usecs);
5728
5729         TimestampDifference(CheckpointStats.ckpt_sync_t,
5730                                                 CheckpointStats.ckpt_sync_end_t,
5731                                                 &sync_secs, &sync_usecs);
5732
5733         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
5734                  "%d transaction log file(s) added, %d removed, %d recycled; "
5735                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
5736                  CheckpointStats.ckpt_bufs_written,
5737                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
5738                  CheckpointStats.ckpt_segs_added,
5739                  CheckpointStats.ckpt_segs_removed,
5740                  CheckpointStats.ckpt_segs_recycled,
5741                  write_secs, write_usecs / 1000,
5742                  sync_secs, sync_usecs / 1000,
5743                  total_secs, total_usecs / 1000);
5744 }
5745
5746 /*
5747  * Perform a checkpoint --- either during shutdown, or on-the-fly
5748  *
5749  * flags is a bitwise OR of the following:
5750  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
5751  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
5752  *              ignoring checkpoint_completion_target parameter.
5753  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
5754  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
5755  *
5756  * Note: flags contains other bits, of interest here only for logging purposes.
5757  * In particular note that this routine is synchronous and does not pay
5758  * attention to CHECKPOINT_WAIT.
5759  */
5760 void
5761 CreateCheckPoint(int flags)
5762 {
5763         bool            shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
5764         CheckPoint      checkPoint;
5765         XLogRecPtr      recptr;
5766         XLogCtlInsert *Insert = &XLogCtl->Insert;
5767         XLogRecData rdata;
5768         uint32          freespace;
5769         uint32          _logId;
5770         uint32          _logSeg;
5771         TransactionId *inCommitXids;
5772         int                     nInCommit;
5773
5774         /*
5775          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
5776          * (This is just pro forma, since in the present system structure there is
5777          * only one process that is allowed to issue checkpoints at any given
5778          * time.)
5779          */
5780         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
5781
5782         /*
5783          * Prepare to accumulate statistics.
5784          *
5785          * Note: because it is possible for log_checkpoints to change while a
5786          * checkpoint proceeds, we always accumulate stats, even if
5787          * log_checkpoints is currently off.
5788          */
5789         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
5790         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
5791
5792         /*
5793          * Use a critical section to force system panic if we have trouble.
5794          */
5795         START_CRIT_SECTION();
5796
5797         if (shutdown)
5798         {
5799                 ControlFile->state = DB_SHUTDOWNING;
5800                 ControlFile->time = (pg_time_t) time(NULL);
5801                 UpdateControlFile();
5802         }
5803
5804         /*
5805          * Let smgr prepare for checkpoint; this has to happen before we determine
5806          * the REDO pointer.  Note that smgr must not do anything that'd have to
5807          * be undone if we decide no checkpoint is needed.
5808          */
5809         smgrpreckpt();
5810
5811         /* Begin filling in the checkpoint WAL record */
5812         MemSet(&checkPoint, 0, sizeof(checkPoint));
5813         checkPoint.ThisTimeLineID = ThisTimeLineID;
5814         checkPoint.time = (pg_time_t) time(NULL);
5815
5816         /*
5817          * We must hold WALInsertLock while examining insert state to determine
5818          * the checkpoint REDO pointer.
5819          */
5820         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
5821
5822         /*
5823          * If this isn't a shutdown or forced checkpoint, and we have not inserted
5824          * any XLOG records since the start of the last checkpoint, skip the
5825          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
5826          * when the system is idle. That wastes log space, and more importantly it
5827          * exposes us to possible loss of both current and previous checkpoint
5828          * records if the machine crashes just as we're writing the update.
5829          * (Perhaps it'd make even more sense to checkpoint only when the previous
5830          * checkpoint record is in a different xlog page?)
5831          *
5832          * We have to make two tests to determine that nothing has happened since
5833          * the start of the last checkpoint: current insertion point must match
5834          * the end of the last checkpoint record, and its redo pointer must point
5835          * to itself.
5836          */
5837         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
5838         {
5839                 XLogRecPtr      curInsert;
5840
5841                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
5842                 if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
5843                         curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
5844                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
5845                         ControlFile->checkPoint.xlogid ==
5846                         ControlFile->checkPointCopy.redo.xlogid &&
5847                         ControlFile->checkPoint.xrecoff ==
5848                         ControlFile->checkPointCopy.redo.xrecoff)
5849                 {
5850                         LWLockRelease(WALInsertLock);
5851                         LWLockRelease(CheckpointLock);
5852                         END_CRIT_SECTION();
5853                         return;
5854                 }
5855         }
5856
5857         /*
5858          * Compute new REDO record ptr = location of next XLOG record.
5859          *
5860          * NB: this is NOT necessarily where the checkpoint record itself will be,
5861          * since other backends may insert more XLOG records while we're off doing
5862          * the buffer flush work.  Those XLOG records are logically after the
5863          * checkpoint, even though physically before it.  Got that?
5864          */
5865         freespace = INSERT_FREESPACE(Insert);
5866         if (freespace < SizeOfXLogRecord)
5867         {
5868                 (void) AdvanceXLInsertBuffer(false);
5869                 /* OK to ignore update return flag, since we will do flush anyway */
5870                 freespace = INSERT_FREESPACE(Insert);
5871         }
5872         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
5873
5874         /*
5875          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
5876          * must be done while holding the insert lock AND the info_lck.
5877          *
5878          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
5879          * pointing past where it really needs to point.  This is okay; the only
5880          * consequence is that XLogInsert might back up whole buffers that it
5881          * didn't really need to.  We can't postpone advancing RedoRecPtr because
5882          * XLogInserts that happen while we are dumping buffers must assume that
5883          * their buffer changes are not included in the checkpoint.
5884          */
5885         {
5886                 /* use volatile pointer to prevent code rearrangement */
5887                 volatile XLogCtlData *xlogctl = XLogCtl;
5888
5889                 SpinLockAcquire(&xlogctl->info_lck);
5890                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
5891                 SpinLockRelease(&xlogctl->info_lck);
5892         }
5893
5894         /*
5895          * Now we can release WAL insert lock, allowing other xacts to proceed
5896          * while we are flushing disk buffers.
5897          */
5898         LWLockRelease(WALInsertLock);
5899
5900         /*
5901          * If enabled, log checkpoint start.  We postpone this until now so as not
5902          * to log anything if we decided to skip the checkpoint.
5903          */
5904         if (log_checkpoints)
5905                 LogCheckpointStart(flags);
5906
5907         /*
5908          * Before flushing data, we must wait for any transactions that are
5909          * currently in their commit critical sections.  If an xact inserted its
5910          * commit record into XLOG just before the REDO point, then a crash
5911          * restart from the REDO point would not replay that record, which means
5912          * that our flushing had better include the xact's update of pg_clog.  So
5913          * we wait till he's out of his commit critical section before proceeding.
5914          * See notes in RecordTransactionCommit().
5915          *
5916          * Because we've already released WALInsertLock, this test is a bit fuzzy:
5917          * it is possible that we will wait for xacts we didn't really need to
5918          * wait for.  But the delay should be short and it seems better to make
5919          * checkpoint take a bit longer than to hold locks longer than necessary.
5920          * (In fact, the whole reason we have this issue is that xact.c does
5921          * commit record XLOG insertion and clog update as two separate steps
5922          * protected by different locks, but again that seems best on grounds of
5923          * minimizing lock contention.)
5924          *
5925          * A transaction that has not yet set inCommit when we look cannot be at
5926          * risk, since he's not inserted his commit record yet; and one that's
5927          * already cleared it is not at risk either, since he's done fixing clog
5928          * and we will correctly flush the update below.  So we cannot miss any
5929          * xacts we need to wait for.
5930          */
5931         nInCommit = GetTransactionsInCommit(&inCommitXids);
5932         if (nInCommit > 0)
5933         {
5934                 do
5935                 {
5936                         pg_usleep(10000L);      /* wait for 10 msec */
5937                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
5938         }
5939         pfree(inCommitXids);
5940
5941         /*
5942          * Get the other info we need for the checkpoint record.
5943          */
5944         LWLockAcquire(XidGenLock, LW_SHARED);
5945         checkPoint.nextXid = ShmemVariableCache->nextXid;
5946         LWLockRelease(XidGenLock);
5947
5948         /* Increase XID epoch if we've wrapped around since last checkpoint */
5949         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5950         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
5951                 checkPoint.nextXidEpoch++;
5952
5953         LWLockAcquire(OidGenLock, LW_SHARED);
5954         checkPoint.nextOid = ShmemVariableCache->nextOid;
5955         if (!shutdown)
5956                 checkPoint.nextOid += ShmemVariableCache->oidCount;
5957         LWLockRelease(OidGenLock);
5958
5959         MultiXactGetCheckptMulti(shutdown,
5960                                                          &checkPoint.nextMulti,
5961                                                          &checkPoint.nextMultiOffset);
5962
5963         /*
5964          * Having constructed the checkpoint record, ensure all shmem disk buffers
5965          * and commit-log buffers are flushed to disk.
5966          *
5967          * This I/O could fail for various reasons.  If so, we will fail to
5968          * complete the checkpoint, but there is no reason to force a system
5969          * panic. Accordingly, exit critical section while doing it.
5970          */
5971         END_CRIT_SECTION();
5972
5973         CheckPointGuts(checkPoint.redo, flags);
5974
5975         START_CRIT_SECTION();
5976
5977         /*
5978          * Now insert the checkpoint record into XLOG.
5979          */
5980         rdata.data = (char *) (&checkPoint);
5981         rdata.len = sizeof(checkPoint);
5982         rdata.buffer = InvalidBuffer;
5983         rdata.next = NULL;
5984
5985         recptr = XLogInsert(RM_XLOG_ID,
5986                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
5987                                                 XLOG_CHECKPOINT_ONLINE,
5988                                                 &rdata);
5989
5990         XLogFlush(recptr);
5991
5992         /*
5993          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
5994          * = end of actual checkpoint record.
5995          */
5996         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
5997                 ereport(PANIC,
5998                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
5999
6000         /*
6001          * Select point at which we can truncate the log, which we base on the
6002          * prior checkpoint's earliest info.
6003          */
6004         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
6005
6006         /*
6007          * Update the control file.
6008          */
6009         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6010         if (shutdown)
6011                 ControlFile->state = DB_SHUTDOWNED;
6012         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6013         ControlFile->checkPoint = ProcLastRecPtr;
6014         ControlFile->checkPointCopy = checkPoint;
6015         ControlFile->time = (pg_time_t) time(NULL);
6016         UpdateControlFile();
6017         LWLockRelease(ControlFileLock);
6018
6019         /* Update shared-memory copy of checkpoint XID/epoch */
6020         {
6021                 /* use volatile pointer to prevent code rearrangement */
6022                 volatile XLogCtlData *xlogctl = XLogCtl;
6023
6024                 SpinLockAcquire(&xlogctl->info_lck);
6025                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
6026                 xlogctl->ckptXid = checkPoint.nextXid;
6027                 SpinLockRelease(&xlogctl->info_lck);
6028         }
6029
6030         /*
6031          * We are now done with critical updates; no need for system panic if we
6032          * have trouble while fooling with old log segments.
6033          */
6034         END_CRIT_SECTION();
6035
6036         /*
6037          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
6038          */
6039         smgrpostckpt();
6040
6041         /*
6042          * Delete old log files (those no longer needed even for previous
6043          * checkpoint).
6044          */
6045         if (_logId || _logSeg)
6046         {
6047                 PrevLogSeg(_logId, _logSeg);
6048                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
6049         }
6050
6051         /*
6052          * Make more log segments if needed.  (Do this after recycling old log
6053          * segments, since that may supply some of the needed files.)
6054          */
6055         if (!shutdown)
6056                 PreallocXlogFiles(recptr);
6057
6058         /*
6059          * Truncate pg_subtrans if possible.  We can throw away all data before
6060          * the oldest XMIN of any running transaction.  No future transaction will
6061          * attempt to reference any pg_subtrans entry older than that (see Asserts
6062          * in subtrans.c).      During recovery, though, we mustn't do this because
6063          * StartupSUBTRANS hasn't been called yet.
6064          */
6065         if (!InRecovery)
6066                 TruncateSUBTRANS(GetOldestXmin(true, false));
6067
6068         /* All real work is done, but log before releasing lock. */
6069         if (log_checkpoints)
6070                 LogCheckpointEnd();
6071
6072         LWLockRelease(CheckpointLock);
6073 }
6074
6075 /*
6076  * Flush all data in shared memory to disk, and fsync
6077  *
6078  * This is the common code shared between regular checkpoints and
6079  * recovery restartpoints.
6080  */
6081 static void
6082 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
6083 {
6084         CheckPointCLOG();
6085         CheckPointSUBTRANS();
6086         CheckPointMultiXact();
6087         CheckPointBuffers(flags);       /* performs all required fsyncs */
6088         /* We deliberately delay 2PC checkpointing as long as possible */
6089         CheckPointTwoPhase(checkPointRedo);
6090 }
6091
6092 /*
6093  * Set a recovery restart point if appropriate
6094  *
6095  * This is similar to CreateCheckPoint, but is used during WAL recovery
6096  * to establish a point from which recovery can roll forward without
6097  * replaying the entire recovery log.  This function is called each time
6098  * a checkpoint record is read from XLOG; it must determine whether a
6099  * restartpoint is needed or not.
6100  */
6101 static void
6102 RecoveryRestartPoint(const CheckPoint *checkPoint)
6103 {
6104         int                     elapsed_secs;
6105         int                     rmid;
6106
6107         /*
6108          * Do nothing if the elapsed time since the last restartpoint is less than
6109          * half of checkpoint_timeout.  (We use a value less than
6110          * checkpoint_timeout so that variations in the timing of checkpoints on
6111          * the master, or speed of transmission of WAL segments to a slave, won't
6112          * make the slave skip a restartpoint once it's synced with the master.)
6113          * Checking true elapsed time keeps us from doing restartpoints too often
6114          * while rapidly scanning large amounts of WAL.
6115          */
6116         elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
6117         if (elapsed_secs < CheckPointTimeout / 2)
6118                 return;
6119
6120         /*
6121          * Is it safe to checkpoint?  We must ask each of the resource managers
6122          * whether they have any partial state information that might prevent a
6123          * correct restart from this point.  If so, we skip this opportunity, but
6124          * return at the next checkpoint record for another try.
6125          */
6126         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6127         {
6128                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
6129                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
6130                         {
6131                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
6132                                          rmid,
6133                                          checkPoint->redo.xlogid,
6134                                          checkPoint->redo.xrecoff);
6135                                 return;
6136                         }
6137         }
6138
6139         /*
6140          * OK, force data out to disk
6141          */
6142         CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
6143
6144         /*
6145          * Update pg_control so that any subsequent crash will restart from this
6146          * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
6147          * record itself.
6148          */
6149         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6150         ControlFile->checkPoint = ReadRecPtr;
6151         ControlFile->checkPointCopy = *checkPoint;
6152         ControlFile->time = (pg_time_t) time(NULL);
6153         UpdateControlFile();
6154
6155         ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6156                         (errmsg("recovery restart point at %X/%X",
6157                                         checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
6158         if (recoveryLastXTime)
6159                 ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6160                                 (errmsg("last completed transaction was at log time %s",
6161                                                 timestamptz_to_str(recoveryLastXTime))));
6162 }
6163
6164 /*
6165  * Write a NEXTOID log record
6166  */
6167 void
6168 XLogPutNextOid(Oid nextOid)
6169 {
6170         XLogRecData rdata;
6171
6172         rdata.data = (char *) (&nextOid);
6173         rdata.len = sizeof(Oid);
6174         rdata.buffer = InvalidBuffer;
6175         rdata.next = NULL;
6176         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
6177
6178         /*
6179          * We need not flush the NEXTOID record immediately, because any of the
6180          * just-allocated OIDs could only reach disk as part of a tuple insert or
6181          * update that would have its own XLOG record that must follow the NEXTOID
6182          * record.      Therefore, the standard buffer LSN interlock applied to those
6183          * records will ensure no such OID reaches disk before the NEXTOID record
6184          * does.
6185          *
6186          * Note, however, that the above statement only covers state "within" the
6187          * database.  When we use a generated OID as a file or directory name, we
6188          * are in a sense violating the basic WAL rule, because that filesystem
6189          * change may reach disk before the NEXTOID WAL record does.  The impact
6190          * of this is that if a database crash occurs immediately afterward, we
6191          * might after restart re-generate the same OID and find that it conflicts
6192          * with the leftover file or directory.  But since for safety's sake we
6193          * always loop until finding a nonconflicting filename, this poses no real
6194          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
6195          */
6196 }
6197
6198 /*
6199  * Write an XLOG SWITCH record.
6200  *
6201  * Here we just blindly issue an XLogInsert request for the record.
6202  * All the magic happens inside XLogInsert.
6203  *
6204  * The return value is either the end+1 address of the switch record,
6205  * or the end+1 address of the prior segment if we did not need to
6206  * write a switch record because we are already at segment start.
6207  */
6208 XLogRecPtr
6209 RequestXLogSwitch(void)
6210 {
6211         XLogRecPtr      RecPtr;
6212         XLogRecData rdata;
6213
6214         /* XLOG SWITCH, alone among xlog record types, has no data */
6215         rdata.buffer = InvalidBuffer;
6216         rdata.data = NULL;
6217         rdata.len = 0;
6218         rdata.next = NULL;
6219
6220         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
6221
6222         return RecPtr;
6223 }
6224
6225 /*
6226  * XLOG resource manager's routines
6227  */
6228 void
6229 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
6230 {
6231         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6232
6233         if (info == XLOG_NEXTOID)
6234         {
6235                 Oid                     nextOid;
6236
6237                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
6238                 if (ShmemVariableCache->nextOid < nextOid)
6239                 {
6240                         ShmemVariableCache->nextOid = nextOid;
6241                         ShmemVariableCache->oidCount = 0;
6242                 }
6243         }
6244         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
6245         {
6246                 CheckPoint      checkPoint;
6247
6248                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6249                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
6250                 ShmemVariableCache->nextXid = checkPoint.nextXid;
6251                 ShmemVariableCache->nextOid = checkPoint.nextOid;
6252                 ShmemVariableCache->oidCount = 0;
6253                 MultiXactSetNextMXact(checkPoint.nextMulti,
6254                                                           checkPoint.nextMultiOffset);
6255
6256                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6257                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6258                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6259
6260                 /*
6261                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
6262                  */
6263                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6264                 {
6265                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
6266                                 !list_member_int(expectedTLIs,
6267                                                                  (int) checkPoint.ThisTimeLineID))
6268                                 ereport(PANIC,
6269                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
6270                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
6271                         /* Following WAL records should be run with new TLI */
6272                         ThisTimeLineID = checkPoint.ThisTimeLineID;
6273                 }
6274
6275                 RecoveryRestartPoint(&checkPoint);
6276         }
6277         else if (info == XLOG_CHECKPOINT_ONLINE)
6278         {
6279                 CheckPoint      checkPoint;
6280
6281                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6282                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
6283                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
6284                                                                   checkPoint.nextXid))
6285                         ShmemVariableCache->nextXid = checkPoint.nextXid;
6286                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
6287                 {
6288                         ShmemVariableCache->nextOid = checkPoint.nextOid;
6289                         ShmemVariableCache->oidCount = 0;
6290                 }
6291                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
6292                                                                   checkPoint.nextMultiOffset);
6293
6294                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6295                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6296                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6297
6298                 /* TLI should not change in an on-line checkpoint */
6299                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6300                         ereport(PANIC,
6301                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
6302                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
6303
6304                 RecoveryRestartPoint(&checkPoint);
6305         }
6306         else if (info == XLOG_NOOP)
6307         {
6308                 /* nothing to do here */
6309         }
6310         else if (info == XLOG_SWITCH)
6311         {
6312                 /* nothing to do here */
6313         }
6314 }
6315
6316 void
6317 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
6318 {
6319         uint8           info = xl_info & ~XLR_INFO_MASK;
6320
6321         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
6322                 info == XLOG_CHECKPOINT_ONLINE)
6323         {
6324                 CheckPoint *checkpoint = (CheckPoint *) rec;
6325
6326                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
6327                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; %s",
6328                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
6329                                                  checkpoint->ThisTimeLineID,
6330                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
6331                                                  checkpoint->nextOid,
6332                                                  checkpoint->nextMulti,
6333                                                  checkpoint->nextMultiOffset,
6334                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
6335         }
6336         else if (info == XLOG_NOOP)
6337         {
6338                 appendStringInfo(buf, "xlog no-op");
6339         }
6340         else if (info == XLOG_NEXTOID)
6341         {
6342                 Oid                     nextOid;
6343
6344                 memcpy(&nextOid, rec, sizeof(Oid));
6345                 appendStringInfo(buf, "nextOid: %u", nextOid);
6346         }
6347         else if (info == XLOG_SWITCH)
6348         {
6349                 appendStringInfo(buf, "xlog switch");
6350         }
6351         else
6352                 appendStringInfo(buf, "UNKNOWN");
6353 }
6354
6355 #ifdef WAL_DEBUG
6356
6357 static void
6358 xlog_outrec(StringInfo buf, XLogRecord *record)
6359 {
6360         int                     i;
6361
6362         appendStringInfo(buf, "prev %X/%X; xid %u",
6363                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
6364                                          record->xl_xid);
6365
6366         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
6367         {
6368                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
6369                         appendStringInfo(buf, "; bkpb%d", i + 1);
6370         }
6371
6372         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
6373 }
6374 #endif   /* WAL_DEBUG */
6375
6376
6377 /*
6378  * Return the (possible) sync flag used for opening a file, depending on the
6379  * value of the GUC wal_sync_method.
6380  */
6381 static int
6382 get_sync_bit(int method)
6383 {
6384         /* If fsync is disabled, never open in sync mode */
6385         if (!enableFsync)
6386                 return 0;
6387
6388         switch (method)
6389         {
6390                 /*
6391                  * enum values for all sync options are defined even if they are not
6392                  * supported on the current platform.  But if not, they are not
6393                  * included in the enum option array, and therefore will never be seen
6394                  * here.
6395                  */
6396                 case SYNC_METHOD_FSYNC:
6397                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6398                 case SYNC_METHOD_FDATASYNC:
6399                         return 0;
6400 #ifdef OPEN_SYNC_FLAG
6401                 case SYNC_METHOD_OPEN:
6402                         return OPEN_SYNC_FLAG;
6403 #endif
6404 #ifdef OPEN_DATASYNC_FLAG
6405                 case SYNC_METHOD_OPEN_DSYNC:
6406                         return OPEN_DATASYNC_FLAG;
6407 #endif
6408                 default:
6409                         /* can't happen (unless we are out of sync with option array) */
6410                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
6411                         return 0; /* silence warning */
6412         }
6413 }
6414
6415 /*
6416  * GUC support
6417  */
6418 bool
6419 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
6420 {
6421         if (!doit)
6422                 return true;
6423
6424         if (sync_method != new_sync_method)
6425         {
6426                 /*
6427                  * To ensure that no blocks escape unsynced, force an fsync on the
6428                  * currently open log segment (if any).  Also, if the open flag is
6429                  * changing, close the log file so it will be reopened (with new flag
6430                  * bit) at next use.
6431                  */
6432                 if (openLogFile >= 0)
6433                 {
6434                         if (pg_fsync(openLogFile) != 0)
6435                                 ereport(PANIC,
6436                                                 (errcode_for_file_access(),
6437                                                  errmsg("could not fsync log file %u, segment %u: %m",
6438                                                                 openLogId, openLogSeg)));
6439                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
6440                                 XLogFileClose();
6441                 }
6442         }
6443
6444         return true;
6445 }
6446
6447
6448 /*
6449  * Issue appropriate kind of fsync (if any) on the current XLOG output file
6450  */
6451 static void
6452 issue_xlog_fsync(void)
6453 {
6454         switch (sync_method)
6455         {
6456                 case SYNC_METHOD_FSYNC:
6457                         if (pg_fsync_no_writethrough(openLogFile) != 0)
6458                                 ereport(PANIC,
6459                                                 (errcode_for_file_access(),
6460                                                  errmsg("could not fsync log file %u, segment %u: %m",
6461                                                                 openLogId, openLogSeg)));
6462                         break;
6463 #ifdef HAVE_FSYNC_WRITETHROUGH
6464                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6465                         if (pg_fsync_writethrough(openLogFile) != 0)
6466                                 ereport(PANIC,
6467                                                 (errcode_for_file_access(),
6468                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
6469                                                                 openLogId, openLogSeg)));
6470                         break;
6471 #endif
6472 #ifdef HAVE_FDATASYNC
6473                 case SYNC_METHOD_FDATASYNC:
6474                         if (pg_fdatasync(openLogFile) != 0)
6475                                 ereport(PANIC,
6476                                                 (errcode_for_file_access(),
6477                                         errmsg("could not fdatasync log file %u, segment %u: %m",
6478                                                    openLogId, openLogSeg)));
6479                         break;
6480 #endif
6481                 case SYNC_METHOD_OPEN:
6482                 case SYNC_METHOD_OPEN_DSYNC:
6483                         /* write synced it already */
6484                         break;
6485                 default:
6486                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
6487                         break;
6488         }
6489 }
6490
6491
6492 /*
6493  * pg_start_backup: set up for taking an on-line backup dump
6494  *
6495  * Essentially what this does is to create a backup label file in $PGDATA,
6496  * where it will be archived as part of the backup dump.  The label file
6497  * contains the user-supplied label string (typically this would be used
6498  * to tell where the backup dump will be stored) and the starting time and
6499  * starting WAL location for the dump.
6500  */
6501 Datum
6502 pg_start_backup(PG_FUNCTION_ARGS)
6503 {
6504         text       *backupid = PG_GETARG_TEXT_P(0);
6505         char       *backupidstr;
6506         XLogRecPtr      checkpointloc;
6507         XLogRecPtr      startpoint;
6508         pg_time_t       stamp_time;
6509         char            strfbuf[128];
6510         char            xlogfilename[MAXFNAMELEN];
6511         uint32          _logId;
6512         uint32          _logSeg;
6513         struct stat stat_buf;
6514         FILE       *fp;
6515
6516         if (!superuser())
6517                 ereport(ERROR,
6518                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6519                                  errmsg("must be superuser to run a backup")));
6520
6521         if (!XLogArchivingActive())
6522                 ereport(ERROR,
6523                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6524                                  errmsg("WAL archiving is not active"),
6525                                  errhint("archive_mode must be enabled at server start.")));
6526
6527         if (!XLogArchiveCommandSet())
6528                 ereport(ERROR,
6529                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6530                                  errmsg("WAL archiving is not active"),
6531                                  errhint("archive_command must be defined before "
6532                                                  "online backups can be made safely.")));
6533
6534         backupidstr = text_to_cstring(backupid);
6535
6536         /*
6537          * Mark backup active in shared memory.  We must do full-page WAL writes
6538          * during an on-line backup even if not doing so at other times, because
6539          * it's quite possible for the backup dump to obtain a "torn" (partially
6540          * written) copy of a database page if it reads the page concurrently with
6541          * our write to the same page.  This can be fixed as long as the first
6542          * write to the page in the WAL sequence is a full-page write. Hence, we
6543          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
6544          * are no dirty pages in shared memory that might get dumped while the
6545          * backup is in progress without having a corresponding WAL record.  (Once
6546          * the backup is complete, we need not force full-page writes anymore,
6547          * since we expect that any pages not modified during the backup interval
6548          * must have been correctly captured by the backup.)
6549          *
6550          * We must hold WALInsertLock to change the value of forcePageWrites, to
6551          * ensure adequate interlocking against XLogInsert().
6552          */
6553         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6554         if (XLogCtl->Insert.forcePageWrites)
6555         {
6556                 LWLockRelease(WALInsertLock);
6557                 ereport(ERROR,
6558                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6559                                  errmsg("a backup is already in progress"),
6560                                  errhint("Run pg_stop_backup() and try again.")));
6561         }
6562         XLogCtl->Insert.forcePageWrites = true;
6563         LWLockRelease(WALInsertLock);
6564
6565         /* Ensure we release forcePageWrites if fail below */
6566         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6567         {
6568                 /*
6569                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
6570                  * page problems, this guarantees that two successive backup runs will
6571                  * have different checkpoint positions and hence different history
6572                  * file names, even if nothing happened in between.
6573                  *
6574                  * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
6575                  */
6576                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
6577
6578                 /*
6579                  * Now we need to fetch the checkpoint record location, and also its
6580                  * REDO pointer.  The oldest point in WAL that would be needed to
6581                  * restore starting from the checkpoint is precisely the REDO pointer.
6582                  */
6583                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6584                 checkpointloc = ControlFile->checkPoint;
6585                 startpoint = ControlFile->checkPointCopy.redo;
6586                 LWLockRelease(ControlFileLock);
6587
6588                 XLByteToSeg(startpoint, _logId, _logSeg);
6589                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
6590
6591                 /* Use the log timezone here, not the session timezone */
6592                 stamp_time = (pg_time_t) time(NULL);
6593                 pg_strftime(strfbuf, sizeof(strfbuf),
6594                                         "%Y-%m-%d %H:%M:%S %Z",
6595                                         pg_localtime(&stamp_time, log_timezone));
6596
6597                 /*
6598                  * Check for existing backup label --- implies a backup is already
6599                  * running.  (XXX given that we checked forcePageWrites above, maybe
6600                  * it would be OK to just unlink any such label file?)
6601                  */
6602                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
6603                 {
6604                         if (errno != ENOENT)
6605                                 ereport(ERROR,
6606                                                 (errcode_for_file_access(),
6607                                                  errmsg("could not stat file \"%s\": %m",
6608                                                                 BACKUP_LABEL_FILE)));
6609                 }
6610                 else
6611                         ereport(ERROR,
6612                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6613                                          errmsg("a backup is already in progress"),
6614                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
6615                                                          BACKUP_LABEL_FILE)));
6616
6617                 /*
6618                  * Okay, write the file
6619                  */
6620                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
6621                 if (!fp)
6622                         ereport(ERROR,
6623                                         (errcode_for_file_access(),
6624                                          errmsg("could not create file \"%s\": %m",
6625                                                         BACKUP_LABEL_FILE)));
6626                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6627                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
6628                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
6629                                 checkpointloc.xlogid, checkpointloc.xrecoff);
6630                 fprintf(fp, "START TIME: %s\n", strfbuf);
6631                 fprintf(fp, "LABEL: %s\n", backupidstr);
6632                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
6633                         ereport(ERROR,
6634                                         (errcode_for_file_access(),
6635                                          errmsg("could not write file \"%s\": %m",
6636                                                         BACKUP_LABEL_FILE)));
6637         }
6638         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6639
6640         /*
6641          * We're done.  As a convenience, return the starting WAL location.
6642          */
6643         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
6644                          startpoint.xlogid, startpoint.xrecoff);
6645         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
6646 }
6647
6648 /* Error cleanup callback for pg_start_backup */
6649 static void
6650 pg_start_backup_callback(int code, Datum arg)
6651 {
6652         /* Turn off forcePageWrites on failure */
6653         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6654         XLogCtl->Insert.forcePageWrites = false;
6655         LWLockRelease(WALInsertLock);
6656 }
6657
6658 /*
6659  * pg_stop_backup: finish taking an on-line backup dump
6660  *
6661  * We remove the backup label file created by pg_start_backup, and instead
6662  * create a backup history file in pg_xlog (whence it will immediately be
6663  * archived).  The backup history file contains the same info found in
6664  * the label file, plus the backup-end time and WAL location.
6665  * Note: different from CancelBackup which just cancels online backup mode.
6666  */
6667 Datum
6668 pg_stop_backup(PG_FUNCTION_ARGS)
6669 {
6670         XLogRecPtr      startpoint;
6671         XLogRecPtr      stoppoint;
6672         pg_time_t       stamp_time;
6673         char            strfbuf[128];
6674         char            histfilepath[MAXPGPATH];
6675         char            startxlogfilename[MAXFNAMELEN];
6676         char            stopxlogfilename[MAXFNAMELEN];
6677         char            lastxlogfilename[MAXFNAMELEN];
6678         char            histfilename[MAXFNAMELEN];
6679         uint32          _logId;
6680         uint32          _logSeg;
6681         FILE       *lfp;
6682         FILE       *fp;
6683         char            ch;
6684         int                     ich;
6685         int                     seconds_before_warning;
6686         int                     waits = 0;
6687
6688         if (!superuser())
6689                 ereport(ERROR,
6690                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6691                                  (errmsg("must be superuser to run a backup"))));
6692
6693         if (!XLogArchivingActive())
6694                 ereport(ERROR,
6695                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6696                                  errmsg("WAL archiving is not active"),
6697                                  errhint("archive_mode must be enabled at server start.")));
6698
6699         /*
6700          * OK to clear forcePageWrites
6701          */
6702         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6703         XLogCtl->Insert.forcePageWrites = false;
6704         LWLockRelease(WALInsertLock);
6705
6706         /*
6707          * Force a switch to a new xlog segment file, so that the backup is valid
6708          * as soon as archiver moves out the current segment file. We'll report
6709          * the end address of the XLOG SWITCH record as the backup stopping point.
6710          */
6711         stoppoint = RequestXLogSwitch();
6712
6713         XLByteToSeg(stoppoint, _logId, _logSeg);
6714         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
6715
6716         /* Use the log timezone here, not the session timezone */
6717         stamp_time = (pg_time_t) time(NULL);
6718         pg_strftime(strfbuf, sizeof(strfbuf),
6719                                 "%Y-%m-%d %H:%M:%S %Z",
6720                                 pg_localtime(&stamp_time, log_timezone));
6721
6722         /*
6723          * Open the existing label file
6724          */
6725         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
6726         if (!lfp)
6727         {
6728                 if (errno != ENOENT)
6729                         ereport(ERROR,
6730                                         (errcode_for_file_access(),
6731                                          errmsg("could not read file \"%s\": %m",
6732                                                         BACKUP_LABEL_FILE)));
6733                 ereport(ERROR,
6734                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6735                                  errmsg("a backup is not in progress")));
6736         }
6737
6738         /*
6739          * Read and parse the START WAL LOCATION line (this code is pretty crude,
6740          * but we are not expecting any variability in the file format).
6741          */
6742         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
6743                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
6744                            &ch) != 4 || ch != '\n')
6745                 ereport(ERROR,
6746                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6747                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
6748
6749         /*
6750          * Write the backup history file
6751          */
6752         XLByteToSeg(startpoint, _logId, _logSeg);
6753         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
6754                                                   startpoint.xrecoff % XLogSegSize);
6755         fp = AllocateFile(histfilepath, "w");
6756         if (!fp)
6757                 ereport(ERROR,
6758                                 (errcode_for_file_access(),
6759                                  errmsg("could not create file \"%s\": %m",
6760                                                 histfilepath)));
6761         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6762                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
6763         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
6764                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
6765         /* transfer remaining lines from label to history file */
6766         while ((ich = fgetc(lfp)) != EOF)
6767                 fputc(ich, fp);
6768         fprintf(fp, "STOP TIME: %s\n", strfbuf);
6769         if (fflush(fp) || ferror(fp) || FreeFile(fp))
6770                 ereport(ERROR,
6771                                 (errcode_for_file_access(),
6772                                  errmsg("could not write file \"%s\": %m",
6773                                                 histfilepath)));
6774
6775         /*
6776          * Close and remove the backup label file
6777          */
6778         if (ferror(lfp) || FreeFile(lfp))
6779                 ereport(ERROR,
6780                                 (errcode_for_file_access(),
6781                                  errmsg("could not read file \"%s\": %m",
6782                                                 BACKUP_LABEL_FILE)));
6783         if (unlink(BACKUP_LABEL_FILE) != 0)
6784                 ereport(ERROR,
6785                                 (errcode_for_file_access(),
6786                                  errmsg("could not remove file \"%s\": %m",
6787                                                 BACKUP_LABEL_FILE)));
6788
6789         /*
6790          * Clean out any no-longer-needed history files.  As a side effect, this
6791          * will post a .ready file for the newly created history file, notifying
6792          * the archiver that history file may be archived immediately.
6793          */
6794         CleanupBackupHistory();
6795
6796         /*
6797          * Wait until both the last WAL file filled during backup and the history
6798          * file have been archived.  We assume that the alphabetic sorting
6799          * property of the WAL files ensures any earlier WAL files are safely
6800          * archived as well.
6801          *
6802          * We wait forever, since archive_command is supposed to work and
6803          * we assume the admin wanted his backup to work completely. If you
6804          * don't wish to wait, you can set statement_timeout.
6805          */
6806         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
6807         XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
6808
6809         XLByteToSeg(startpoint, _logId, _logSeg);
6810         BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
6811                                                   startpoint.xrecoff % XLogSegSize);
6812
6813         seconds_before_warning = 60;
6814         waits = 0;
6815
6816         while (XLogArchiveIsBusy(lastxlogfilename) ||
6817                    XLogArchiveIsBusy(histfilename))
6818         {
6819                 CHECK_FOR_INTERRUPTS();
6820
6821                 pg_usleep(1000000L);
6822
6823                 if (++waits >= seconds_before_warning)
6824                 {
6825                         seconds_before_warning *= 2;     /* This wraps in >10 years... */
6826                         ereport(WARNING,
6827                                         (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
6828                                                         waits)));
6829                 }
6830         }
6831
6832         /*
6833          * We're done.  As a convenience, return the ending WAL location.
6834          */
6835         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
6836                          stoppoint.xlogid, stoppoint.xrecoff);
6837         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
6838 }
6839
6840 /*
6841  * pg_switch_xlog: switch to next xlog file
6842  */
6843 Datum
6844 pg_switch_xlog(PG_FUNCTION_ARGS)
6845 {
6846         XLogRecPtr      switchpoint;
6847         char            location[MAXFNAMELEN];
6848
6849         if (!superuser())
6850                 ereport(ERROR,
6851                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6852                          (errmsg("must be superuser to switch transaction log files"))));
6853
6854         switchpoint = RequestXLogSwitch();
6855
6856         /*
6857          * As a convenience, return the WAL location of the switch record
6858          */
6859         snprintf(location, sizeof(location), "%X/%X",
6860                          switchpoint.xlogid, switchpoint.xrecoff);
6861         PG_RETURN_TEXT_P(cstring_to_text(location));
6862 }
6863
6864 /*
6865  * Report the current WAL write location (same format as pg_start_backup etc)
6866  *
6867  * This is useful for determining how much of WAL is visible to an external
6868  * archiving process.  Note that the data before this point is written out
6869  * to the kernel, but is not necessarily synced to disk.
6870  */
6871 Datum
6872 pg_current_xlog_location(PG_FUNCTION_ARGS)
6873 {
6874         char            location[MAXFNAMELEN];
6875
6876         /* Make sure we have an up-to-date local LogwrtResult */
6877         {
6878                 /* use volatile pointer to prevent code rearrangement */
6879                 volatile XLogCtlData *xlogctl = XLogCtl;
6880
6881                 SpinLockAcquire(&xlogctl->info_lck);
6882                 LogwrtResult = xlogctl->LogwrtResult;
6883                 SpinLockRelease(&xlogctl->info_lck);
6884         }
6885
6886         snprintf(location, sizeof(location), "%X/%X",
6887                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
6888         PG_RETURN_TEXT_P(cstring_to_text(location));
6889 }
6890
6891 /*
6892  * Report the current WAL insert location (same format as pg_start_backup etc)
6893  *
6894  * This function is mostly for debugging purposes.
6895  */
6896 Datum
6897 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
6898 {
6899         XLogCtlInsert *Insert = &XLogCtl->Insert;
6900         XLogRecPtr      current_recptr;
6901         char            location[MAXFNAMELEN];
6902
6903         /*
6904          * Get the current end-of-WAL position ... shared lock is sufficient
6905          */
6906         LWLockAcquire(WALInsertLock, LW_SHARED);
6907         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
6908         LWLockRelease(WALInsertLock);
6909
6910         snprintf(location, sizeof(location), "%X/%X",
6911                          current_recptr.xlogid, current_recptr.xrecoff);
6912         PG_RETURN_TEXT_P(cstring_to_text(location));
6913 }
6914
6915 /*
6916  * Compute an xlog file name and decimal byte offset given a WAL location,
6917  * such as is returned by pg_stop_backup() or pg_xlog_switch().
6918  *
6919  * Note that a location exactly at a segment boundary is taken to be in
6920  * the previous segment.  This is usually the right thing, since the
6921  * expected usage is to determine which xlog file(s) are ready to archive.
6922  */
6923 Datum
6924 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
6925 {
6926         text       *location = PG_GETARG_TEXT_P(0);
6927         char       *locationstr;
6928         unsigned int uxlogid;
6929         unsigned int uxrecoff;
6930         uint32          xlogid;
6931         uint32          xlogseg;
6932         uint32          xrecoff;
6933         XLogRecPtr      locationpoint;
6934         char            xlogfilename[MAXFNAMELEN];
6935         Datum           values[2];
6936         bool            isnull[2];
6937         TupleDesc       resultTupleDesc;
6938         HeapTuple       resultHeapTuple;
6939         Datum           result;
6940
6941         /*
6942          * Read input and parse
6943          */
6944         locationstr = text_to_cstring(location);
6945
6946         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
6947                 ereport(ERROR,
6948                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6949                                  errmsg("could not parse transaction log location \"%s\"",
6950                                                 locationstr)));
6951
6952         locationpoint.xlogid = uxlogid;
6953         locationpoint.xrecoff = uxrecoff;
6954
6955         /*
6956          * Construct a tuple descriptor for the result row.  This must match this
6957          * function's pg_proc entry!
6958          */
6959         resultTupleDesc = CreateTemplateTupleDesc(2, false);
6960         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
6961                                            TEXTOID, -1, 0);
6962         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
6963                                            INT4OID, -1, 0);
6964
6965         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
6966
6967         /*
6968          * xlogfilename
6969          */
6970         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
6971         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
6972
6973         values[0] = CStringGetTextDatum(xlogfilename);
6974         isnull[0] = false;
6975
6976         /*
6977          * offset
6978          */
6979         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
6980
6981         values[1] = UInt32GetDatum(xrecoff);
6982         isnull[1] = false;
6983
6984         /*
6985          * Tuple jam: Having first prepared your Datums, then squash together
6986          */
6987         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
6988
6989         result = HeapTupleGetDatum(resultHeapTuple);
6990
6991         PG_RETURN_DATUM(result);
6992 }
6993
6994 /*
6995  * Compute an xlog file name given a WAL location,
6996  * such as is returned by pg_stop_backup() or pg_xlog_switch().
6997  */
6998 Datum
6999 pg_xlogfile_name(PG_FUNCTION_ARGS)
7000 {
7001         text       *location = PG_GETARG_TEXT_P(0);
7002         char       *locationstr;
7003         unsigned int uxlogid;
7004         unsigned int uxrecoff;
7005         uint32          xlogid;
7006         uint32          xlogseg;
7007         XLogRecPtr      locationpoint;
7008         char            xlogfilename[MAXFNAMELEN];
7009
7010         locationstr = text_to_cstring(location);
7011
7012         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7013                 ereport(ERROR,
7014                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7015                                  errmsg("could not parse transaction log location \"%s\"",
7016                                                 locationstr)));
7017
7018         locationpoint.xlogid = uxlogid;
7019         locationpoint.xrecoff = uxrecoff;
7020
7021         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7022         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7023
7024         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7025 }
7026
7027 /*
7028  * read_backup_label: check to see if a backup_label file is present
7029  *
7030  * If we see a backup_label during recovery, we assume that we are recovering
7031  * from a backup dump file, and we therefore roll forward from the checkpoint
7032  * identified by the label file, NOT what pg_control says.      This avoids the
7033  * problem that pg_control might have been archived one or more checkpoints
7034  * later than the start of the dump, and so if we rely on it as the start
7035  * point, we will fail to restore a consistent database state.
7036  *
7037  * We also attempt to retrieve the corresponding backup history file.
7038  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
7039  * points.
7040  *
7041  * Returns TRUE if a backup_label was found (and fills the checkpoint
7042  * location into *checkPointLoc); returns FALSE if not.
7043  */
7044 static bool
7045 read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
7046 {
7047         XLogRecPtr      startpoint;
7048         XLogRecPtr      stoppoint;
7049         char            histfilename[MAXFNAMELEN];
7050         char            histfilepath[MAXPGPATH];
7051         char            startxlogfilename[MAXFNAMELEN];
7052         char            stopxlogfilename[MAXFNAMELEN];
7053         TimeLineID      tli;
7054         uint32          _logId;
7055         uint32          _logSeg;
7056         FILE       *lfp;
7057         FILE       *fp;
7058         char            ch;
7059
7060         /* Default is to not constrain recovery stop point */
7061         minRecoveryLoc->xlogid = 0;
7062         minRecoveryLoc->xrecoff = 0;
7063
7064         /*
7065          * See if label file is present
7066          */
7067         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7068         if (!lfp)
7069         {
7070                 if (errno != ENOENT)
7071                         ereport(FATAL,
7072                                         (errcode_for_file_access(),
7073                                          errmsg("could not read file \"%s\": %m",
7074                                                         BACKUP_LABEL_FILE)));
7075                 return false;                   /* it's not there, all is fine */
7076         }
7077
7078         /*
7079          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
7080          * is pretty crude, but we are not expecting any variability in the file
7081          * format).
7082          */
7083         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
7084                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
7085                            startxlogfilename, &ch) != 5 || ch != '\n')
7086                 ereport(FATAL,
7087                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7088                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7089         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
7090                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
7091                            &ch) != 3 || ch != '\n')
7092                 ereport(FATAL,
7093                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7094                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7095         if (ferror(lfp) || FreeFile(lfp))
7096                 ereport(FATAL,
7097                                 (errcode_for_file_access(),
7098                                  errmsg("could not read file \"%s\": %m",
7099                                                 BACKUP_LABEL_FILE)));
7100
7101         /*
7102          * Try to retrieve the backup history file (no error if we can't)
7103          */
7104         XLByteToSeg(startpoint, _logId, _logSeg);
7105         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
7106                                                   startpoint.xrecoff % XLogSegSize);
7107
7108         if (InArchiveRecovery)
7109                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
7110         else
7111                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
7112                                                           startpoint.xrecoff % XLogSegSize);
7113
7114         fp = AllocateFile(histfilepath, "r");
7115         if (fp)
7116         {
7117                 /*
7118                  * Parse history file to identify stop point.
7119                  */
7120                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
7121                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7122                                    &ch) != 4 || ch != '\n')
7123                         ereport(FATAL,
7124                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7125                                          errmsg("invalid data in file \"%s\"", histfilename)));
7126                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
7127                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
7128                                    &ch) != 4 || ch != '\n')
7129                         ereport(FATAL,
7130                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7131                                          errmsg("invalid data in file \"%s\"", histfilename)));
7132                 *minRecoveryLoc = stoppoint;
7133                 if (ferror(fp) || FreeFile(fp))
7134                         ereport(FATAL,
7135                                         (errcode_for_file_access(),
7136                                          errmsg("could not read file \"%s\": %m",
7137                                                         histfilepath)));
7138         }
7139
7140         return true;
7141 }
7142
7143 /*
7144  * Error context callback for errors occurring during rm_redo().
7145  */
7146 static void
7147 rm_redo_error_callback(void *arg)
7148 {
7149         XLogRecord *record = (XLogRecord *) arg;
7150         StringInfoData buf;
7151
7152         initStringInfo(&buf);
7153         RmgrTable[record->xl_rmid].rm_desc(&buf,
7154                                                                            record->xl_info,
7155                                                                            XLogRecGetData(record));
7156
7157         /* don't bother emitting empty description */
7158         if (buf.len > 0)
7159                 errcontext("xlog redo %s", buf.data);
7160
7161         pfree(buf.data);
7162 }
7163
7164 /*
7165  * BackupInProgress: check if online backup mode is active
7166  *
7167  * This is done by checking for existence of the "backup_label" file.
7168  */
7169 bool
7170 BackupInProgress(void)
7171 {
7172         struct stat stat_buf;
7173
7174         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
7175 }
7176
7177 /*
7178  * CancelBackup: rename the "backup_label" file to cancel backup mode
7179  *
7180  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
7181  * Note that this will render an online backup in progress useless.
7182  * To correctly finish an online backup, pg_stop_backup must be called.
7183  */
7184 void
7185 CancelBackup(void)
7186 {
7187         struct stat stat_buf;
7188
7189         /* if the file is not there, return */
7190         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
7191                 return;
7192
7193         /* remove leftover file from previously cancelled backup if it exists */
7194         unlink(BACKUP_LABEL_OLD);
7195
7196         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
7197         {
7198                 ereport(LOG,
7199                                 (errmsg("online backup mode cancelled"),
7200                                  errdetail("\"%s\" was renamed to \"%s\".",
7201                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7202         }
7203         else
7204         {
7205                 ereport(WARNING,
7206                                 (errcode_for_file_access(),
7207                                  errmsg("online backup mode was not cancelled"),
7208                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
7209                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7210         }
7211 }
7212